In [1]:
import numpy as np
import pandas as pd

In [2]:
# Input data files; the relative paths are resolved against the working directory.
movie_file, rating_file, ids_file = (
    'hw8_movies.txt',
    'hw8_ratings.txt',
    'hw8_ids.txt',
)

In [3]:
# Each input file is read whole and split into a list with one entry per line.
with open(movie_file) as movie_fh:
    movie = movie_fh.read().splitlines()

# Rating lines have every space character stripped, leaving one symbol per column.
with open(rating_file) as rating_fh:
    rating = ["".join(line.split(" ")) for line in rating_fh.read().splitlines()]

# NOTE: `id` shadows the Python builtin; the name is kept because later cells use it.
with open(ids_file) as ids_fh:
    id = ids_fh.read().splitlines()

(a) Sanity Check

In [4]:
# Encode the raw rating strings as a (rows of `rating`) x (movies) matrix:
#    1 -> '1' (recommended), 0 -> any other symbol, -1 -> '?' (no rating given).
# The movie count comes from the data instead of a hard-coded 76, so the cell
# keeps working if the movie list changes.
num_movies = len(movie)
mean_popularity = np.zeros(num_movies, dtype=float)  # starts as the count of '1' votes per movie
rate_num = np.zeros(num_movies, dtype=float)         # number of non-'?' entries per movie
ratings = np.zeros([len(rating), num_movies])

for count, n in enumerate(rating):
    for m, symbol in enumerate(n):
        if symbol == '?':
            ratings[count][m] = -1
            continue  # missing entries do not count towards rate_num
        if symbol == '1':
            mean_popularity[m] += 1
            ratings[count][m] = 1
        else:
            ratings[count][m] = 0
        rate_num[m] += 1

# Fraction of '1' votes among the entries that were actually rated.
# A movie with no ratings at all gives 0/0 = NaN, which argsort places last.
mean_popularity /= rate_num
mean_popularity.argsort()

array([52, 44, 74,  4, 27, 68, 57, 18, 60, 34, 41, 66, 53, 12, 36, 35, 75,
       46, 71, 73,  8, 42, 15, 72, 16, 24, 43, 30, 14, 49, 38, 45, 54, 17,
       56, 63, 59, 61, 29, 58, 32, 10, 13, 51, 55, 48,  9,  7, 22, 33,  6,
       19, 67, 25, 26, 64,  2, 70, 20, 69, 23, 31, 62, 50, 28, 65, 37,  5,
        1, 11, 40, 39, 47, 21,  3,  0])

In [5]:
# Print the movie titles from least to most popular.
# The ranking is computed once; the previous loop re-sorted the whole array on
# every iteration just to read a single position from it.
for movie_idx in mean_popularity.argsort():
    print(movie[movie_idx])

I_Feel_Pretty
Fifty_Shades_of_Grey
Hustlers
The_Last_Airbender
Magic_Mike
Fast_&_Furious:_Hobbs_&_Shaw
The_Shape_of_Water
Prometheus
Phantom_Thread
World_War_Z
Star_Wars:_The_Force_Awakens
Rocketman
Chappaquidick
Bridemaids
Man_of_Steel
American_Hustle
Terminator:_Dark_Fate
Room
Good_Boys
Pokemon_Detective_Pikachu
Fast_Five
Mad_Max:_Fury_Road
Drive
Us
The_Help
Pitch_Perfect
Jurassic_World
Frozen
X-Men:_First_Class
The_Revenant
Ex_Machina
Avengers:_Age_of_Ultron
La_La_Land
Midnight_in_Paris
Manchester_by_the_Sea
Once_Upon_a_Time_in_Hollywood
Three_Billboards_Outside_Ebbing
Darkest_Hour
The_Great_Gatsby
Dunkirk
Her
Captain_America:_The_First_Avenger
The_Girls_with_the_Dragon_Tattoo
Ready_Player_One
Hidden_Figures
The_Hateful_Eight
Thor
Toy_Story_3
The_Hunger_Games
12_Years_a_Slave
Iron_Man_2
The_Perks_of_Being_a_Wallflower
Joker
Les_Miserables
21_Jump_Street
Spiderman:_Far_From_Home
Black_Swan
Parasite
The_Avengers
The_Farewell
Django_Unchained
Now_You_See_Me
Avengers:_Endgame
Avengers:_

In [6]:
# k: number of values of the hidden variable; the EM code below loops `i` over
#    range(k) and uses it to index pz[i] and the columns of pr_given_z.
k = 4
# T: number of rating rows (one entry of `rating` per line of the ratings file).
T = len(rating)

In [7]:
def estep_numerator(i,t,pr_given_z,pz):
    """Unnormalised posterior weight of hidden value `i` for rating row `t`.

    Multiplies the prior pz[i] by pr_given_z[j, i] for every movie j that row
    `t` marks 1, and by (1 - pr_given_z[j, i]) for every movie marked 0.
    Movies marked -1 are left out of the product entirely.

    Reads the module-level `ratings` matrix.
    """
    row = ratings[t, :]
    liked_idx = np.flatnonzero(row == 1)
    disliked_idx = np.flatnonzero(row == 0)
    liked_term = np.prod(pr_given_z[liked_idx, i])
    disliked_term = np.prod(1 - pr_given_z[disliked_idx, i])
    return pz[i] * liked_term * disliked_term

def estep_denominator(t,pr_given_z,pz):
    """Normaliser of the E-step for rating row `t`.

    Returns the sum of estep_numerator(i, t, pr_given_z, pz) over i in range(k),
    where `k` is the module-level number of hidden values.

    The two np.where lookups the previous version performed here were never
    used (estep_numerator does its own lookups), so they have been removed.
    """
    denom = 0
    for i in range(k):
        denom += estep_numerator(i,t,pr_given_z,pz)
    return denom

def mstep_pr_givenZ(i, j, pz_given_r, pr_given_z):
    """M-step update for entry (j, i) of the rating-given-hidden-value table.

    Parameters
    ----------
    i : index of the hidden value.
    j : index of the movie.
    pz_given_r : array indexed [i, t] with the E-step posteriors (called `rho`
        by the caller).
    pr_given_z : array indexed [j, i] with the current parameters (called
        `probR` by the caller).

    Returns
    -------
    (sum of pz_given_r[i, t] over rows t that rated movie j as 1
     + pr_given_z[j, i] * sum of pz_given_r[i, t] over rows t with no rating for j)
    divided by the sum of pz_given_r[i, t] over all t.

    Reads the module-level `ratings` matrix. The denominator now uses the
    `pz_given_r` argument; it previously read the global `rho`, which was only
    correct when the caller happened to pass that same array.
    """
    column = ratings[:, j]

    # Rows where movie j was rated 1 contribute their full posterior weight.
    t_seen_index = np.flatnonzero(column == 1)
    sum_seen = np.sum(pz_given_r[i, t_seen_index])

    # Rows with no rating for movie j contribute their posterior weight scaled
    # by the current estimate pr_given_z[j, i].
    t_unseen_index = np.flatnonzero(column == -1)
    sum_unseen = np.sum(pz_given_r[i, t_unseen_index]) * pr_given_z[j, i]

    return (sum_seen + sum_unseen) / np.sum(pz_given_r[i])

def logLL(pr_given_z,pz):
    """Average log-likelihood of the observed ratings under the current parameters.

    For every rating row t, the likelihood is the sum over i in range(k) of
    pz[i] * prod(pr_given_z[j, i] for movies rated 1)
          * prod(1 - pr_given_z[j, i] for movies rated 0);
    entries equal to -1 are left out. The logs are summed over the T rows and
    divided by T.

    Reads the module-level `ratings`, `T` and `k`.
    """
    total = 0
    for t in range(T):
        # The rated/unrated index sets depend only on the row, so they are
        # looked up once per row rather than once per hidden value.
        row = ratings[t, :]
        j_recommend = np.flatnonzero(row == 1)
        j_no_recommend = np.flatnonzero(row == 0)

        likelihood = 0
        for i in range(k):
            likelihood += pz[i] * np.prod(pr_given_z[j_recommend, i]) * np.prod(1 - pr_given_z[j_no_recommend, i])
        total += np.log(likelihood)

    return total / T

In [None]:
# EM fit. probR is indexed [movie j, hidden value i]; probZ is indexed [i].
probR = np.loadtxt('hw8_probR_init.txt')
probZ = np.loadtxt('hw8_probZ_init.txt')
rho = np.empty([k,T], dtype='float64')        # E-step posteriors, indexed [i, t]
pz_tmp = np.empty(k)                          # scratch buffer for the updated probZ
pr_given_Z_temp = np.empty([len(movie), k])   # scratch buffer for the updated probR
L = []                                        # log-likelihood before each iteration's update

num_iterations = 256
report_at = {0,1,2,4,8,16,32,64,128,256}

for iteration in range(num_iterations+1):
    L.append(logLL(probR, probZ))
    if iteration in report_at:
        print("iteration: %d, log-likelihood L: %.4f" % (iteration, L[iteration]))

    # E-step: posterior over hidden values for every rating row.
    # The normaliser depends only on t, so it is computed once per row instead
    # of once per (i, t) pair.
    for t in range(T):
        denom = estep_denominator(t,probR,probZ)
        for i in range(k):
            rho[i][t] = estep_numerator(i,t,probR,probZ) / denom

    # M-step: re-estimate both parameter tables from rho and the current probR.
    for i in range(k):
        pz_tmp[i] = np.sum(rho[i][:]) / T
        for j in range(len(movie)):
            pr_given_Z_temp[j][i] = mstep_pr_givenZ(i,j,rho,probR)

    # Copy out of the scratch buffers. Assigning the buffers themselves would
    # make probR/probZ aliases of the arrays the next M-step writes into, so
    # the parameters would be modified in place while still being read.
    probZ = pz_tmp.copy()
    probR = pr_given_Z_temp.copy()

iteration: 0, log-likelihood L: -27.0358
iteration: 1, log-likelihood L: -17.5604
iteration: 2, log-likelihood L: -16.0024
iteration: 4, log-likelihood L: -15.0606
iteration: 8, log-likelihood L: -14.5016
iteration: 16, log-likelihood L: -14.2638
iteration: 32, log-likelihood L: -14.1802
iteration: 64, log-likelihood L: -14.1701
iteration: 128, log-likelihood L: -14.1640


In [None]:
# Find the position of one ID string in `id` (the lines read from the ids file).
# `idx` is later used as a row index into `ratings` — this assumes the ids file
# and the ratings file list raters in the same order (TODO confirm).
# list.index raises ValueError if the ID is not present.
PID = "A59010711"
idx = id.index(PID)

In [None]:
my_data = ratings[idx,:]                             # this rater's row of `ratings`
my_unseen = np.asarray(my_data == -1).nonzero()[0]   # movie indices with no rating (-1)

# Posterior over the hidden value given this rater's observed ratings. It does
# not depend on the movie being scored, so it is computed once up front rather
# than once per unseen movie.
evidence = estep_denominator(idx, probR, probZ)
posterior = np.array([estep_numerator(i, idx, probR, probZ) / evidence for i in range(k)])

# Expected rating of an unseen movie l: sum over i of posterior[i] * probR[l, i].
# The fitted table probR is used directly; the previous code called
# mstep_pr_givenZ here, which performs a further parameter update from `rho`
# (computed before the last M-step) instead of reading the fitted value.
expected_ratings = []
for l in my_unseen:
    exp_rating = np.dot(posterior, probR[l, :])
    expected_ratings.append((exp_rating, movie[l]))

expected_ratings.sort(reverse=True)
pd.DataFrame(expected_ratings, columns=['Expected rating', 'Movie'])

In [None]:
PID = 'A59010711'
idx = id.index(PID)                                 # raises ValueError if PID is not listed
my_data = ratings[idx,:]                            # this rater's row of `ratings`
unseen = np.asarray(my_data == -1).nonzero()[0]     # movie indices with no rating (-1)

# Posterior over the hidden value given this rater's observed ratings; it is
# the same for every movie, so it is computed once.
evidence = estep_denominator(idx, probR, probZ)
posterior = np.array([estep_numerator(i, idx, probR, probZ) / evidence for i in range(k)])

# Expected rating of an unseen movie l: sum over i of posterior[i] * probR[l, i].
# Fixes relative to the previous version of this cell:
#   * the per-value term was an M-step result divided by sum(rho[i]) a second
#     time, which is no longer a probability;
#   * the final table paired *unsorted* titles with the *sorted* (score, title)
#     tuples, so scores were shown next to the wrong movies.
expected_ratings = []
for l in unseen:
    exp_rating = np.dot(posterior, probR[l, :])
    expected_ratings.append((exp_rating, movie[l]))

expected_ratings.sort(reverse=True)

# One row per unseen movie, highest expected rating first.
pd.DataFrame(
    [(title, score) for score, title in expected_ratings],
    columns=['Movie', 'Expected rating'],
)

