In [1]:
import numpy as np
import pandas as pd
# Load Input files

# Load Input files
studentIds = open("hw8_ids.txt").read().splitlines()
movieTitles = open("hw8_movies.txt").read().splitlines()
movieRatings = np.genfromtxt("hw8_ratings.txt", dtype="str")

In [2]:
movieMeanRatings = []
for i in range(len(movieTitles)):
    movieRatingsCol = movieRatings[:,i]
    numRecommended = (movieRatingsCol == "1").sum()
    numSeen = (movieRatingsCol != "?").sum()
    movieMeanRatings.append((numRecommended/numSeen, movieTitles[i]))
movieMeanRatings.sort(reverse=True)
for meanR, mTitle in movieMeanRatings:
    print(mTitle)

Inception
Shutter_Island
The_Dark_Knight_Rises
The_Martian
Interstellar
The_Theory_of_Everything
Harry_Potter_and_the_Deathly_Hallows:_Part_2
The_Social_Network
Harry_Potter_and_the_Deathly_Hallows:_Part_1
Gone_Girl
The_Lion_King
Wolf_of_Wall_Street
Avengers:_Infinity_War
Avengers:_Endgame
Now_You_See_Me
Django_Unchained
The_Farewell
The_Avengers
Parasite
Black_Swan
Spiderman:_Far_From_Home
21_Jump_Street
Les_Miserables
Joker
The_Perks_of_Being_a_Wallflower
Iron_Man_2
12_Years_a_Slave
The_Hunger_Games
Toy_Story_3
Thor
The_Hateful_Eight
Hidden_Figures
Ready_Player_One
The_Girls_with_the_Dragon_Tattoo
Captain_America:_The_First_Avenger
Her
Dunkirk
The_Great_Gatsby
Darkest_Hour
Three_Billboards_Outside_Ebbing
Once_Upon_a_Time_in_Hollywood
Manchester_by_the_Sea
Midnight_in_Paris
La_La_Land
Avengers:_Age_of_Ultron
Ex_Machina
The_Revenant
X-Men:_First_Class
Frozen
Jurassic_World
Pitch_Perfect
The_Help
Us
Drive
Mad_Max:_Fury_Road
Fast_Five
Pokemon_Detective_Pikachu
Good_Boys
Room
Terminator:_

In [3]:
# Constants
K = 4
T = movieRatings.shape[0]
NUM_MOVIES = movieRatings.shape[1]
NUM_ITERATION = 256

In [4]:
# Load prob initilization
probZ_init = np.loadtxt('hw8_probZ_init.txt') 
probR_givenZ_init = np.loadtxt('hw8_probR_init.txt')

In [5]:
# Helpers
def estep_numerator(i, t, probZ, probR_givenZ):
    j_rec = np.asarray(movieRatings[t,:] == "1").nonzero()
    j_notrec = np.asarray(movieRatings[t,:] == "0").nonzero()
    return probZ[i] * np.prod(probR_givenZ[j_rec,i]) * np.prod(1-probR_givenZ[j_notrec,i])

def estep_denominator(t, probZ, probR_givenZ):
    denom = 0
    j_rec = np.asarray(movieRatings[t,:] == "1").nonzero()
    j_notrec = np.asarray(movieRatings[t,:] == "0").nonzero()
    for i in range(K):
        denom += probZ[i] * np.prod(probR_givenZ[j_rec,i]) * np.prod(1-probR_givenZ[j_notrec,i])
    return denom

def mstep_probR_givenZ(i, j, posteriors, probR_givenZ):
    # Seen port
    t_seen = np.asarray(movieRatings[:,j] == "1").nonzero()
    sum_seen = np.sum(posteriors[i, t_seen])
    # Unseen part
    t_unseen = np.asarray(movieRatings[:,j] == "?").nonzero()
    sum_unseen = np.sum(posteriors[i, t_unseen]) * probR_givenZ[j, i]
    return sum_seen + sum_unseen

def mstep_prz(i, j, posteriors, priors):
    # sum over students who recommended movie j (I(r_j,1))
    t_seen, = np.where(movieRatings[:,j] == '1')
    numer_seen = np.sum(posteriors[i,t_seen])
    # sum over students who have not seen movie j
    t_unseen, = np.where(movieRatings[:,j] == '?')
    numer_unseen = priors[j,i]*np.sum(posteriors[i,t_unseen])
    return numer_seen+numer_unseen

def logLikelihood(probZ, probR_givenZ):
    logL = 0
    for t in range(T):
        likelihood = 0
        for i in range(K):
            j_rec = np.asarray(movieRatings[t,:] == "1").nonzero()
            j_notrec = np.asarray(movieRatings[t,:] == "0").nonzero()
            likelihood += probZ[i] * np.prod(probR_givenZ[j_rec, i]) * np.prod(1-probR_givenZ[j_notrec,i])
        logL += np.log(likelihood)
    return logL/T

def likelihood(t, pz, priors):
    cumsum = 0
    for i in range(K):
        j_rec, = np.where(movieRatings[t,:] == '1') 
        j_notrec, = np.where(movieRatings[t,:] == '0')
        cumsum += pz[i]*np.prod(priors[j_rec,i])*np.prod(1-priors[j_notrec,i])
    return cumsum

def EM():
    # Initialization
    probZ = np.copy(probZ_init)
    probR_givenZ = np.copy(probR_givenZ_init)
    posteriors = np.empty([K,T], dtype='float64')
    probZ_temp = np.empty(K)
    probR_givenZ_temp = np.empty([NUM_MOVIES, K])
    L = [] #log-likelihoods for each iteration
    
    for iteration in range(NUM_ITERATION+1):
        # Show the log-likelihood
        L.append(logLikelihood(probZ, probR_givenZ))
        if iteration in {0,1,2,4,8,16,32,64,128,256}:
            print("iteration: %d, log-likelihood L: %.4f" % (iteration, L[iteration]))
        
        # estep - update the posteriors
        for t in range(T):
            e_denom = estep_denominator(t, probZ, probR_givenZ)
            for i in range(K):
                posteriors[i,t] = estep_numerator(i, t, probZ, probR_givenZ)/e_denom
        # mstep - update the CPTs
        for i in range(K):
            sum_posteriors = np.sum(posteriors[i,:])
            probZ_temp[i] = sum_posteriors/T
            for j in range(NUM_MOVIES):
                probR_givenZ_temp[j, i] = mstep_probR_givenZ(i, j, posteriors, probR_givenZ)/sum_posteriors
                #probR_givenZ_temp[j, i] = mstep_prz(i, j, posteriors, probR_givenZ)/sum_posteriors
        # Update CPTs
        probZ = probZ_temp
        probR_givenZ = probR_givenZ_temp
        
    return L, posteriors, probZ, probR_givenZ

In [6]:
L, posteriors, probZ, probR_givenZ = EM()

iteration: 0, log-likelihood L: -27.0358
iteration: 1, log-likelihood L: -17.5604
iteration: 2, log-likelihood L: -16.0024
iteration: 4, log-likelihood L: -15.0606
iteration: 8, log-likelihood L: -14.5016
iteration: 16, log-likelihood L: -14.2638
iteration: 32, log-likelihood L: -14.1802
iteration: 64, log-likelihood L: -14.1701
iteration: 128, log-likelihood L: -14.1640
iteration: 256, log-likelihood L: -14.1637


In [7]:
# Constants
PID = "A59010711"
indexPID = studentIds.index(PID)
indexPID

361

In [9]:
my_data = movieRatings[indexPID,:]
my_unseen = np.asarray(my_data == '?').nonzero()[0]
expected_ratings = []

for l in my_unseen:
    exp_rating = 0
    for i in range(1):
        estep_term = estep_numerator(i, indexPID, probZ, probR_givenZ)/estep_denominator(indexPID, probZ, probR_givenZ)
        mstep_term = mstep_probR_givenZ(i,l, posteriors, probR_givenZ)/np.sum(posteriors[i,:])
        exp_rating += estep_term * mstep_term
    expected_ratings.append((exp_rating, movieTitles[l]))

expected_ratings.sort(reverse=True)   
pd.DataFrame(expected_ratings, columns=['Expected rating', 'Movie'])

Unnamed: 0,Expected rating,Movie
0,1.287446e-06,The_Farewell
1,1.145108e-06,21_Jump_Street
2,1.120129e-06,The_Perks_of_Being_a_Wallflower
3,1.107805e-06,Gone_Girl
4,1.030447e-06,Django_Unchained
5,9.7233e-07,12_Years_a_Slave
6,9.409807e-07,Dunkirk
7,9.387228e-07,Parasite
8,8.565328e-07,Us
9,8.301609e-07,Pitch_Perfect
