In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Names of the text files this notebook reads; they are passed to open()
# as-is, so they are resolved relative to the current working directory.
id_file = 'hw8_ids.txt'
movie_file = 'hw8_movies.txt'
probR_file = 'hw8_probR_init.txt'
probZ_file = 'hw8_probZ_init.txt'
rating_file = 'hw8_ratings.txt'

In [23]:
def loadFile(filename, multi_col=False):
    """Read a text file into a NumPy array of strings, one entry per line.

    Parameters
    ----------
    filename : path of the file to read.
    multi_col : when true, every line is split on whitespace, giving one row
        per line and one column per token; otherwise each line, with
        surrounding whitespace removed, becomes a single element.

    Returns
    -------
    numpy.ndarray built from the per-line entries. Values are left as
    strings; callers convert with ``astype`` where they need numbers.
    """
    with open(filename) as handle:
        stripped_lines = [raw.strip() for raw in handle]

    if multi_col:
        rows = [text.split() for text in stripped_lines]
    else:
        rows = stripped_lines

    return np.asarray([np.array(row) for row in rows])

In [167]:
# Load the five input files as NumPy arrays (despite the _df suffix these are
# arrays, not DataFrames).  The two probability tables are converted to
# float64; IDs, movie titles and ratings stay as strings.
id_df = loadFile(id_file)
movie_df = loadFile(movie_file)
# Split into columns; later code indexes this as probR_df[movie][z].
probR_df = loadFile(probR_file, True).astype(np.float64)
# One value per line; later code indexes this as probZ_df[z].
probZ_df = loadFile(probZ_file).astype(np.float64)
# Split into columns; later code indexes this as rating_df[student][movie]
# and compares entries against the strings '1' and '?'.
rating_df = loadFile(rating_file, True)

In [168]:
# Per-movie counters (one slot per column of rating_df), both starting at zero.
num_rating_columns = rating_df.shape[1]
recommended_df = np.zeros(num_rating_columns)
seen_df = np.zeros(num_rating_columns)
del num_rating_columns  # helper only; keep the notebook namespace unchanged

In [38]:
# Tally, per movie (column of rating_df), how many students recommended it
# (entry '1') and how many rated it at all (any entry other than '?').
# The counts are assigned into the existing arrays instead of being added to
# them, so re-running this cell cannot double-count, and the element-wise
# comparison replaces the nested Python loop.
recommended_df[:] = (rating_df == '1').sum(axis=0)
seen_df[:] = (rating_df != '?').sum(axis=0)

In [54]:
# Scratch look at the per-movie ratio of recommendations to ratings.
# NOTE(review): the output recorded under this cell is a dtype, not an array,
# so it presumably came from an earlier `.dtype` version of this expression -
# confirm by re-running.
(recommended_df/seen_df)

dtype('float64')

In [41]:
# Per movie: number of '1' ratings divided by the number of non-'?' ratings.
# NOTE(review): a movie with no ratings at all would give a 0/0 here - confirm
# every movie has at least one rating.
popularity_df = recommended_df/seen_df

In [47]:
# Pair each movie title with its ratio: stack the two 1-D arrays as rows, then
# transpose so there is one (movie, popularity) row per movie.
# NOTE(review): putting titles and floats into one array presumably stores the
# ratios as strings - confirm the dtype before doing arithmetic on that column.
popularity_df = np.vstack((movie_df, popularity_df)).transpose((1, 0))

In [59]:
# Wrap the (movie, popularity) pairs in a labelled DataFrame.  The stacked
# array being wrapped carries the ratios as text, so the "popularity" column
# is converted back to float64 here; otherwise sorting by it compares strings
# character by character instead of comparing numbers.
popularity_df = pd.DataFrame(
    data=popularity_df, columns=["movie", "popularity"]
).astype({"popularity": np.float64})

## 8.1 a)

In [67]:
# Temporarily lift pandas' row/column display limits so the whole ranking is
# printed, most popular movie first.
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None):
    print(popularity_df.sort_values(by='popularity', ascending=False))

                                           movie           popularity
0                                      Inception   0.9657534246575342
3                                 Shutter_Island   0.9444444444444444
21                         The_Dark_Knight_Rises   0.9416342412451362
47                                   The_Martian   0.9393939393939394
39                                  Interstellar   0.9385665529010239
40                      The_Theory_of_Everything   0.9195402298850575
11  Harry_Potter_and_the_Deathly_Hallows:_Part_2   0.9178082191780822
1                             The_Social_Network   0.9016393442622951
5   Harry_Potter_and_the_Deathly_Hallows:_Part_1   0.9006622516556292
37                                     Gone_Girl   0.8944099378881988
65                                 The_Lion_King   0.8943661971830986
28                           Wolf_of_Wall_Street   0.8916666666666667
50                        Avengers:_Infinity_War   0.8785714285714286
62                  

## 8.1 e)

In [123]:
# Integer-coded copy of the ratings, with the missing marker '?' mapped to 2.
# It is kept under its own name on purpose: the EM functions and the
# prediction cell further down compare rating_df entries against the strings
# '1' and '?', so rating_df itself must remain a string array for a
# top-to-bottom run of the notebook to give correct results.
rating_int_df = rating_df.copy()
rating_int_df[rating_int_df == '?'] = '2'
rating_int_df = rating_int_df.astype(np.int32)

In [169]:
# Uninitialised scratch tables, sized from the number of rows of rating_df and
# the number of entries of probZ_df.  They hold no meaningful values until
# they are filled in.
likelihood_table = np.empty(rating_df.shape[0])
P_Z_table = np.empty(probZ_df.shape[0])
P_R_Z_table = np.empty((rating_df.shape[0], probZ_df.shape[0]))
estep_table = np.empty(P_R_Z_table.shape)

In [170]:
def get_P_R_Z(t_student, z_i):
    """Product over rated movies of the per-movie probability for one student.

    For every movie the student rated (entry other than '?'), multiplies in
    ``probR_df[movie][z_i]`` when the rating is '1' and its complement
    otherwise. Unrated movies contribute nothing.

    Parameters
    ----------
    t_student : row index into ``rating_df``.
    z_i : column index into ``probR_df``.

    Returns
    -------
    The accumulated product (1 when the student rated no movies).
    """
    product = 1
    for movie in range(probR_df.shape[0]):
        rating = rating_df[t_student][movie]
        if rating == '?':
            continue
        if rating == '1':
            product = product * probR_df[movie][z_i]
        else:
            product = product * (1 - probR_df[movie][z_i])
    return product

In [171]:
def get_likelihood(t_student):
    """Sum over z of ``P_R_Z_table[t_student][z] * probZ_df[z]``.

    Reads the cached row of ``P_R_Z_table`` for this student, so that row
    must already be filled in before calling.

    Parameters
    ----------
    t_student : row index into ``P_R_Z_table``.

    Returns
    -------
    The weighted sum (presumably the marginal likelihood of the student's
    ratings under the current parameters).
    """
    total = 0
    cached_row = P_R_Z_table[t_student]
    for z, prior in enumerate(probZ_df):
        total = total + (cached_row[z] * prior)
    return total

In [172]:
def get_estep(t_student, z_i):
    """Return ``probZ_df[z_i] * P_R_Z_table[t][z_i] / likelihood_table[t]``.

    Uses the cached tables, so the entries for this student must already be
    filled in. Presumably this is the posterior weight of z_i for the
    student given their ratings.

    Parameters
    ----------
    t_student : row index into ``P_R_Z_table`` / ``likelihood_table``.
    z_i : index into ``probZ_df`` and column of ``P_R_Z_table``.
    """
    joint = probZ_df[z_i] * P_R_Z_table[t_student][z_i]
    evidence = likelihood_table[t_student]
    return joint / evidence

In [173]:
def update_P_Z(z_i):
    """Average of column ``z_i`` of ``estep_table`` over all students.

    Parameters
    ----------
    z_i : column index into ``estep_table``.

    Returns
    -------
    The column sum divided by the number of rows of ``rating_df``.
    """
    num_students = rating_df.shape[0]
    posterior_total = 0
    for student in range(num_students):
        posterior_total += estep_table[student][z_i]
    return posterior_total / num_students

In [174]:
def update_P_R_Z(r_j, z_i):
    """Compute the new value for ``probR_df[r_j][z_i]`` from ``estep_table``.

    Each student contributes their ``estep_table`` weight for ``z_i``:
    in full when they rated movie ``r_j`` as '1', not at all for any other
    rating, and scaled by the current ``probR_df[r_j][z_i]`` when the movie
    is unrated ('?'). The total is divided by
    ``rating_df.shape[0] * update_P_Z(z_i)``.

    Parameters
    ----------
    r_j : movie index (column of ``rating_df``, row of ``probR_df``).
    z_i : column index into ``estep_table`` and ``probR_df``.
    """
    num_students = rating_df.shape[0]
    weighted = 0
    for student in range(num_students):
        posterior = estep_table[student][z_i]
        rating = rating_df[student][r_j]
        if rating == '?':
            # Unrated: fall back on the current estimate for this movie.
            weighted += posterior * probR_df[r_j][z_i]
        elif rating == '1':
            weighted += posterior

    return weighted / (num_students * update_P_Z(z_i))

In [175]:
def get_log_likelihood():
    """Mean of ``log(likelihood_table[t])`` over all students.

    Reads the cached ``likelihood_table``, so it must be filled in for every
    row of ``rating_df`` before calling.
    """
    num_students = rating_df.shape[0]
    log_sum = 0
    for student in range(num_students):
        log_sum += np.log(likelihood_table[student])
    return log_sum / num_students

In [176]:
# Iteration numbers at which the average log-likelihood is printed.
iterations = [0, 1, 2, 4, 8, 16, 32, 64, 128, 256]
# 257 passes (i = 0..256) so that the last checkpoint above is reached.
for i in range(257):
    # Fill the per-student tables from the current probZ_df / probR_df.
    for t in range(rating_df.shape[0]):
        for z in range(probZ_df.shape[0]):
            P_R_Z_table[t][z] = get_P_R_Z(t, z)
        # Reads the P_R_Z_table row filled just above.
        likelihood_table[t] = get_likelihood(t)
        for z in range(probZ_df.shape[0]):
            # Reads both P_R_Z_table[t] and likelihood_table[t].
            estep_table[t][z] = get_estep(t, z)
    
    # Report before updating the parameters, so the value printed for i is for
    # the parameters after i updates (i = 0 is the initial parameters).
    if i in iterations:
        print(i, get_log_likelihood())
    
    # Overwrite the parameters in place using the freshly filled estep_table.
    for z in range(probZ_df.shape[0]):
        probZ_df[z] = update_P_Z(z)
    for m in range(probR_df.shape[0]):
        for z in range(probR_df.shape[1]):
            # update_P_R_Z reads the current probR_df[m][z] before this
            # assignment replaces it.
            probR_df[m][z] = update_P_R_Z(m, z)

0 -27.03581500351123
1 -17.5604038243144
2 -16.002362630627825
4 -15.060597317892249
8 -14.501649272825004
16 -14.263788571437592
32 -14.180178075094366
64 -14.170077781591056
128 -14.163960358152188
256 -14.16369243900787


## 8.1 f)

In [187]:
# Position of the first entry of id_df equal to this ID; raises IndexError if
# the ID does not occur in id_df.
myrating_index = (id_df == 'A59005342').nonzero()[0][0]
# That student's row of ratings.
myratings = rating_df[myrating_index]

In [194]:
# For every movie this student has not rated ('?'), print the movie name and
# the sum over z of estep_table[student][z] * probR_df[movie][z].
for r in range(rating_df.shape[1]):
    if rating_df[myrating_index][r] != '?':
        continue
    prob = 0
    for z in range(probZ_df.shape[0]):
        prob = prob + (estep_table[myrating_index][z] * probR_df[r][z])
    print(movie_df[r], prob)

Black_Swan 0.8727848959223924
Bridemaids 0.8086924623009729
Les_Miserables 0.9164247584086622
Magic_Mike 0.5611254414894956
12_Years_a_Slave 0.874849486233411
Fifty_Shades_of_Grey 0.4905191350999009
I_Feel_Pretty 0.34538457785655813
Chappaquidick 0.9921741329945875
La_La_Land 0.8583660461746279
Hidden_Figures 0.8861577217506736
Phantom_Thread 0.9093209197954123
Darkest_Hour 0.9748884119260098
The_Lion_King 0.9584398213502858
Rocketman 0.6921673559576419
Fast_&_Furious:_Hobbs_&_Shaw 0.7068480787561562
The_Farewell 0.8947347936947422
Hustlers 0.6054760739701981
