In [33]:
import os

import lda
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.decomposition import LatentDirichletAllocation

In [34]:

# Read the movies table (movieId, title, genres) and preview its first rows
movies = pd.read_csv(filepath_or_buffer='ml-latest-small/movies.csv')
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [66]:
# ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [35]:
# Read the ratings table (userId, movieId, rating, timestamp) and preview it
ratings = pd.read_csv(filepath_or_buffer='ml-latest-small/ratings.csv')
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Now that we know the structure of our dataset, how many records do we have in each of these tables?

In [36]:
# Report the row count of each table (number of ratings, number of movies)
print('The dataset contains: ', ratings.shape[0], ' ratings of ', movies.shape[0], ' movies.')

The dataset contains:  100836  ratings of  9742  movies.


## Movie-level Clustering

In [37]:
# Attach each rating's movie title, then reshape into a users x titles frame
# (one row per userId, one column per title). Cells with no rating are NaN.
# pivot_table's default aggregation (mean) applies whenever one user has
# several ratings that share the same title.
ratings_title = ratings.merge(movies[['movieId', 'title']], on='movieId')
user_movie_ratings = ratings_title.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)

print('dataset dimensions: ', user_movie_ratings.shape, '\n\nSubset example:')
user_movie_ratings.iloc[:6, :10]

dataset dimensions:  (610, 9719) 

Subset example:


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
5,,,,,,,,,,
6,,,,,,,,,,


In [38]:
# user_movie_ratings =  pd.pivot_table(ratings_title, index='userId', columns= 'title', values='rating')

To have sklearn run LDA topic modelling on a dataset with missing values like this, we will first cast it to the sparse CSR matrix type defined in the SciPy library.

In [39]:
# Build a SciPy CSR matrix of the users x movies ratings.
# Unrated cells are NaN in the pivot table. NaN is not equal to the sparse
# fill_value (0), so converting the frame as-is stores every NaN cell
# explicitly and produces a fully populated "sparse" matrix full of NaNs.
# Replacing NaN with 0 first means only the real ratings are stored.
sparse_df = user_movie_ratings.fillna(0).astype(pd.SparseDtype("float64", fill_value=0))
sparse_ratings = csr_matrix(sparse_df.sparse.to_coo())

In [40]:
# Overwrite, in place, every NaN among the matrix's stored values with 0.0
np.putmask(sparse_ratings.data, np.isnan(sparse_ratings.data), 0.0)

# LDA

LDA introduces a latent variable (topics) 'z' that follows a categorical distribution of size k (hyperparameter). 

It decomposes "User x Movie" (dim: U x M) sparse matrix into "User x Topics" (dim: U x k) and "Topics x Movie" (dim: k x M) matrices.

$$ \sum_{i=1}^{M} P( Movie=i | Topic) = 1, $$

AND

$$ \sum_{j=1}^{k} P(Topic=j \mid User) = 1 $$

In [41]:
# Fit a 20-topic LDA model on the dense users x movies matrix.
# NOTE(review): astype(int) truncates fractional ratings before they are used
# as counts (e.g. 3.5 -> 3 and 0.5 -> 0) -- confirm this is intended.
n_topics = 20
rating_counts = sparse_ratings.toarray().astype(int)
mdl_lda = lda.LDA(n_topics=n_topics, n_iter=1000, random_state=1)
mdl_lda.fit(rating_counts)

INFO:lda:n_documents: 610
INFO:lda:vocab_size: 9719
INFO:lda:n_words: 337872
INFO:lda:n_topics: 20
INFO:lda:n_iter: 1000
INFO:lda:<0> log likelihood: -4049863
INFO:lda:<10> log likelihood: -2935958
INFO:lda:<20> log likelihood: -2805373
INFO:lda:<30> log likelihood: -2763604
INFO:lda:<40> log likelihood: -2741010
INFO:lda:<50> log likelihood: -2727044
INFO:lda:<60> log likelihood: -2717520
INFO:lda:<70> log likelihood: -2710634
INFO:lda:<80> log likelihood: -2705019
INFO:lda:<90> log likelihood: -2700218
INFO:lda:<100> log likelihood: -2696827
INFO:lda:<110> log likelihood: -2693261
INFO:lda:<120> log likelihood: -2690425
INFO:lda:<130> log likelihood: -2688423
INFO:lda:<140> log likelihood: -2685960
INFO:lda:<150> log likelihood: -2684454
INFO:lda:<160> log likelihood: -2682573
INFO:lda:<170> log likelihood: -2682030
INFO:lda:<180> log likelihood: -2680702
INFO:lda:<190> log likelihood: -2679846
INFO:lda:<200> log likelihood: -2678526
INFO:lda:<210> log likelihood: -2676333
INFO:lda:<

<lda.lda.LDA at 0x7f96da6b9160>

In [42]:
# Settings and containers used when listing the movies of each topic, plus the
# fitted topic x movie probability matrix.
n_top_words = 10
topic_words = list()
topic_word = mdl_lda.topic_word_

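Rank the movies within each topic by their topic probability, most probable first; the first 10 entries of each ranking are that topic's top movies.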

In [43]:
# Rank every movie title within each topic, most probable first.
# The list is rebuilt here rather than appended to the `topic_words` list
# created in an earlier cell, so this cell is idempotent: running it again
# does not add a second copy of every topic's ranking.
movie_titles = np.array(user_movie_ratings.columns)
topic_words = [movie_titles[np.argsort(topic_dist)[::-1]] for topic_dist in topic_word]

# Uncomment to list the `n_top_words` most probable titles of each topic:
# for i, ranked_titles in enumerate(topic_words):
#     print('Topic {}: {}'.format(i, ' '.join(ranked_titles[:n_top_words])))

In [44]:
# skl_lda = LatentDirichletAllocation(n_components=3, random_state=1)
# skl_lda.fit(sparse_ratings)

## LDA measures

In [45]:
# Fitted-model attributes printed below:
#   topic_word_ : [n_topics, n_features]  topic-word distributions (alias for components_)
#   nzw_        : [n_topics, n_features]  topic-word assignment counts, final iteration
#   ndz_        : [n_samples, n_topics]   document-topic assignment counts, final iteration
#   doc_topic_  : [n_samples, n_topics]   point estimate of the document-topic distributions (Theta)
#   nz_         : [n_topics]              topic assignment counts, final iteration
for heading, values in (
    ("topic_word (cluster-movie) distributions:", mdl_lda.topic_word_),
    ("topic-word (cluster-movie) assignments :", mdl_lda.nzw_),
    ("document-topic (user-cluster) assignments :", mdl_lda.ndz_),
    ("document-topic (user-cluster) distributions :", mdl_lda.doc_topic_),
    ("topic assignment counts :", mdl_lda.nz_),
):
    print(f"{heading}\n{values}")



topic_word (cluster-movie) distributions:
[[1.24005015e-06 1.24005015e-06 1.24005015e-06 ... 1.24005015e-06
  1.24005015e-06 1.24005015e-06]
 [3.59891011e-07 1.44316295e-04 3.59891011e-07 ... 3.59891011e-07
  3.59891011e-07 3.59891011e-07]
 [4.78395881e-07 4.78395881e-07 2.87515925e-04 ... 4.78395881e-07
  4.78395881e-07 4.78395881e-07]
 ...
 [6.96175698e-07 6.96175698e-07 6.96175698e-07 ... 6.96175698e-07
  9.75342153e-04 7.03137455e-05]
 [5.09160044e-07 5.09160044e-07 5.09160044e-07 ... 5.09160044e-07
  5.09160044e-07 5.09160044e-07]
 [6.96903449e-07 6.96903449e-07 6.96903449e-07 ... 6.96903449e-07
  6.96903449e-07 6.96903449e-07]]
topic-word (cluster-movie) assignments :
[[ 0  0  0 ...  0  0  0]
 [ 0  4  0 ...  0  0  0]
 [ 0  0  6 ...  0  0  0]
 ...
 [ 0  0  0 ...  0 14  1]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
document-topic (user-cluster) assignments :
[[  0   0   0 ... 188   2   0]
 [  0   0   0 ...   0  89   0]
 [  0   0   0 ...   0   0  55]
 ...
 [ 48 181   0 ... 1

In [46]:
# rat_corr = ratings.corrwith(pd.DataFrame(mdl_lda.doc_topic_), axis=1)
# Total number of topic assignments, added up over all topics (nz_ holds the
# per-topic assignment counts from the final iteration).
sum(mdl_lda.nz_)

337872

In [47]:
# mov_cor = movies.corrwith(pd.DataFrame(mdl_lda.topic_word_), axis=0)

In [48]:
# Dimensions of the document-topic matrix: (number of users, number of topics)
np.shape(mdl_lda.doc_topic_)

(610, 20)

In [49]:
# Preview the last five users of the users x titles rating frame
user_movie_ratings.iloc[-5:]

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,4.5,3.5,,,
609,,,,,,,,,,,...,,,,,,,,,,
610,4.0,,,,,,,,3.5,,...,,4.0,3.5,3.0,,,2.0,1.5,,


In [50]:
#TODO: 
# 1. Find clusters of movies and users based on a probability threshold
# 2. Train classification model to classify users into one of the above 
# clusters based on their similarity measure between different cluster user groups and recommend movies from that group

In [51]:
# Wrap the model's probability matrices in labelled frames:
#   movie_cluster_dist -> one row per topic, one column per movie title
#   user_cluster_dist  -> one row per userId, one column per topic
movie_cluster_dist = pd.DataFrame(
    topic_word,
    index=pd.RangeIndex(n_topics),
    columns=user_movie_ratings.columns,
)
user_cluster_dist = pd.DataFrame(
    mdl_lda.doc_topic_,
    index=user_movie_ratings.index,
    columns=pd.RangeIndex(n_topics),
)

In [52]:
# Display the per-user topic distribution (one row per user, one column per topic)
user_cluster_dist

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0.000099,0.000099,0.000099,0.000099,0.000099,0.000099,0.000099,0.000099,0.472020,0.000099,0.263153,0.000099,0.075961,0.000099,0.000099,0.000099,0.000099,0.185320,0.002069,0.000099
2,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.000893,0.188393,0.000893,0.000893,0.795536,0.000893
3,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.322619,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.001190,0.655952
4,0.000130,0.000130,0.281948,0.000130,0.000130,0.159870,0.000130,0.000130,0.000130,0.354675,0.175455,0.000130,0.000130,0.000130,0.000130,0.000130,0.000130,0.026104,0.000130,0.000130
5,0.000617,0.000617,0.000617,0.000617,0.000617,0.000617,0.000617,0.000617,0.000617,0.074691,0.000617,0.000617,0.000617,0.000617,0.908025,0.000617,0.000617,0.000617,0.000617,0.006790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.000026,0.050000,0.069781,0.000026,0.000026,0.073165,0.000026,0.400859,0.000026,0.165565,0.035685,0.061452,0.032561,0.000026,0.007834,0.000026,0.102837,0.000026,0.000026,0.000026
607,0.039577,0.048028,0.000141,0.000141,0.000141,0.232535,0.000141,0.000141,0.439577,0.000141,0.000141,0.000141,0.157887,0.000141,0.080423,0.000141,0.000141,0.000141,0.000141,0.000141
608,0.019901,0.074928,0.000041,0.000041,0.239181,0.000041,0.000041,0.000041,0.160157,0.000041,0.172983,0.136988,0.035209,0.000041,0.031485,0.000041,0.000041,0.065825,0.000041,0.062929
609,0.204065,0.000813,0.000813,0.000813,0.000813,0.000813,0.000813,0.000813,0.000813,0.155285,0.000813,0.000813,0.000813,0.000813,0.626829,0.000813,0.000813,0.000813,0.000813,0.000813


In [53]:
# Sum of topic 0's probabilities over all movie titles. This value is computed
# but not shown: only the last expression of the cell is displayed.
sum(movie_cluster_dist.loc[0])
# Sum of the topic-0 column over all users (this is the value displayed).
sum(user_cluster_dist.loc[:, 0])

32.145048556887524

In [54]:
# Sum of the topic-0 column of user_cluster_dist over all users.
sum(user_cluster_dist.loc[:, 0])

32.145048556887524

In [55]:
# Raw assignment counts from the model's final iteration:
# topic-by-word (nzw_) and document-by-topic (ndz_).
topic_word_assgn, doc_topic_assgn = mdl_lda.nzw_, mdl_lda.ndz_

In [56]:
# Labelled frames for the raw assignment counts:
#   movie_cluster_assgn -> one row per topic, one column per movie title
#   user_cluster_assgn  -> one row per userId, one column per topic
movie_cluster_assgn = pd.DataFrame(
    topic_word_assgn,
    index=pd.RangeIndex(n_topics),
    columns=user_movie_ratings.columns,
)
user_cluster_assgn = pd.DataFrame(
    doc_topic_assgn,
    index=user_movie_ratings.index,
    columns=pd.RangeIndex(n_topics),
)

In [57]:
# Per movie title: correlation, taken across topics, between the assignment
# counts and the topic-word probabilities.
movie_corr = movie_cluster_assgn.corrwith(movie_cluster_dist, axis='index')
# Per user: correlation, taken across topics, between the assignment counts
# and the document-topic probabilities.
user_corr = user_cluster_assgn.corrwith(user_cluster_dist, axis='columns')

In [58]:
# Show the per-movie correlations followed by the per-user correlations
print(movie_corr, user_corr, sep='\n')

title
'71 (2014)                                   0.999998
'Hellboy': The Seeds of Creation (2004)      0.999954
'Round Midnight (1986)                       0.999988
'Salem's Lot (2004)                          0.999993
'Til There Was You (1997)                    0.999993
                                               ...   
eXistenZ (1999)                              0.954601
xXx (2002)                                   0.998473
xXx: State of the Union (2005)               0.999996
¡Three Amigos! (1986)                        0.981719
À nous la liberté (Freedom for Us) (1931)    0.999791
Length: 9719, dtype: float64
userId
1      1.0
2      1.0
3      1.0
4      1.0
5      1.0
      ... 
606    1.0
607    1.0
608    1.0
609    1.0
610    1.0
Length: 610, dtype: float64


In [59]:
# Spot-check a single user: print the topic with the largest assignment count,
# the topic with the largest probability, and then the number of users.
x = 587
print(user_cluster_assgn.loc[x].idxmax())
print(user_cluster_dist.loc[x].idxmax())
print(user_cluster_assgn.shape[0])

5
5
610


In [60]:
# Hard-assign every user to the topic that holds most of their assignment
# counts. idxmax(axis=1) walks the frame's own userId index in order, so the
# result does not depend on the ids being exactly 1..N: a gap in the ids
# cannot raise KeyError and ids larger than N are not skipped. Ties resolve to
# the first (lowest-numbered) topic.
lda_predictions = user_cluster_assgn.idxmax(axis=1).tolist()

In [61]:
# Convert the per-user topic labels into an integer NumPy array
lda_predictions = np.asarray(lda_predictions, dtype=int)

In [62]:
# Write one predicted topic id per line. The output directory is created first
# so the cell also runs on a fresh checkout where ./preds does not exist yet
# (np.savetxt does not create missing parent directories).
os.makedirs("./preds", exist_ok=True)
np.savetxt("./preds/lda_preds.txt", lda_predictions.astype(int), fmt="%i")

In [63]:
# For one example title: its probability under each topic, then its raw
# per-topic assignment counts.
print(movie_cluster_dist["(500) Days of Summer (2009)"])
print(movie_cluster_assgn["(500) Days of Summer (2009)"])

0     1.240050e-06
1     3.598910e-07
2     4.783959e-07
3     1.601376e-03
4     5.901728e-07
5     5.796366e-07
6     1.713183e-03
7     1.242670e-06
8     3.561483e-07
9     8.619062e-07
10    4.926305e-07
11    3.943499e-07
12    7.794724e-07
13    4.241115e-04
14    3.947391e-07
15    1.844166e-03
16    3.733005e-03
17    6.961757e-07
18    5.096692e-04
19    6.969034e-07
Name: (500) Days of Summer (2009), dtype: float64
0      0
1      0
2      0
3     15
4      0
5      0
6     40
7      0
8      0
9      0
10     0
11     0
12     0
13     3
14     0
15    31
16    46
17     0
18    10
19     0
Name: (500) Days of Summer (2009), dtype: int32


In [64]:
# search_str = "Willy Wonka & the Chocolate Factory (1971)"
# for ind, topic in enumerate(topic_words):
#     if search_str in topic:
#         print(f"Found {search_str} in topic {ind}")