### Tasks
* Create a Jupyter notebook.
* Load the data (which you can find in MovieTweetings).
* Use sklearn.decompose.nmf to create latent vectors for each movie.
* Save the vectors in the following format (user userid, how should have content_id1 and content_id3 recommended, with the predicted ratings being value1 and value2 respectively: Userid content_id1:value1 content_id3:value2
    * For example, for user 1000 (this is only a top-4 rec, list should contain 10-20),
        * 100 1375666:1.420 0482571:0.232 1457767:0.158 1130884:0.113

* Locate the recsys api template, where you should verify that the implementation will work with your implementation(/live-project/recs/non_negative_mf_recommender.py):
    * In the __init__ method, check if the implementation can load your trained vectors.
    * In the recommend_items method, return a recommendation for the user. Use the vectors loaded in the __init__ method.
* Start the MovieGeek site.
    * Find a user with a taste similar to yours by looking through users in the analytics part. This is user_id 100: http://0.0.0.0:8010/analytics/user/100/.
    * Look at the recommendations your algorithm provides.
* Write a report that describes
    * how you implemented your algorithm
    * how you trained the model
    * what you think of the result

In [12]:
from tqdm import tqdm

In [37]:
import pickle
import os

In [3]:
import re

import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import matplotlib.pyplot as plt

In [4]:
# Data from https://github.com/sidooms/MovieTweetings
# user_id::movie_id::rating::rating_timestamp. 
rdf = pd.read_csv("MovieTweetings-master/latest/ratings.dat", 
                 sep="::", engine="python",
                 names=["user_id", "movie_id", "rating", "rating_timestamp"] )

rdf['rating_timestamp'] = pd.to_datetime(rdf['rating_timestamp'], unit='s')
rdf.head()

Unnamed: 0,user_id,movie_id,rating,rating_timestamp
0,1,114508,8,2013-10-05 21:00:50
1,2,75314,1,2020-07-23 01:42:04
2,2,102926,9,2020-05-22 11:46:56
3,2,114369,10,2020-08-16 05:22:27
4,2,118715,8,2020-07-29 07:13:18


In [5]:
# Helper functions
strip_parens = re.compile(r"\s+\(.*\)")
text ="In My Room (2020)"
# strip_parens.sub("", text)
def drop_parens(text):
    return strip_parens.sub("", text)
def extract_year(text):
    return text[text.rfind("(") + 1 : text.rfind(")")]
# extract_year(text)
# extract_year('Remélem legközelebb sikerül meghalnod:) (2018)')

In [6]:
# movies.dat
# Contains the items (i.e., movies) that were rated in the tweets,
# together with their genre metadata in the following 
# format: movie_id::movie_title (movie_year)::genre|genre|genre. For example:

# 0110912::Pulp Fiction (1994)::Crime|Thriller

mdf = pd.read_csv("MovieTweetings-master/latest/movies.dat", 
                 sep="::", engine="python",
                 names=["movie_id", "movie_title", "genres"] )
mdf.genres.fillna(value='', inplace=True)
mdf['title'] = mdf.movie_title.apply(drop_parens)
mdf['movie_year'] = mdf.movie_title.apply(extract_year)
mdf.movie_year = mdf.movie_year.astype('int')
mdf['genre_list'] = mdf.genres.apply(lambda x: x.split("|"))
del mdf['movie_title']
mdf.head()

Unnamed: 0,movie_id,genres,title,movie_year,genre_list
0,8,Documentary|Short,Edison Kinetoscopic Record of a Sneeze,1894,"[Documentary, Short]"
1,10,Documentary|Short,La sortie des usines Lumière,1895,"[Documentary, Short]"
2,12,Documentary|Short,The Arrival of a Train,1896,"[Documentary, Short]"
3,25,,The Oxford and Cambridge University Boat Race,1895,[]
4,91,Short|Horror,Le manoir du diable,1896,"[Short, Horror]"


In [None]:
 
# movies = ['mib', 'st', 'av', 'b', 'ss', 'lm']
# users = ['Sara', 'Jesper', 'Therese', 'Helle', 'Pietro', 'Ekaterina']

# M = pd.DataFrame([
#     [5.0, 3.0, 0.0, 2.0, 2.0, 2.0],
#     [4.0, 3.0, 4.0, 0.0, 3.0, 3.0],
#     [5.0, 2.0, 5.0, 2.0, 1.0, 1.0],
#     [3.0, 5.0, 3.0, 0.0, 1.0, 1.0],
#     [3.0, 3.0, 3.0, 2.0, 4.0, 5.0],
#     [2.0, 3.0, 2.0, 3.0, 5.0, 5.0]],
#     columns=movies,
#     index=users)

In [25]:
user_ids = list(sorted(set(rdf['user_id'].tolist())))

In [26]:
movie_ids = list(sorted(set(rdf['movie_id'].tolist())))

In [27]:
len(movie_ids), movie_ids[-1]

(36380, 12920708)

In [28]:
movie_indices = dict(zip( movie_ids,   range(len(movie_ids)) ))

In [29]:
user_ratings =[]
for uid in user_ids[:5]:
    tmpdf = rdf.query(f' user_id == {uid} ')
    m_ids = tmpdf['movie_id'].tolist()
    ratings = tmpdf['rating'].tolist()
    ratings_ar =  [0] * len(movie_ids)
    for m_id, rating in zip(m_ids, ratings):
        ratings_ar[movie_indices[movie_id]] = rating
    user_ratings.append(ratings_ar)
        

In [30]:
M = pd.DataFrame(user_ratings,    columns=movie_ids,    index=user_ids[:5])

In [31]:
U, Sigma, Vt = np.linalg.svd(M)

In [32]:
U

array([[-0.43322429, -0.43322429, -0.54153036, -0.37907125, -0.43322429],
       [-0.43322429, -0.13095179, -0.16368974, -0.11458282,  0.86904821],
       [-0.43322429,  0.86904821, -0.16368974, -0.11458282, -0.13095179],
       [-0.54153036, -0.16368974,  0.79538783, -0.14322852, -0.16368974],
       [-0.37907125, -0.11458282, -0.14322852,  0.89974004, -0.11458282]])

In [33]:
Sigma

array([18.46618531,  0.        ,  0.        ,  0.        ,  0.        ])

In [34]:
Vt

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [35]:
user_ratings =[]
if os.path.exists('user_ratings.pkl'):
    with open('user_ratings.pkl', 'rb') as fp:
        user_ratings = pickle.load(fp)
else:
    for uid in tqdm( user_ids, total=len(user_ids) ):
        tmpdf = rdf.query(f' user_id == {uid} ')
        m_ids = tmpdf['movie_id'].tolist()
        ratings = tmpdf['rating'].tolist()
        ratings_ar =  [0] * len(movie_ids)
        for m_id, rating in zip(m_ids, ratings):
            ratings_ar[movie_indices[movie_id]] = rating
        user_ratings.append(ratings_ar)

100%|██████████| 69324/69324 [08:33<00:00, 135.00it/s]  


In [38]:
if not os.path.exists('user_ratings.pkl'):
    with open ('user_ratings.pkl', 'wb') as fout:
        pickle.dump(user_ratings, fout)

In [51]:
if not os.path.exists('movie_ids.pkl'):
    with open('movie_ids.pkl', 'wb') as fout:
        pickle.dump(movie_ids, fout)
    with open('user_ids.pkl', 'wb') as fout:
        pickle.dump(user_ids, fout)

In [40]:
! ls -lh user_ratings.pkl

-rw-r--r--  1 todd  staff   4.7G Jan  6 10:37 user_ratings.pkl


In [41]:
del user_ratings
# 20 gb mem!

In [42]:
del mdf
del rdf

In [36]:
X = np.array(user_ratings)

In [43]:
del M
del U
del Vt
del Sigma

In [44]:
# >>> import numpy as np
# >>> X = np.array([[1, 1], [2, 1], [3, 1.2], [4, 1], [5, 0.8], [6, 1]])
# >>> from sklearn.decomposition import NMF
num_components =10
model = NMF(n_components=num_components, init='random', random_state=0)
W = model.fit_transform(X)
# >>> H = model.components_

In [45]:
indices_movies = dict(zip(range(len(movie_ids)), movie_ids))

In [47]:
# 100 1375666:1.420 0482571:0.232 1457767:0.158 1130884:0.113
with open(f"nmf.weights.{num_components}.txt",  'wt') as fout:
    for idx, user_id in  tqdm(enumerate(user_ids), total=len(user_ids)):
        weights = W[idx]
        values =[]
        for mid, weight in zip(movie_ids, weights ):
            values.append(f"{mid}:{weight:3f}")
        fout.write(f"{user_id} {' '.join(values)}")

100%|██████████| 69324/69324 [00:01<00:00, 64266.34it/s]


In [48]:
! ls -lh nmf.*

-rw-r--r--  1 todd  staff   8.5M Jan  6 10:56 nmf.weights.10.txt


In [49]:
! cp nmf.weights.10.txt live-project/nnmf_recs.csv

In [None]:
# Look at the recommendations your algorithm provides.
# Write a report that describes
#     how you implemented your algorithm
#     how you trained the model
#     what you think of the result
