In [6]:
import pandas as pd
import numpy as np

In [12]:
movies = pd.read_csv('./netology-recsys-master/lecture-1/movies.csv')
tags = pd.read_csv('./netology-recsys-master/lecture-1/tags.csv')
ratings = pd.read_csv('./netology-recsys-master/lecture-1/ratings.csv')

## tags

In [13]:
tags_grpd = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x))
tags_unique = pd.DataFrame(tags['movieId'].unique(), columns=['movieId'])
tags_grpd_df = pd.merge(tags_unique, tags_grpd, on='movieId')

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tf = TfidfVectorizer()

In [34]:
tf_tags = tf.fit_transform(tags_grpd_df['tag']).todense()

In [36]:
cols = [(k, tf.vocabulary_[k]) for k in tf.vocabulary_]
cols = sorted(cols, key=lambda c: c[1])
cols = [c[0] for c in cols]

In [38]:
tags_features = pd.DataFrame(tf_tags, columns=cols)

In [47]:
movies_tags = pd.concat((tags_grpd_df, tags_features), axis=1)

## genres

In [59]:
movies['gnr_splt'] = movies.genres.str.split('|')

In [62]:
movies['gnr_space'] = movies.apply(lambda r: ' '.join(r.gnr_splt), axis=1)

In [64]:
tf = TfidfVectorizer()

In [74]:
tf_genres = tf.fit_transform(movies['gnr_space']).todense()

In [75]:
cols = [(k, tf.vocabulary_[k]) for k in tf.vocabulary_]
cols = sorted(cols, key=lambda c: c[1])
cols = [c[0] for c in cols]

In [77]:
gnr_features = pd.DataFrame(tf_genres, columns=cols)

In [81]:
movies_genres = pd.concat((movies, gnr_features), axis = 1)

In [85]:
del movies_genres['genres']
del movies_genres['gnr_splt']
del movies_genres['gnr_space']

In [86]:
del movies_tags['tag']

## merge

In [137]:
mgt = movies_genres.join(movies_tags, on='movieId', lsuffix='_genres', rsuffix='_tags')
mgt = mgt.fillna(0)
mgt.rename(columns={'movieId_genres':'movieId'}, inplace=True)

del mgt['movieId_tags']

 ## ratings

In [145]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [156]:
ratings_total = ratings.groupby('movieId')['userId'].count().reset_index(name = 'userId')
ratings_total.columns = ['movieId', 'users']

In [157]:
ratings_mean = ratings.groupby('movieId')['rating'].mean().reset_index(name = 'rating')
ratings_mean.columns = ['movieId', 'rating_mean']

In [165]:
ratings_users_count = len(ratings['userId'].unique())

In [161]:
ratings_weighted = pd.merge(ratings_total, ratings_mean, on = 'movieId')

In [166]:
ratings_weighted['rating_wgh'] = ratings_weighted['rating_mean'] * ratings_weighted['users'] / ratings_users_count

In [177]:
del ratings_weighted['users']
del ratings_weighted['rating_mean']

In [184]:
mgtr = mgt.join(ratings_weighted, on = 'movieId', lsuffix = '_mgt', rsuffix = '_rw')

In [185]:
del mgtr['movieId_rw']

mgtr.rename(columns={'movieId_mgt':'movieId'}, inplace=True)
mgtr = mgtr.fillna(np.min(mgtr['rating_wgh']))

## modeling time

In [188]:
mgtr.head()

Unnamed: 0,movieId,title,action_genres,adventure_genres,animation_genres,children_genres,comedy_genres,crime_genres,documentary_genres,drama_genres,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating_wgh
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.277869
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027049
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.246721
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.659836


In [196]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [197]:
print(ratings.shape)
print(mgtr.shape)

(100836, 4)
(9742, 1769)


In [201]:
user_tfidf = pd.merge(ratings[['userId', 'movieId']], mgtr, on='movieId')

In [202]:
user_tfidf.head()

Unnamed: 0,userId,movieId,title,action_genres,adventure_genres,animation_genres,children_genres,comedy_genres,crime_genres,documentary_genres,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating_wgh
0,1,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852
1,5,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852
2,7,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852
3,15,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852
4,17,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.618852


In [205]:
user = user_tfidf[user_tfidf.userId==228]

Unnamed: 0,userId,movieId,title,action_genres,adventure_genres,animation_genres,children_genres,comedy_genres,crime_genres,documentary_genres,...,you,younger,your,zellweger,zither,zoe,zombie,zombies,zooey,rating_wgh
645,228,50,"Usual Suspects, The (1995)",0.0,0.0,0.0,0.0,0.0,0.553854,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006557
1664,228,260,Star Wars: Episode IV - A New Hope (1977),0.432883,0.492725,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.467213
2769,228,362,"Jungle Book, The (1994)",0.0,0.548907,0.0,0.664786,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.009836
6860,228,1196,Star Wars: Episode V - The Empire Strikes Back...,0.432883,0.492725,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.040164
7932,228,1214,Alien (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.118852
12745,228,2571,"Matrix, The (1999)",0.446563,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001639
16412,228,318,"Shawshank Redemption, The (1994)",0.0,0.0,0.0,0.0,0.0,0.863943,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.035246
28174,228,65,Bio-Dome (1996),0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07377
43716,228,1385,Under Siege (1992),0.64163,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006557
44582,228,1590,Event Horizon (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.109016


In [207]:
nn_cols = user_tfidf.columns[3:]

In [208]:
from sklearn.neighbors import NearestNeighbors

In [209]:
nn = NearestNeighbors(10)

In [210]:
nn.fit(user_tfidf[nn_cols])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)