In [389]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm import tqdm_notebook
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

%matplotlib inline

In [390]:
links = pd.read_csv('../001-intro/links.csv')
movies = pd.read_csv('../001-intro/movies.csv')
ratings = pd.read_csv('../001-intro/ratings.csv')
tags = pd.read_csv('../001-intro/tags.csv')

### 1. Рекомендация на основе содержания

In [391]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [392]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [393]:
# группируем тэги по фильмам

In [394]:
movietags = tags[['userId', 'movieId', 'tag']]

In [395]:
movietags['tags'] = movietags.groupby(['movieId'])['tag'].transform(lambda x: ' '.join(x))

In [396]:
movietags = movietags[['movieId','tags']].drop_duplicates()

In [397]:
movietags.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1572 entries, 0 to 3680
Data columns (total 2 columns):
movieId    1572 non-null int64
tags       1572 non-null object
dtypes: int64(1), object(1)
memory usage: 36.8+ KB


In [398]:
movietags.head()

Unnamed: 0,movieId,tags
0,60756,funny Highly quotable will ferrell comedy funn...
3,89774,Boxing story MMA Tom Hardy
6,106782,drugs Leonardo DiCaprio Martin Scorsese Stock ...
9,48516,way too long Leonardo DiCaprio suspense twist ...
10,431,Al Pacino gangster mafia


In [399]:
# Объединяем тэги и жанры

In [400]:
movietags = pd.merge(movietags, movies, on='movieId', how='inner')

In [401]:
movietags['year'] = movietags['title'].str.extract(r'(\d{4})')

In [402]:
movietags.head()

Unnamed: 0,movieId,tags,title,genres,year
0,60756,funny Highly quotable will ferrell comedy funn...,Step Brothers (2008),Comedy,2008
1,89774,Boxing story MMA Tom Hardy,Warrior (2011),Drama,2011
2,106782,drugs Leonardo DiCaprio Martin Scorsese Stock ...,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,2013
3,48516,way too long Leonardo DiCaprio suspense twist ...,"Departed, The (2006)",Crime|Drama|Thriller,2006
4,431,Al Pacino gangster mafia,Carlito's Way (1993),Crime|Drama,1993


In [403]:
movietags['genres1'] = movietags['genres'].transform(lambda x: x.lower().replace('|', ' '))

In [404]:
movietags['tags1'] = movietags['tags'].transform(lambda x: x.lower())

In [405]:
movietags.head()

Unnamed: 0,movieId,tags,title,genres,year,genres1,tags1
0,60756,funny Highly quotable will ferrell comedy funn...,Step Brothers (2008),Comedy,2008,comedy,funny highly quotable will ferrell comedy funn...
1,89774,Boxing story MMA Tom Hardy,Warrior (2011),Drama,2011,drama,boxing story mma tom hardy
2,106782,drugs Leonardo DiCaprio Martin Scorsese Stock ...,"Wolf of Wall Street, The (2013)",Comedy|Crime|Drama,2013,comedy crime drama,drugs leonardo dicaprio martin scorsese stock ...
3,48516,way too long Leonardo DiCaprio suspense twist ...,"Departed, The (2006)",Crime|Drama|Thriller,2006,crime drama thriller,way too long leonardo dicaprio suspense twist ...
4,431,Al Pacino gangster mafia,Carlito's Way (1993),Crime|Drama,1993,crime drama,al pacino gangster mafia


In [406]:
# Объединяем в один список "год выхода фильма" + "жанры" + "тэги"

In [407]:
movietags['gentags'] = movietags['year']+ ' ' + movietags['genres1']+ ' ' + movietags['tags1']
movietags = movietags[['movieId', 'title', 'gentags']]

In [408]:
movietags[pd.isnull(movietags).any(axis=1)]

Unnamed: 0,movieId,title,gentags
1537,156605,Paterson,


In [409]:
movietags = movietags.dropna()

In [410]:
movietags.head()

Unnamed: 0,movieId,title,gentags
0,60756,Step Brothers (2008),2008 comedy funny highly quotable will ferrell...
1,89774,Warrior (2011),2011 drama boxing story mma tom hardy
2,106782,"Wolf of Wall Street, The (2013)",2013 comedy crime drama drugs leonardo dicapri...
3,48516,"Departed, The (2006)",2006 crime drama thriller way too long leonard...
4,431,Carlito's Way (1993),1993 crime drama al pacino gangster mafia


In [411]:
# разделяем "test" и "train"

In [412]:
movietags_test = movietags.iloc[:10]
movietags_train = movietags.iloc[10:]

In [413]:
# обучаем NearestNeighbors

In [414]:
movietags_train.head()

Unnamed: 0,movieId,title,gentags
10,144210,Just Eat It: A Food Waste Story (2014),2014 documentary dumpster diving sustainability
11,1569,My Best Friend's Wedding (1997),1997 comedy romance romantic comedy wedding we...
12,118985,Big Eyes (2014),2014 drama painter
13,119141,The Interview (2014),2014 action comedy bloody bromance comedy funn...
14,109487,Interstellar (2014),2014 sci-fi imax black hole sci-fi time-travel...


In [415]:
vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words='english',norm = None)

In [416]:
gentags_on_movie = movietags_train.gentags.values

In [417]:
X = vectorizer.fit_transform(gentags_on_movie)

In [418]:
X_vovab = vectorizer.get_feature_names()

In [419]:
len(X_vovab)

1763

In [420]:
X_vovab[:10]

['06',
 '1000',
 '1900s',
 '1920s',
 '1921',
 '1922',
 '1923',
 '1924',
 '1925',
 '1926']

In [421]:
X_mat = X.todense()

In [422]:
X_mat[:10]

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [423]:
nb = NearestNeighbors(n_neighbors=7, n_jobs=1, metric='euclidean')

In [424]:
nb.fit(X_mat)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=7, p=2, radius=1.0)

In [425]:
# Находим рекомендации для тестовой выборки

In [426]:
testItems = movietags_test.gentags.tolist()

In [427]:
X_test = vectorizer.transform(testItems)

In [428]:
X_mat_test = X_test.todense()

In [429]:
X_mat_test

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [430]:
res = nb.kneighbors(X_mat_test, return_distance=True)

In [431]:
res

(array([[23.5745505 , 23.76621187, 25.56295558, 26.78708176, 26.85517618,
         27.44570073, 27.7204556 ],
        [13.55890217, 13.59045392, 13.63780279, 14.06294535, 14.09322278,
         14.10566763, 14.19772015],
        [19.88213299, 19.94450367, 19.95888083, 20.05036709, 20.09733294,
         20.4024628 , 20.45879485],
        [27.54101402, 27.66385238, 27.99924199, 28.38390042, 28.39393412,
         28.43764247, 28.48735088],
        [12.03126642, 13.79515201, 13.93623076, 14.04658462, 14.56745168,
         14.60360745, 14.65550035],
        [13.62302696, 13.76586983, 13.87757846, 14.40455807, 14.44112167,
         14.49359635, 14.52500492],
        [12.65351614, 12.86524998, 12.96213288, 13.34117369, 13.34690312,
         14.32974936, 15.27012786],
        [ 7.74891999, 10.92111421, 11.18806506, 11.29853152, 11.48437234,
         11.91185395, 12.00016638],
        [10.60050365, 12.67039972, 13.32282689, 13.59962354, 13.62889385,
         14.08427564, 14.12686264],
        [ 

In [432]:
movietags_test.iloc[4]

movieId                                          431
title                           Carlito's Way (1993)
gentags    1993 crime drama al pacino gangster mafia
Name: 4, dtype: object

In [433]:
movietags_train.iloc[res[1][4]]

Unnamed: 0,movieId,title,gentags
229,4262,Scarface (1983),1983 action crime drama al pacino
262,16,Casino (1995),1995 crime drama mafia
605,1466,Donnie Brasco (1997),1997 crime drama mafia
517,1213,Goodfellas (1990),1990 crime drama mafia
718,2247,Married to the Mob (1988),1988 comedy mafia
720,2249,My Blue Heaven (1990),1990 comedy mafia
404,858,"Godfather, The (1972)",1972 crime drama mafia


In [434]:
len(movietags_test)

10

In [435]:
# Выводим результат: По каждому фильму из тестовой выборке выводим 7 рекомендаций

In [436]:
for i in range(len(movietags_test)):
    print('Test movie:')
    print('movieId: ', movietags_test.iloc[i].movieId)
    print('title: ', movietags_test.iloc[i].title)
    print('gentags: ', movietags_test.iloc[i].gentags)    
    print('7nn:')
    print('\tmovieId\ttitle')
    for k in range(len(res[1][i])):
        print('\t'+str(movietags_train.iloc[res[1][i][k]].movieId)+'\t'+movietags_train.iloc[res[1][i][k]].title)
        print('\t  gentags:'+movietags_train.iloc[res[1][i][k]].gentags)
    print('-----------------------------------------------------------------------------')
        
    

Test movie:
movieId:  60756
title:  Step Brothers (2008)
gentags:  2008 comedy funny highly quotable will ferrell comedy funny will ferrell funny will ferrell
7nn:
	movieId	title
	6188	Old School (2003)
	  gentags:2003 comedy comedy will ferrell
	107348	Anchorman 2: The Legend Continues (2013)
	  gentags:2013 comedy comedy steve carell stupid but funny will ferrell
	8641	Anchorman: The Legend of Ron Burgundy (2004)
	  gentags:2004 comedy hilarious steve carell will ferrell stupid awesome comedy will ferrell
	126548	The DUFF (2015)
	  gentags:2015 comedy funny high school
	167746	The Lego Batman Movie (2017)
	  gentags:2017 action animation comedy funny heartwarming
	148626	Big Short, The (2015)
	  gentags:2015 drama funny interesting witty
	106766	Inside Llewyn Davis (2013)
	  gentags:2013 drama atmospheric cinematography depressing funny
-----------------------------------------------------------------------------
Test movie:
movieId:  89774
title:  Warrior (2011)
gentags:  2011 drama

### 2. Предсказываем оценку

In [437]:
gentags_on_movie = movietags.gentags.values

In [438]:
X = vectorizer.fit_transform(gentags_on_movie)

In [439]:
X_vovab = vectorizer.get_feature_names()

In [440]:
len(X_vovab)

1775

In [441]:
X_mat = X.todense()

In [442]:
df = pd.concat([movietags, pd.DataFrame(X_mat)], axis=1)

In [443]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1572 entries, 0 to 1571
Columns: 1778 entries, movieId to 1774
dtypes: float64(1776), object(2)
memory usage: 21.3+ MB


In [444]:
df.head()

Unnamed: 0,movieId,title,gentags,0,1,2,3,4,5,6,...,1765,1766,1767,1768,1769,1770,1771,1772,1773,1774
0,60756.0,Step Brothers (2008),2008 comedy funny highly quotable will ferrell...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,89774.0,Warrior (2011),2011 drama boxing story mma tom hardy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,106782.0,"Wolf of Wall Street, The (2013)",2013 comedy crime drama drugs leonardo dicapri...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48516.0,"Departed, The (2006)",2006 crime drama thriller way too long leonard...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,431.0,Carlito's Way (1993),1993 crime drama al pacino gangster mafia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [445]:
X_vovab[:10]

['06',
 '1000',
 '1900s',
 '1920s',
 '1921',
 '1922',
 '1923',
 '1924',
 '1925',
 '1926']

In [446]:
cols = np.concatenate((np.array(['movieId', 'title', 'gentags']), np.array(X_vovab)))

In [447]:
cols

array(['movieId', 'title', 'gentags', ..., 'zombie', 'zombies', 'zooey'],
      dtype='<U16')

In [448]:
df.columns = cols

In [449]:
df.head()

Unnamed: 0,movieId,title,gentags,06,1000,1900s,1920s,1921,1922,1923,...,wrongful,wry,york,younger,zellweger,zither,zoe,zombie,zombies,zooey
0,60756.0,Step Brothers (2008),2008 comedy funny highly quotable will ferrell...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,89774.0,Warrior (2011),2011 drama boxing story mma tom hardy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,106782.0,"Wolf of Wall Street, The (2013)",2013 comedy crime drama drugs leonardo dicapri...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,48516.0,"Departed, The (2006)",2006 crime drama thriller way too long leonard...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,431.0,Carlito's Way (1993),1993 crime drama al pacino gangster mafia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [450]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [451]:
ratings1 = ratings.groupby('movieId').agg({'rating':'mean', 'userId':'count'})
ratings1 = ratings1.rename(columns = {'userId':'votes'})

In [452]:
ratings1.head()

Unnamed: 0_level_0,rating,votes
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.92093,215
2,3.431818,110
3,3.259615,52
4,2.357143,7
5,3.071429,49


In [453]:
df1 = pd.merge(ratings1, df, on='movieId', how='inner')

In [454]:
df1.head()

Unnamed: 0,movieId,rating,votes,title,gentags,06,1000,1900s,1920s,1921,...,wrongful,wry,york,younger,zellweger,zither,zoe,zombie,zombies,zooey
0,1,3.92093,215,Toy Story (1995),1995 adventure animation children comedy fanta...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,3.431818,110,Jumanji (1995),1995 adventure children fantasy fantasy magic ...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3.259615,52,Grumpier Old Men (1995),1995 comedy romance moldy old,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,3.071429,49,Father of the Bride Part II (1995),1995 comedy pregnancy remake,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,3.185185,54,Sabrina (1995),1995 comedy romance remake,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [455]:
df2 = df1.drop(['votes', 'title', 'gentags'], axis=1)

In [456]:
df2.head()

Unnamed: 0,movieId,rating,06,1000,1900s,1920s,1921,1922,1923,1924,...,wrongful,wry,york,younger,zellweger,zither,zoe,zombie,zombies,zooey
0,1,3.92093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,3.431818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,3.259615,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,3.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,3.185185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [457]:
df2[pd.isnull(df2).any(axis=1)]

Unnamed: 0,movieId,rating,06,1000,1900s,1920s,1921,1922,1923,1924,...,wrongful,wry,york,younger,zellweger,zither,zoe,zombie,zombies,zooey
687,3265,4.0,,,,,,,,,...,,,,,,,,,,


In [458]:
df2 = df2.dropna()

In [459]:
data_train, data_test, y_train, y_test = train_test_split(df2.drop(['movieId', 'rating'], axis=1), df2[['rating']], test_size=0.3, random_state=42)

In [460]:
n_neighbors = 7
knn = KNeighborsRegressor( n_neighbors, weights = 'distance' )
y_pred = knn.fit( data_train, y_train ).predict( data_test )

In [461]:
print('RMSE: ', mean_squared_error(y_test, y_pred))

RMSE:  0.27405929390227896
