In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook, tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('ml-latest/links.csv')
movies = pd.read_csv('ml-latest/movies.csv')
ratings = pd.read_csv('ml-latest/ratings.csv')
tags = pd.read_csv('ml-latest/tags.csv')

In [3]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [4]:
tags.dropna(inplace=True)

In [5]:
tags['tag']=[x.lower() for x in tags['tag'].values]

Для каждого фильма берем 30 тегов, которые давали ему пользователи

In [6]:
films_tag={}
for i in tqdm(set(tags['movieId'])):
    table=tags[tags['movieId']==i]
    a=table['tag'].value_counts()[:30].index.tolist()
    a=' '.join(a)
    films_tag[i]=a

100%|███████████████████████████████████████████████████████████████████████████| 45981/45981 [04:24<00:00, 173.94it/s]


In [7]:
films_tag=pd.DataFrame.from_dict(films_tag, orient='index')

In [8]:
films_tag.reset_index(inplace=True)

In [9]:
movies_with_tags=movies.merge(films_tag, right_on='index', left_on='movieId')

In [10]:
movies_with_tags['tag']=movies_with_tags[0]

In [11]:
movies_with_tags.drop([0, 'index'], axis=1, inplace=True)

In [12]:
movies_with_tags.head(5)

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar animation disney tom hanks funny compute...
1,2,Jumanji (1995),Adventure|Children|Fantasy,robin williams fantasy time travel board game ...
2,3,Grumpier Old Men (1995),Comedy|Romance,jack lemmon walter matthau fishing sequel old ...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,chick flick characters girl movie divorce reve...
4,5,Father of the Bride Part II (1995),Comedy,steve martin pregnancy family comedy wedding c...


In [13]:
movie_genres = [change_string(g) for g in movies_with_tags.genres.values]
movies_with_tags['genres']=movie_genres

In [14]:
movies_with_tags['genres_tags']=movies_with_tags['genres']+' '+movies_with_tags['tag']

In [15]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag,genres_tags
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar animation disney tom hanks funny compute...,Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy,robin williams fantasy time travel board game ...,Adventure Children Fantasy robin williams fant...
2,3,Grumpier Old Men (1995),Comedy Romance,jack lemmon walter matthau fishing sequel old ...,Comedy Romance jack lemmon walter matthau fish...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,chick flick characters girl movie divorce reve...,Comedy Drama Romance chick flick characters gi...
4,5,Father of the Bride Part II (1995),Comedy,steve martin pregnancy family comedy wedding c...,Comedy steve martin pregnancy family comedy we...


для начала сделаем модель, предсказывающую среднюю оценку пользователей в зависимости от тегов и жанра

In [16]:
av_rating=ratings.groupby(['movieId']).median().reset_index()

In [17]:
movies_with_tags=movies_with_tags.merge(av_rating, on='movieId', how='left')

In [18]:
movies_with_tags.dropna(inplace=True)

In [19]:
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
from sklearn.neighbors import KNeighborsRegressor
y=movies_with_tags['rating']
X_train_counts_genres_tags= count_vect.fit_transform(movies_with_tags['genres_tags'])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts_genres_tags)
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y, test_size=0.33, random_state=42)
neigh = KNeighborsRegressor(n_neighbors=30, metric='euclidean')
neigh.fit(X_train, y_train)
y_pred=neigh.predict(X_test)

In [22]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)**(1/2)

0.7011732859625841

на тестовой выборке мы получили целевое значение метрики, но она характеризует, насколько мы хорошо предсказываем средний рейтинг фильма. То есть мы всем можем рекомендовать фильмы, которые по прогнозам будут иметь высокий средний рейтинг. Посмотрим, как хорошо работает модель для конкретных пользователей. выберем фильмы, которым пользователи ставили хорошие оценки, подберем им ближайшего соседа, подтянем фильм-ближайший сосед, посмотрим какую оценку реально ему поставил пользователь, сравним их

In [23]:
rating_top=ratings[ratings['rating']>=4]

In [24]:
y_pred_1=neigh.predict(X_test)

In [25]:
rating_top.head()

Unnamed: 0,userId,movieId,rating,timestamp
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264
7,1,2134,4.5,1256677464
8,1,2478,4.0,1256677239
11,1,3020,4.0,1256677260


In [26]:
rec={}
for i in rating_top['movieId'][:5000]:
    try:
        test=movies_with_tags[movies_with_tags['movieId']==i]['genres_tags']
        predict = count_vect.transform(test)
        X_tfidf2 = tfidf_transformer.transform(predict)
        res = neigh.kneighbors(X_tfidf2, return_distance=True)
        film=res[1].tolist()[0][res[0].tolist()[0].index(res[0].min())]
    except:
        film=0
    rec[i]=film

In [27]:
rec_pd=pd.DataFrame.from_dict(rec, orient='index')

In [28]:
rec_pd=rec_pd.reset_index()

In [29]:
rating_top=rating_top.merge(rec_pd, right_on='index', left_on='movieId', how='left')

In [30]:
rating_top['recommendations']=rating_top[0]

In [31]:
rating_top.head()

Unnamed: 0,userId,movieId,rating,timestamp,index,0,recommendations
0,1,1257,4.5,1256677460,1257.0,7582.0,7582.0
1,1,1449,4.5,1256677264,1449.0,1097.0,1097.0
2,1,2134,4.5,1256677464,2134.0,16471.0,16471.0
3,1,2478,4.0,1256677239,2478.0,19970.0,19970.0
4,1,3020,4.0,1256677260,3020.0,23661.0,23661.0


In [32]:
rating_top_rec=rating_top.merge(ratings, left_on=['userId','recommendations'], right_on=['userId', 'movieId'], how='left')

In [33]:
rating_top_rec.dropna(inplace=True)

In [34]:
mean_squared_error(rating_top_rec['rating_x'], rating_top_rec['rating_y'])**(1/2)

1.3243999666914437

метрика вышла за целевое значение, учитывая то, что в оценки участвовали и тестовые, и тренеровочные данные. 
Но в оценке учитываются случаи, когда реальная оценка по ближайшему соседу (рекомендуемого фильма) выше оценки изначального фильма. Поскольку данная ситуация положительная, уберем данные случаи

In [44]:
rating2=[]
for x in tqdm(range(len(rating_top_rec['rating_x']))):
    if rating_top_rec['rating_x'].values[x]<rating_top_rec['rating_y'].values[x]:
        rating2.append(rating_top_rec['rating_y'].values[x])
    else:
        rating2.append(rating_top_rec['rating_x'].values[x])
rating_top_rec['rating_x_2']=rating2

100%|███████████████████████████████████████████████████████████████████████| 145839/145839 [00:02<00:00, 61791.16it/s]


In [46]:
mean_squared_error(rating_top_rec['rating_x_2'], rating_top_rec['rating_y'])**(1/2)

1.2936414167431567

лучше не стало

Посмотрим, какие оценки должны были получиться, если бы  мы их предсказывали по факторам

In [49]:
rating_top_rec2=rating_top.merge(movies_with_tags, left_on=['recommendations'], right_on=['movieId'], how='left')

In [50]:
rating_top_rec2.dropna(inplace=True)

In [51]:
rating_top_rec2=rating_top_rec2[-5000:]

In [52]:
X_counts_genres_tags= count_vect.transform(rating_top_rec2['genres_tags'])
X_tfidf = tfidf_transformer.transform(X_counts_genres_tags)
y_pred=neigh.predict(X_tfidf)

In [53]:
mean_squared_error(y_pred, rating_top_rec2['rating_y'])**(1/2)

0.5722010743125571

Видим, что мы попадаем в целевую метрику (но помним, что считали метрику и на тестовой, и на тренировочной выборке). Получается, модель нормального качества, но ближайшие соседи оказались не такими близкими

Попробуем разделить людей на сегменты и включить номер сегмента в факторы.

In [54]:
ratings.dropna(inplace=True)

In [55]:

ratings_piv=pd.pivot_table(ratings[:500000], columns=['movieId'], index=['userId'],values=['rating'], aggfunc=np.mean)

In [56]:
ratings_piv=ratings_piv.fillna(0)
ratings_piv=ratings_piv.reset_index()

In [57]:
X=ratings_piv

In [58]:
from sklearn.cluster import KMeans

In [59]:
kmeans = KMeans(n_clusters=10, random_state=0).fit(X)
cluster=kmeans.predict(X)

In [60]:
ratings_cluster=ratings_piv.drop(ratings_piv.columns.tolist()[1:], axis=1)

In [61]:
ratings_cluster['cluster']=cluster

In [62]:
ratings_with_cluster=ratings.merge(ratings_cluster, left_on=['userId'], right_on=['userId'],how='left')



In [63]:
ratings_with_cluster.dropna(inplace=True)

In [64]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,tag,genres_tags,userId,rating,timestamp
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar animation disney tom hanks funny compute...,Adventure Animation Children Comedy Fantasy pi...,142225.0,4.0,1111486000.0
1,2,Jumanji (1995),Adventure Children Fantasy,robin williams fantasy time travel board game ...,Adventure Children Fantasy robin williams fant...,142878.0,3.0,1100649000.0
2,3,Grumpier Old Men (1995),Comedy Romance,jack lemmon walter matthau fishing sequel old ...,Comedy Romance jack lemmon walter matthau fish...,140423.0,3.0,866061600.0
3,4,Waiting to Exhale (1995),Comedy Drama Romance,chick flick characters girl movie divorce reve...,Comedy Drama Romance chick flick characters gi...,139959.0,3.0,865275900.0
4,5,Father of the Bride Part II (1995),Comedy,steve martin pregnancy family comedy wedding c...,Comedy steve martin pregnancy family comedy we...,141542.5,3.0,865126100.0


In [65]:
ratings_with_cluster_tags=ratings_with_cluster.merge(movies_with_tags[['movieId','genres_tags' ]], right_on=['movieId'], left_on=['movieId'])

In [66]:
ratings_with_cluster_tags['genres_tags']=[x.lower() for x in ratings_with_cluster_tags['genres_tags'].values]

In [67]:
ratings_with_cluster_tags['cluster']=[str(x) for x in ratings_with_cluster_tags[ratings_with_cluster_tags.columns.tolist()[-2]].values]

In [68]:
ratings_with_cluster_tags['genres_tags']=[x+' ' for x in ratings_with_cluster_tags['genres_tags'].values]

In [69]:
ratings_with_cluster_tags['genres_tags']=ratings_with_cluster_tags['genres_tags']+ratings_with_cluster_tags['cluster']

In [None]:
y=ratings_with_cluster_tags['rating'][:70000]
X_train_counts_genres_tags= count_vect.fit_transform(ratings_with_cluster_tags['genres_tags'][:70000])
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts_genres_tags)
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y, test_size=0.33, random_state=42)
neigh = KNeighborsRegressor(n_neighbors=20)
neigh.fit(X_train, y_train)
y_pred=neigh.predict(X_test)

In [71]:
mean_squared_error(y_test, y_pred)**(1/2)

0.9819464625013196