In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the movie catalogue and preview the first rows.
# NOTE(review): absolute local Windows path — the notebook only runs where this folder exists.
movies = pd.read_csv(filepath_or_buffer='C:\\jupyter\\recommended system\\ml-latest-small\\movies.csv')
movies.head(n=3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [3]:
# Load the user ratings and preview the first rows.
# NOTE(review): absolute local Windows path — the notebook only runs where this folder exists.
ratings = pd.read_csv(filepath_or_buffer='C:\\jupyter\\recommended system\\ml-latest-small\\ratings.csv')
ratings.head(n=3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [4]:
# Load the user-supplied tags and preview the first rows.
# NOTE(review): absolute local Windows path — the notebook only runs where this folder exists.
tags = pd.read_csv(filepath_or_buffer='C:\\jupyter\\recommended system\\ml-latest-small\\tags.csv')
tags.head(n=3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


#### TF-IDF на тегах и жанрах

#### Задача: найти похожие фильмы по жанрам и тегам, используя TF-IDF-представление их описаний

In [5]:
# Group the tags: concatenate all tags of each movie into one space-separated string.
# Only the 'tag' column is aggregated — ' '.join cannot be applied to the numeric
# userId/timestamp columns, and recent pandas raises a TypeError for them instead of
# silently dropping those columns.
tags_grouped = tags.groupby('movieId')[['tag']].agg(' '.join)
tags_grouped.head(4)

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar pixar fun
2,fantasy magic board game Robin Williams game
3,moldy old
5,pregnancy remake


**Поскольку теги проставлены не для всех фильмов, модель получится построить только на тех фильмах, у которых они есть.**

In [6]:
# Attach title and genres to every tagged movie (inner join on movieId,
# so movies without tags are left out).
tags_genres_title = pd.merge(tags_grouped, movies, on='movieId')
tags_genres_title.head(3)

Unnamed: 0,movieId,tag,title,genres
0,1,pixar pixar fun,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure|Children|Fantasy
2,3,moldy old,Grumpier Old Men (1995),Comedy|Romance


In [7]:
def change_separator(text):
    """Return ``text`` with every ``|`` separator replaced by a single space."""
    return ' '.join(text.split('|'))

In [8]:
# Replace the '|' genre separator with spaces so that each genre becomes its own token.
tags_genres_title['genres'] = tags_genres_title['genres'].map(change_separator)
tags_genres_title.head(4)

Unnamed: 0,movieId,tag,title,genres
0,1,pixar pixar fun,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure Children Fantasy
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy


In [9]:
# Combine tags and genres into a single lower-cased text column.
tag_text = tags_genres_title['tag'].str.lower()
genre_text = tags_genres_title['genres'].str.lower()
tags_genres_title['tag_genres'] = tag_text.str.cat(genre_text, sep=' ')
tags_genres_title.head(4)

Unnamed: 0,movieId,tag,title,genres,tag_genres
0,1,pixar pixar fun,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun adventure animation children c...
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game robin williams game a...
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance,moldy old comedy romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy,pregnancy remake comedy


In [10]:
# Corpus for vectorization: one "tags + genres" string per movie.
# (Despite the variable name, the texts contain tags and genres, not titles.)
list_of_title_and_genres = list(tags_genres_title['tag_genres'])
list_of_title_and_genres[:5]

['pixar pixar fun adventure animation children comedy fantasy',
 'fantasy magic board game robin williams game adventure children fantasy',
 'moldy old comedy romance',
 'pregnancy remake comedy',
 'remake comedy romance']

In [11]:
# Vectorize: learn the vocabulary of the corpus and build the
# document-term count matrix (one row per movie text).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(list_of_title_and_genres)

In [12]:
# Turn the raw term counts into tf-idf weights.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit(X_train_counts).transform(X_train_counts)

In [50]:
# Inspect the tf-idf row of the first movie text (a sparse 1 x vocabulary-size row).
X_train_tfidf[0]

<1x1748 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [13]:
# Build the model: 10-nearest-neighbour index over the tf-idf vectors, using Manhattan distance.
# NOTE(review): cosine distance is the more common choice for tf-idf vectors — worth comparing.
neigh = NearestNeighbors(metric='manhattan', n_neighbors=10, n_jobs=-1).fit(X_train_tfidf)
neigh

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [14]:
# Try the model on a sample query
request_to_find = 'pixar pixar fun'

# Vectorize the query with the vocabulary and idf weights fitted on the corpus
predict = count_vect.transform([request_to_find])
X_tfidf2 = tfidf_transformer.transform(predict)

# With return_distance=True the result is a (distances, indices) pair
res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [15]:
# Results: res[0] holds the distances from the query vector(s),
# res[1] holds the row numbers in the tags_genres_title dataframe
res 

(array([[1.11433492, 1.96770388, 2.33236563, 2.33236563, 2.33236563,
         2.51302779, 2.51302779, 2.51302779, 2.51302779, 2.51302779]]),
 array([[   0,  544, 1480,  105,  673,  716, 1100,  755,  780,  272]],
       dtype=int64))

In [16]:
# Display the merged movie table; the row positions correspond to the indices returned in res[1].
tags_genres_title

Unnamed: 0,movieId,tag,title,genres,tag_genres
0,1,pixar pixar fun,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun adventure animation children c...
1,2,fantasy magic board game Robin Williams game,Jumanji (1995),Adventure Children Fantasy,fantasy magic board game robin williams game a...
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance,moldy old comedy romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy,pregnancy remake comedy
4,7,remake,Sabrina (1995),Comedy Romance,remake comedy romance
...,...,...,...,...,...
1567,183611,Comedy funny Rachel McAdams,Game Night (2018),Action Comedy Crime Horror,comedy funny rachel mcadams action comedy crim...
1568,184471,adventure Alicia Vikander video game adaptation,Tomb Raider (2018),Action Adventure Fantasy,adventure alicia vikander video game adaptatio...
1569,187593,Josh Brolin Ryan Reynolds sarcasm,Deadpool 2 (2018),Action Comedy Sci-Fi,josh brolin ryan reynolds sarcasm action comed...
1570,187595,Emilia Clarke star wars,Solo: A Star Wars Story (2018),Action Adventure Children Sci-Fi,emilia clarke star wars action adventure child...


In [17]:
print(f'наиболее похожие фильмы по запросу: {request_to_find} \n')
# res[1][0] holds the row positions (in tags_genres_title) of the nearest neighbours of the query.
for i in res[1][0]:
    # Select the title by column name: integer-position lookup on a label-indexed row
    # is deprecated in pandas and silently breaks if the column order changes.
    print(tags_genres_title.iloc[i]['title'])

наиболее похожие фильмы по запросу: pixar pixar fun 

Toy Story (1995)
Bug's Life, A (1998)
Invincible Iron Man, The (2007)
Airheads (1994)
Magnolia (1999)
Hustler, The (1961)
Passion of Joan of Arc, The (Passion de Jeanne d'Arc, La) (1928)
Steel Magnolias (1989)
Friendly Persuasion (1956)
On Golden Pond (1981)


### По оценкам пользователя

In [18]:
# Number of distinct values in each column of the ratings table.
ratings.nunique()

userId         610
movieId       9724
rating          10
timestamp    85043
dtype: int64

**Для каждого пользователя считаем его средний рейтинг по всем фильмам.**

In [19]:
# Mean rating given by each user, indexed by userId.
user_rating = (
    ratings
    .groupby('userId')[['rating']]
    .mean()
    .rename(columns={'rating': 'avg user rating'})
)
user_rating

Unnamed: 0_level_0,avg user rating
userId,Unnamed: 1_level_1
1,4.366379
2,3.948276
3,2.435897
4,3.555556
5,3.636364
...,...
606,3.657399
607,3.786096
608,3.134176
609,3.270270


**Для каждого фильма считаем средний рейтинг по всем пользователям**

In [20]:
# Mean rating received by each movie, indexed by movieId.
movie_rating = (
    ratings
    .groupby('movieId')[['rating']]
    .mean()
    .rename(columns={'rating': 'avg movie rating'})
)
movie_rating

Unnamed: 0_level_0,avg movie rating
movieId,Unnamed: 1_level_1
1,3.920930
2,3.431818
3,3.259615
4,2.357143
5,3.071429
...,...
193581,4.000000
193583,3.500000
193585,3.500000
193587,3.500000


In [21]:
# Add the per-user and per-movie average ratings to every individual rating row.
df_with_avg_rating = pd.merge(
    pd.merge(ratings, user_rating, how='outer', on='userId'),
    movie_rating,
    how='outer',
    on='movieId',
)
df_with_avg_rating

Unnamed: 0,userId,movieId,rating,timestamp,avg user rating,avg movie rating
0,1,1,4.0,964982703,4.366379,3.92093
1,5,1,4.0,847434962,3.636364,3.92093
2,7,1,4.5,1106635946,3.230263,3.92093
3,15,1,2.5,1510577970,3.448148,3.92093
4,17,1,4.5,1305696483,4.209524,3.92093
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,3.688556,2.50000
100832,610,160527,4.5,1479544998,3.688556,4.50000
100833,610,160836,3.0,1493844794,3.688556,3.00000
100834,610,163937,3.5,1493848789,3.688556,3.50000


In [22]:
# Keep only ratings of movies that have tags (inner join) and drop the columns
# that are not needed for modelling.
df = (
    df_with_avg_rating
    .merge(tags_genres_title, how='inner', on='movieId')
    .drop(columns=['timestamp', 'tag', 'genres'])
)
df

Unnamed: 0,userId,movieId,rating,avg user rating,avg movie rating,title,tag_genres
0,1,1,4.0,4.366379,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
1,5,1,4.0,3.636364,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
2,7,1,4.5,3.230263,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
3,15,1,2.5,3.448148,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
4,17,1,4.5,4.209524,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
...,...,...,...,...,...,...,...
48282,567,176419,3.0,2.245455,3.25000,Mother! (2017),allegorical uncomfortable unsettling drama hor...
48283,599,176419,3.5,2.642050,3.25000,Mother! (2017),allegorical uncomfortable unsettling drama hor...
48284,594,7023,4.5,3.924569,4.50000,"Wedding Banquet, The (Xi yan) (1993)",in netflix queue comedy drama romance
48285,606,6107,4.0,3.657399,4.00000,Night of the Shooting Stars (Notte di San Lore...,world war ii drama war


In [23]:
# Number of distinct values in each column of the merged modelling table.
df.nunique()

userId               610
movieId             1554
rating                10
avg user rating      549
avg movie rating     752
title               1554
tag_genres          1387
dtype: int64

In [139]:
# Find the users with the largest number of ratings (top three).
(
    df.groupby('userId')
    .count()
    .sort_values(by='rating', ascending=False)
    .head(3)
)

Unnamed: 0_level_0,movieId,rating,avg user rating,avg movie rating,title,tag_genres
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
474,1198,1198,1198,1198,1198,1198
414,827,827,827,827,827,827
599,685,685,685,685,685,685


In [140]:
# Ratings of user 474 (the most active user found above); the old row labels are kept
# in an 'index' column and the frame is re-labelled 0..n-1.
df_oneuser = df.loc[df['userId'] == 474].reset_index()
df_oneuser.head(3)

Unnamed: 0,index,userId,movieId,rating,avg user rating,avg movie rating,title,tag_genres
0,164,474,1,4.0,3.398956,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...
1,432,474,47,4.0,3.398956,3.975369,Seven (a.k.a. Se7en) (1995),mystery twist ending serial killer mystery thr...
2,635,474,50,4.0,3.398956,4.237745,"Usual Suspects, The (1995)",mindfuck suspense thriller tricky twist ending...


In [141]:
%%time
#создаем список того что будем использовать для метрики
list_of_title_and_genres_oneuser = df_oneuser['tag_genres'].tolist()
# векторизируем
count_vect = CountVectorizer()
X_train_counts_oneuser = count_vect.fit_transform(list_of_title_and_genres_oneuser)
#считаем tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf_oneuser = tfidf_transformer.fit_transform(X_train_counts_oneuser)

Wall time: 24.5 ms


In [147]:
# Build one table: the user's rating rows side by side with the dense tf-idf columns
# (aligned on the 0..n-1 row index of df_oneuser).
tags_genres_sparse = pd.DataFrame(data=X_train_tfidf_oneuser.toarray())
df_oneuser_tgs = df_oneuser.merge(
    tags_genres_sparse, how='left', left_index=True, right_index=True
)
df_oneuser_tgs.head(3)

Unnamed: 0,index,userId,movieId,rating,avg user rating,avg movie rating,title,tag_genres,0,1,...,1290,1291,1292,1293,1294,1295,1296,1297,1298,1299
0,164,474,1,4.0,3.398956,3.92093,Toy Story (1995),pixar pixar fun adventure animation children c...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,432,474,47,4.0,3.398956,3.975369,Seven (a.k.a. Se7en) (1995),mystery twist ending serial killer mystery thr...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,635,474,50,4.0,3.398956,4.237745,"Usual Suspects, The (1995)",mindfuck suspense thriller tricky twist ending...,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [148]:
from sklearn.linear_model import LinearRegression

In [163]:
# Linear regression with default settings, used to predict this user's rating.
reg = LinearRegression()

In [164]:
# Fit the rating model for the selected user.
# Features, in order: 'avg user rating', 'avg movie rating', then the tf-idf columns;
# identifier/text columns and the target itself are dropped.
# A plain ndarray is passed because the frame mixes string column names with the integer
# tf-idf column labels, which recent scikit-learn versions reject with a TypeError.
reg.fit(
    df_oneuser_tgs.drop(['index', 'userId', 'movieId', 'rating', 'title', 'tag_genres'], axis = 1).to_numpy(),
    df_oneuser_tgs['rating'],
)

LinearRegression()

In [215]:
# Try the model on a new description.
# Text whose rating we want to predict
text_to_predict_rating_ttp = 'horrible movie thriller'
# Build its tf-idf vector with the vectorizer/transformer fitted on this user's movies
predict_ttp = count_vect.transform([text_to_predict_rating_ttp])
tfidf_ttp = tfidf_transformer.transform(predict_ttp)
# Assemble the same features, in the same order, as were used for training
avg_user_rating = df_oneuser['avg user rating'][0]
# NOTE(review): this takes the average rating of the first movie in df_oneuser as a
# stand-in for the (unknown) average rating of the described movie — confirm this is intended.
avg_movie_rating = df_oneuser['avg movie rating'][0]
data = {'avg_user_rating':avg_user_rating, 'avg_movie_rating':avg_movie_rating}
# Use the `data` dict built above (the name `d` is not defined anywhere in the notebook).
ttp_variable = pd.DataFrame(data = data, index = [0]).join(pd.DataFrame(tfidf_ttp.toarray()))

# Get the prediction. An ndarray is passed so that the differing column names of this
# frame and of the training frame cannot trip scikit-learn's feature-name validation;
# the column order matches the training features.
predicted_rating = reg.predict(ttp_variable.to_numpy())
print(f'пользователь userId = 474 \nпоставит фильму с описанием {text_to_predict_rating_ttp} \nвероятнее всего рейтинг =  {predicted_rating}')

пользователь userId = 474 
поставит фильму с описанием horrible movie thriller 
вероятнее всего рейтинг =  [3.2109375]
