In [2]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [3]:
movies = pd.read_csv('movies.dat', sep='::', header=None, names=['movieId', 'title', 'genres'])
ratings = pd.read_csv('ratings.dat', sep='::', header=None, names=['userId', 'movieId', 'rating', 'timestamp'])
users = pd.read_csv('users.dat', sep='::', header=None, names=['userId', 'gender', 'age', 'occupation', 'zipCode'])

  """Entry point for launching an IPython kernel.
  
  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [5]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968


In [39]:
#количество уникальных пользователей
len(ratings['userId'].unique())

6040

In [40]:
#количество уникальных фильмов
len(movies['movieId'].unique())

3883

так как пользователей больше, чем фильмов, то user-based подход отработает хуже. значит надо использовать Item-Based метод. Тем не менее попробуем и тот и другой методы.

In [6]:
users.head(3)

Unnamed: 0,userId,gender,age,occupation,zipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117


Age:
    *  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"
Occupation:   
    *  0:  "other" or not specified
	*  1:  "academic/educator"
	*  2:  "artist"
	*  3:  "clerical/admin"
	*  4:  "college/grad student"
	*  5:  "customer service"
	*  6:  "doctor/health care"
	*  7:  "executive/managerial"
	*  8:  "farmer"
	*  9:  "homemaker"
	* 10:  "K-12 student"
	* 11:  "lawyer"
	* 12:  "programmer"
	* 13:  "retired"
	* 14:  "sales/marketing"
	* 15:  "scientist"
	* 16:  "self-employed"
	* 17:  "technician/engineer"
	* 18:  "tradesman/craftsman"
	* 19:  "unemployed"
	* 20:  "writer"


In [29]:
from surprise import KNNWithMeans, KNNBasic
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from surprise import SlopeOne

In [9]:
#так как у нас User-based модель, то нужны только userId, Title, rating
with_ratings =  pd.merge(movies, ratings)

In [10]:
with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Animation|Children's|Comedy,1,5,978824268
1,1,Toy Story (1995),Animation|Children's|Comedy,6,4,978237008
2,1,Toy Story (1995),Animation|Children's|Comedy,8,4,978233496
3,1,Toy Story (1995),Animation|Children's|Comedy,9,5,978225952
4,1,Toy Story (1995),Animation|Children's|Comedy,10,5,978226474


In [12]:
df = with_ratings[['userId', 'title', 'rating']]
df.head()

Unnamed: 0,userId,title,rating
0,1,Toy Story (1995),5
1,6,Toy Story (1995),4
2,8,Toy Story (1995),4
3,9,Toy Story (1995),5
4,10,Toy Story (1995),5


In [14]:
min_score = df['rating'].min()
max_score = df['rating'].max()

In [15]:
print('Минимальная оценка: ', min_score, ' , Максимальная оценка: ', max_score)

Минимальная оценка:  1  , Максимальная оценка:  5


In [18]:
df['rating'].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64

In [19]:
reader = Reader(rating_scale=(min_score, max_score))
data = Dataset.load_from_df(df, reader)

In [22]:
train_set, test_set = train_test_split(data, test_size=.3)

Сначала применим более плохой для нашей ситуации подход user-based. в результате RMSE=0,89

In [23]:
knn = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
knn.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1652019be88>

In [24]:
test_pred = knn.test(test_set)

In [26]:
accuracy.rmse(test_pred, verbose=True)

RMSE: 0.8944


0.8943918123871125

In [32]:
slope = SlopeOne()
slope.fit(train_set)

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x1652f7e3648>

In [34]:
test_pred_sl = slope.test(test_set)

In [35]:
accuracy.rmse(test_pred_sl, verbose=True)

RMSE: 0.9081


0.9080663190077798

Теперь применим более подходящий подход item-based, который должен выдать более подходящий результат. Действително, видим, что результат по RMSE значительно лучше = 0,86, чем при user-based (0,89)

In [41]:
knn_ib = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': False})
knn_ib.fit(train_set)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1653ed9ed48>

In [42]:
test_pred_ib = knn_ib.test(test_set)

In [43]:
accuracy.rmse(test_pred_ib, verbose=True)

RMSE: 0.8663


0.8662964818588397