## Загружаем данные

In [1]:
import pandas as pd

In [2]:
import numpy as np

In [5]:
df_movies = pd.read_csv("./netology-recsys-master/lecture-1/movies.csv")
df_ratings = pd.read_csv("./netology-recsys-master/lecture-1/ratings.csv")

In [6]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## Строим рекомендацию на основе KNN

In [8]:
from tqdm import tqdm

In [9]:
tqdm.pandas()

  from pandas import Panel


In [10]:
df = df_movies

In [11]:
df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [12]:
df['genres_splitted'] = df.genres.str.split('|')

In [13]:
df['genres_spaced'] = df.progress_apply(lambda r: ' '.join(r['genres_splitted']), axis=1)

100%|██████████| 9742/9742 [00:00<00:00, 59220.51it/s]


In [14]:
df.head()

Unnamed: 0,movieId,title,genres,genres_splitted,genres_spaced
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Comedy


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tfidf = TfidfVectorizer()

In [17]:
features = tfidf.fit_transform(df['genres_spaced'])

In [18]:
columns = [(k, tfidf.vocabulary_[k]) for k in tfidf.vocabulary_]

In [19]:
columns = sorted(columns, key=lambda c: c[1])

In [20]:
columns = [c[0] for c in columns]

In [21]:
features = features.todense()

In [22]:
df_features = pd.DataFrame(features, columns=columns)

In [23]:
df_result = pd.concat((df, df_features), axis=1)

In [24]:
df_result.columns

Index(['movieId', 'title', 'genres', 'genres_splitted', 'genres_spaced',
       'action', 'adventure', 'animation', 'children', 'comedy', 'crime',
       'documentary', 'drama', 'fantasy', 'fi', 'film', 'genres', 'horror',
       'imax', 'listed', 'musical', 'mystery', 'no', 'noir', 'romance', 'sci',
       'thriller', 'war', 'western'],
      dtype='object')

In [25]:
df_result

Unnamed: 0,movieId,title,genres,genres_splitted,genres_spaced,action,adventure,animation,children,comedy,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure Animation Children Comedy Fantasy,0.000000,0.416846,0.516225,0.504845,0.267586,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Adventure Children Fantasy,0.000000,0.512361,0.000000,0.620525,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy Romance,0.000000,0.000000,0.000000,0.000000,0.570915,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Comedy Drama Romance,0.000000,0.000000,0.000000,0.000000,0.505015,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Comedy,0.000000,0.000000,0.000000,0.000000,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[Action, Animation, Comedy, Fantasy]",Action Animation Comedy Fantasy,0.436010,0.000000,0.614603,0.000000,0.318581,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[Animation, Comedy, Fantasy]",Animation Comedy Fantasy,0.000000,0.000000,0.682937,0.000000,0.354002,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),Drama,[Drama],Drama,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[Action, Animation]",Action Animation,0.578606,0.000000,0.815607,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [26]:
df_result = df_result[[
       'movieId', 'title', 'action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'genres', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western'   
]]

In [27]:
df_result.head()

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
len(df_result)

9742

In [29]:
from sklearn.neighbors import NearestNeighbors

In [30]:
nn = NearestNeighbors(n_neighbors=50)

In [31]:
nn.fit(df_result[['action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']])

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=50, p=2,
                 radius=1.0)

In [32]:
df_result

Unnamed: 0,movieId,title,action,adventure,animation,children,comedy,crime,documentary,drama,...,listed,musical,mystery,no,noir,romance,sci,thriller,war,western
0,1,Toy Story (1995),0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,193583,No Game No Life: Zero (2017),0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,193585,Flint (2017),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,193587,Bungo Stray Dogs: Dead Apple (2018),0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [33]:
df_joined = df_ratings.merge(df_result, on='movieId')

In [35]:
df_joined.shape

(100836, 30)

In [36]:
df_ratings.shape

(100836, 4)

In [37]:
user_id = 320

In [42]:
nn.kneighbors(mean_vector.reshape(1, -1))

NameError: name 'mean_vector' is not defined

In [43]:
def get_last_seven_films(user_id):
    user_films = df_joined[df_joined['userId'] == user_id]
    user_films = user_films.sort_values('timestamp', ascending=False)
    last_7 = user_films.head(7)
    
    film_names = last_7['title'].values
    film_vectors = last_7[['action', 'adventure', 'animation',
       'children', 'comedy', 'crime', 'documentary', 'drama', 'fantasy', 'fi',
       'film', 'horror', 'imax', 'listed', 'musical', 'mystery',
       'no', 'noir', 'romance', 'sci', 'thriller', 'war', 'western']].values
    
    return film_names, film_vectors

In [44]:
def get_user_recomendation_by_knn(intereseted_films):
    _, films = nn.kneighbors(mean_vector.reshape(1, -1))
    return df_movies.iloc[films[0]]['movieId'].values
    

In [45]:
names, vectors = get_last_seven_films(user_id)

In [46]:
get_user_recomendation_by_knn(np.mean(vectors, axis=0))

NameError: name 'mean_vector' is not defined

## Делаем SVD

In [145]:
import surprise as s

In [146]:
df_for_surprise = df_ratings[['userId', 'movieId', 'rating']]

In [147]:
df_for_surprise.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [148]:
reader = s.reader.Reader(rating_scale=(0.5, 5))

In [149]:
dataset = s.dataset.Dataset.load_from_df(df_for_surprise, reader)

In [153]:
dataset, _ = s.model_selection.train_test_split(dataset, test_size=0.01)

In [154]:
algorithm = s.SVD()

In [155]:
algorithm.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a346ff588>

## Итоговая рекомендация

In [165]:
def recomend(user_id):
    names, vectors = get_last_seven_films(user_id)
    knn_recomendations = get_user_recomendation_by_knn(np.mean(vectors, axis=0))
    knn_recomendations = pd.DataFrame(knn_recomendations, columns=['movieId'])
    knn_recomendations['Score'] = knn_recomendations.apply(lambda r: algorithm.predict(user_id, r['movieId']).est, axis=1)
    knn_recomendations = knn_recomendations.sort_values('Score', ascending=False)
    knn_recomendations = knn_recomendations.merge(df_movies, on='movieId')[['movieId', 'title', 'Score']]
    
    print("Last 7 films was: ")
    print(names)
    
    print("Recomendations")
    print(knn_recomendations)

In [166]:
recomend(user_id)

Last 7 films was: 
['Avatar (2009)' 'Gladiator (2000)' 'Star Trek (2009)' 'Iron Man (2008)'
 'Cowboy Bebop: The Movie (Cowboy Bebop: Tengoku no Tobira) (2001)'
 'Appleseed (Appurushîdo) (2004)' 'District 13 (Banlieue 13) (2004)']
Recomendations
    movieId                                              title     Score
0       260          Star Wars: Episode IV - A New Hope (1977)  4.057997
1    122918                   Guardians of the Galaxy 2 (2017)  4.013018
2      1196  Star Wars: Episode V - The Empire Strikes Back...  3.902829
3     59315                                    Iron Man (2008)  3.891793
4     34405                                    Serenity (2005)  3.885210
5      1210  Star Wars: Episode VI - Return of the Jedi (1983)  3.873675
6    122912             Avengers: Infinity War - Part I (2018)  3.823201
7    122916                              Thor: Ragnarok (2017)  3.819670
8    111362                  X-Men: Days of Future Past (2014)  3.815303
9    122906              