In [1]:
from os import path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'svg' 
%config InlineBackend.figure_format = 'retina' 

data_dir = "/Users/ur001/Documents/Datasets/ml-latest-small"

In [2]:
import itertools

def read_csv(filename: str):
    data = pd.read_csv(path.join(data_dir, filename + ".csv"))
    return data

def prepare_genres(genres):
    genres = set(genres.lower().replace('-', '').split('|'))
    return list(genres - {'(no genres listed)', 'imax'})

def agregate_tags(tags):
    return list(map(lambda tag: tag.lower().replace('-', ''), tags))

def join_tags_and_genres(row):
    tags = row.tag if isinstance(row.tag, list) else []
    return '|'.join(sorted(list(itertools.chain.from_iterable([row.genres, tags]))))

class EncodedColIndex(object):
    def __init__(self, ids):
        self.idx2id = dict(enumerate(ids))
        self.id2idx = {id: idx for idx, id in self.idx2id.items()}
        
    def __len__(self):
        return len(self.idx2id)

def encode_id_column(id_column):
    id_column = id_column.astype("category")
    return id_column.cat.codes.copy(), EncodedColIndex(id_column.cat.categories)

def load_and_prepare_data():
    movies = read_csv("movies")    
    movies["movie_id"], moive_id_index = encode_id_column(movies.movieId)
    movies = movies.set_index('movie_id')
    movies.genres = movies.genres.apply(prepare_genres)
    
    ratings = read_csv("ratings")
    ratings["user_id"], user_id_index = encode_id_column(ratings.userId)
    ratings["movie_id"] = ratings.movieId.map(moive_id_index.id2idx)
    
    tags = read_csv("tags")
    tags["movie_id"] = tags.movieId.map(moive_id_index.id2idx)
    tags = tags[['movie_id', 'tag']].groupby(by=['movie_id']).agg({'tag': agregate_tags})['tag']
    
    movies = movies.join(tags)
    movies['tags'] = movies.apply(join_tags_and_genres, axis=1)
    return (
        movies[['title', 'tags']], 
        ratings[['movie_id', 'user_id', 'rating']],
        moive_id_index, 
        user_id_index
    )

In [3]:
from sklearn.model_selection import train_test_split

def split_data(data, size=0.2):
    data_train, data_test = train_test_split(data, test_size=size, stratify=data.user_id, random_state=0)
    test_user_set = set(data_test.user_id.unique())
    train_user_set = set(data_train.user_id.unique())
    test_movie_set = set(data_test.movie_id.unique())
    train_movie_set = set(data_train.movie_id.unique())    
    
    # Оставляем только пользователей которые есть одновременно в тестовой и обучающей выборке
    # Так же удаляем из тестовой выборки записи о неизвестных в тренировочной выборке фильмах
    user_ids_to_exclude = (test_user_set - train_user_set).union(train_user_set - test_user_set)
    movie_ids_to_exclude = (test_movie_set - train_movie_set)
    return (
        data_train[~data_train.user_id.isin(user_ids_to_exclude)], 
        data_test[~data_test.user_id.isin(user_ids_to_exclude) & ~data_test.movie_id.isin(movie_ids_to_exclude)]
    )

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack, identity

def get_movie_tags_csr(movies_tags):
    """Векторизует и создаёт разреженную матрицу тегов/жанров для фильмов"""
    return CountVectorizer().fit_transform(movies_tags)

def get_movies_features_csr(movies_tags, movies_count):
    """Возвращает разреженную матрицу фич фильмов: id + теги"""
    return hstack([
        identity(movies_count),
        movies_tags
    ])

In [5]:
def make_user_movie_matrix(ratings, shape, binarize=True):
    """
    Возвращает разреженную матрицу оценок фильмов на основе датафрейма 
    :param pandas.DataFrame ratings: оценки фильмов user_id/movie_id/rating
    :param (int, int) shape размерность: users x movies
    :param bool binarize: бинаризировать рейтинг 1/0 или оставить в виде числа
    """
    if binarize:
        mean_rating = ratings.rating.mean()
        rating = (ratings.rating > mean_rating).tolist()
    else:
        rating = ratings.rating
    
    return csr_matrix((
        rating, 
        (ratings.user_id, ratings.movie_id)
    ), shape=shape, dtype=np.float32)

In [6]:
movies, ratings, moive_id_index, user_id_index = load_and_prepare_data()
movie_titles = dict(enumerate(movies.title))
movies_count = len(moive_id_index)
users_count = len(user_id_index)

In [7]:
movies.head()

Unnamed: 0_level_0,title,tags
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Toy Story (1995),adventure|animation|children|comedy|fantasy|pixar
1,Jumanji (1995),adventure|children|fantasy
2,Grumpier Old Men (1995),comedy|romance
3,Waiting to Exhale (1995),comedy|drama|romance
4,Father of the Bride Part II (1995),comedy|steve martin


In [8]:
ratings.head()

Unnamed: 0,movie_id,user_id,rating
0,30,0,2.5
1,833,0,3.0
2,859,0,3.0
3,906,0,2.0
4,931,0,4.0


In [9]:
list(movie_titles.items())[:10]

[(0, 'Toy Story (1995)'),
 (1, 'Jumanji (1995)'),
 (2, 'Grumpier Old Men (1995)'),
 (3, 'Waiting to Exhale (1995)'),
 (4, 'Father of the Bride Part II (1995)'),
 (5, 'Heat (1995)'),
 (6, 'Sabrina (1995)'),
 (7, 'Tom and Huck (1995)'),
 (8, 'Sudden Death (1995)'),
 (9, 'GoldenEye (1995)')]

In [10]:
from collections import Counter
tags = Counter()
for movie_tags in movies.tags.tolist():
    tags.update(movie_tags.split('|'))
tags.most_common(10)

[('drama', 4370),
 ('comedy', 3328),
 ('thriller', 1732),
 ('romance', 1550),
 ('action', 1550),
 ('adventure', 1118),
 ('crime', 1101),
 ('horror', 880),
 ('scifi', 802),
 ('fantasy', 656)]

In [11]:
movies_tags = get_movie_tags_csr(movies.tags)

In [12]:
print(movies.tags[:1])
print(movies_tags[0].todense())

movie_id
0    adventure|animation|children|comedy|fantasy|pixar
Name: tags, dtype: object
[[0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [13]:
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import dok_matrix, csr_matrix

def multiply_by_rows(matrix, row_coefs):
    """Умножает разреженную матрицу построчно на вектор"""
    normalizer = dok_matrix((len(row_coefs), len(row_coefs)))
    normalizer.setdiag(row_coefs)
    return normalizer.tocsr().dot(matrix)

def multiply_by_columns(matrix, col_coefs):
    """Умножает разреженную матрицу поколоночно на вектор"""
    normalizer = dok_matrix((len(col_coefs), len(col_coefs)))
    normalizer.setdiag(col_coefs)
    return matrix.dot(normalizer.tocsr())

def weight_tags_by_ppmi(movies_tags, cds=1):
    """Рассчёт PPMI для матрицы тегов фильмов"""
    marginal_movies = np.array(movies_tags.sum(axis=1))[:, 0]
    marginal_tags = np.array(movies_tags.sum(axis=0))[0, :]
    marginal_tags = marginal_tags ** cds
    sum_total = marginal_tags.sum()
    
    pmi = movies_tags.copy()
    pmi *= sum_total
    pmi = multiply_by_rows(pmi, np.reciprocal(marginal_movies.clip(1.0)))
    pmi = multiply_by_columns(pmi, np.reciprocal(marginal_tags.clip(1.0)))
    pmi.data = np.log2(pmi.data).clip(0.0)
    return pmi

def make_svd_tags(movies_tags, n_components=35, n_iter=100):
    """Факторизация матрицы тегов"""
    svd = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=0)
    return svd.fit_transform(movies_tags)

In [14]:
from sklearn.preprocessing import normalize

def get_similarity_by_tags_matrix(movies_tags, method='ppmi_svd', n_components=35, n_iter=100):
    """
    Возвращает матрицу схожести фильмов на основе матрицы тегов
    :param scipy.sparse.csr.csr_matrix movies_tags: матрица тегов фильмов (movies x tags)
    :param str method:
        - mormalize: косинусное расстояние между тегами фильмов
        - ppmi: косинусное расстояние на матрице тегов взвешенной с помощью ppmi
        - ppmi_svd: косинусное расстояние на факторизованной матрице ppmi
    :param int n_components: чисо компонент при факторизации с помощью svd
    :param int n_iter: чисо итераций при факторизации с помощью svd
    """
    if method in {'ppmi', 'ppmi_svd'}:
        movies_tags = weight_tags_by_ppmi(movies_tags)
        
    if method == 'ppmi_svd':
        movies_tags = make_svd_tags(movies_tags, n_components=n_components, n_iter=n_iter)
        
    movies_tags = normalize(movies_tags)
    return movies_tags.dot(movies_tags.T)

In [15]:
# Получаем матрицу схожести фильмов по тегам и жанрам (факторизация ppmi матрицы)
similarity_by_tags = get_similarity_by_tags_matrix(movies_tags, 'ppmi_svd', 50, 100)

In [16]:
from operator import itemgetter
from scipy.sparse import issparse

def find_movie(value):
    """Ищет фильм по части названия и выводит"""
    value = value.lower()
    for idx, title in movie_titles.items():
        if value in title.lower():
            print(idx, title)
            
def get_similar_by_tags(idx, n=10):
    """Возвращает список n похожих фильмов по посчитанной матрице близости на тегах"""
    sim_vector = similarity_by_tags[idx]
    if issparse(similarity_by_tags):
        sim_vector = sim_vector.todense().tolist()[0]
    return sorted(list(enumerate(sim_vector)), key=itemgetter(1), reverse=True)[:n]

def similarity_do_dataframe(similarity_list):
    """Преобразует список похожих фильмов в DataFrame для удобного вывода"""
    return pd.DataFrame([
        (similarity, movie_titles[idx], movies.loc[idx].tags)
        for idx, similarity in similarity_list
    ], columns=['similarity', 'movie', 'tags'])

In [17]:
find_movie('matrix')

2062 Matrix, The (1999)
4603 Matrix Reloaded, The (2003)
4880 Matrix Revolutions, The (2003)
5922 Animatrix, The (2003)


In [18]:
similarity_do_dataframe(get_similar_by_tags(2062))

Unnamed: 0,similarity,movie,tags
0,1.0,"Matrix, The (1999)",action|philosophy|scifi|scifi|thriller|virtual...
1,0.70862,V for Vendetta (2006),action|dystopia|scifi|thoughtprovoking|thriller
2,0.670196,Inception (2010),action|alternate reality|alternate reality|chr...
3,0.641754,Outbreak (1995),action|drama|scifi|thriller
4,0.641754,"Omega Man, The (1971)",action|drama|scifi|thriller
5,0.641754,Impostor (2002),action|drama|scifi|thriller
6,0.641754,Star Trek: Nemesis (2002),action|drama|scifi|thriller
7,0.641754,"Core, The (2003)",action|drama|scifi|thriller
8,0.641754,Battlestar Galactica: Razor (2007),action|drama|scifi|thriller
9,0.641754,Doomsday (2008),action|drama|scifi|thriller


In [19]:
find_movie('Star Wars')

232 Star Wars: Episode IV - A New Hope (1977)
953 Star Wars: Episode V - The Empire Strikes Back (1980)
966 Star Wars: Episode VI - Return of the Jedi (1983)
2103 Star Wars: Episode I - The Phantom Menace (1999)
4103 Star Wars: Episode II - Attack of the Clones (2002)
6139 Star Wars: Episode III - Revenge of the Sith (2005)
7023 Star Wars: The Clone Wars (2008)
7570 Empire of Dreams: The Story of the 'Star Wars' Trilogy (2004)
8783 Star Wars: Episode VII - The Force Awakens (2015)


In [20]:
similarity_do_dataframe(get_similar_by_tags(232)) # Star Wars: Episode IV - A New Hope (1977)

Unnamed: 0,similarity,movie,tags
0,1.0,Star Wars: Episode IV - A New Hope (1977),action|action|adventure|awesome|awesome soundt...
1,0.756142,Star Wars: Episode VI - Return of the Jedi (1983),action|action|adventure|aliens|george lucas|ha...
2,0.531337,Seven Samurai (Shichinin no samurai) (1954),action|adventure|classic|drama|long
3,0.478899,Star Wars: Episode VII - The Force Awakens (2015),action|adventure|fantasy|scifi|space|star
4,0.44009,Self/less (2015),action|meaning of life|mystery|philosophical|s...
5,0.361783,My Sister's Keeper (2009),abigail breslin|drama|ending|genuine character...
6,0.338544,Fanny and Alexander (Fanny och Alexander) (1982),coming of age|drama|fantasy|funny|mystery
7,0.29943,Sleepers (1996),emotional|revenge|thriller|true story
8,0.292524,Inglorious Bastards (Quel maledetto treno blin...,action|adventure|drama|war|world war ii
9,0.284802,Star Trek (2009),action|adventure|sci fi|scifi|spock|star trek|...


In [21]:
find_movie('ameli')

3856 Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
7363 Amelia (2009)


In [22]:
similarity_do_dataframe(get_similar_by_tags(3856))

Unnamed: 0,similarity,movie,tags
0,1.0,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ...",beautifully filmed|comedy|comedy|comedy|drama|...
1,0.718776,What About Bob? (1991),bill murray|comedy|quirky
2,0.704917,Kelly & Cal (2014),comedy|drama|drama|indie|love
3,0.670741,Groundhog Day (1993),alternate reality|bill murray|character develo...
4,0.587348,Watchmen (2009),action|alternate reality|diluted version of co...
5,0.555357,Fargo (1996),comedy|crime|drama|overrated|quirky|thriller
6,0.553286,Love Actually (2003),british|christmas|comedy|drama|ensemble cast|k...
7,0.528398,"Fearless Vampire Killers, The (1967)",beautiful woman|campy|comedy|horror|irreverent...
8,0.498305,"Fish Called Wanda, A (1988)",comedy|crime|dark comedy|dark humor|john clees...
9,0.473099,"Grand Budapest Hotel, The (2014)",amazing storytelling|bill murray|cinematograph...


In [23]:
find_movie('Super Mario Bros')

489 Super Mario Bros. (1993)


In [24]:
similarity_do_dataframe(get_similar_by_tags(489))

Unnamed: 0,similarity,movie,tags
0,1.0,Super Mario Bros. (1993),action|adventure|children|comedy|fantasy|scifi
1,0.964087,Star Kid (1997),adventure|children|fantasy|scifi
2,0.964087,Aliens in the Attic (2009),adventure|children|fantasy|scifi
3,0.963742,"Honey, I Shrunk the Kids (1989)",adventure|children|comedy|fantasy|scifi
4,0.932509,Race to Witch Mountain (2009),adventure|children|fantasy|scifi|thriller
5,0.931018,Teenage Mutant Ninja Turtles (1990),action|children|comedy|fantasy|scifi
6,0.889856,*batteries not included (1987),children|comedy|fantasy|scifi
7,0.885258,"Goonies, The (1985)",action|adventure|children|comedy|fantasy
8,0.885258,Teenage Mutant Ninja Turtles III (1993),action|adventure|children|comedy|fantasy
9,0.885258,"Sorcerer's Apprentice, The (2010)",action|adventure|children|comedy|fantasy


In [25]:
# Получаем матрицу рекомендаций для всех пользователей
user_movie_matrix = make_user_movie_matrix(ratings, (users_count, movies_count), binarize=False)
recommendations = user_movie_matrix.dot(similarity_by_tags)

In [26]:
# Делим датасет с оценками на обучающую и тестовую выборки
ratings_train, ratings_test = split_data(ratings)
print("Размер обучающей/тестовой выборки: {}/{}".format(ratings_train.shape[0], ratings_test.shape[0]))
print("Всего пользователей в обучающей/тестовой выборке: {}/{}".format(
    len(ratings_train.user_id.unique()), 
    len(ratings_test.user_id.unique())
))
print("Всего фильмов в обучающей/тестовой выборке: {}/{}".format(
    len(ratings_train.movie_id.unique()), 
    len(ratings_test.movie_id.unique())
))

Размер обучающей/тестовой выборки: 80003/19274
Всего пользователей в обучающей/тестовой выборке: 671/671
Всего фильмов в обучающей/тестовой выборке: 8399/4192


In [27]:
# Преобразуем тестовую и обучающую выборки в матрицу
ratings_train_csr = make_user_movie_matrix(ratings_train, (users_count, movies_count))
ratings_test_csr = make_user_movie_matrix(ratings_test, (users_count, movies_count))

In [28]:
from lightfm import LightFM
from lightfm.evaluation import auc_score

# обучаем абы какую модель
model = LightFM(loss="bpr")
model.fit(ratings_train_csr)

train_score = auc_score(model, ratings_train_csr).mean()
test_score = auc_score(model, ratings_test_csr).mean()
print('ROC AUC: train {:.2f}, test {:.2f}'.format(train_score, test_score))



ROC AUC: train 0.63, test 0.62


In [29]:
# Получаем фичи фильмов для Light.fm (id фмльма + теги/жанры)
movies_features = get_movies_features_csr(movies_tags, movies_count)

In [30]:
# movies_tags2 = movies_tags.copy()
# movies_tags2.data = movies_tags.data * 0 + 1
# movies_features = get_movies_features_csr(movies_tags2, movies_count)

In [45]:
# обучаем модель с жанрами
model = LightFM(loss="bpr", no_components=5)
model.fit(
    ratings_train_csr,
    item_features=movies_features
)

train_score = auc_score(model, ratings_train_csr).mean()
test_score = auc_score(model, ratings_test_csr).mean()
print('ROC AUC: train {:.2f}, test {:.2f}'.format(train_score, test_score))

ROC AUC: train 0.65, test 0.64


### Непонятный кажется баг в Light.fm :( Пока просто закометировал строчку
```python
if not item_features.shape[1] == self.item_embeddings.shape[0]:
    raise ValueError('Incorrect number of features in item_features')
```

In [32]:
movies_features.shape, movies_tags.shape, movies.shape, ratings_train_csr.shape
# ((9125, 9891), (9125, 766), (9125, 2), (671, 9125))

((9125, 9891), (9125, 766), (9125, 2), (671, 9125))

In [36]:
movies_features.shape[1], model.item_embeddings.shape[0]

(9891, 9891)

In [None]:
model.it