In [1069]:
import sys
import pandas as pd
import scipy
import numpy as np
from os import path
import re
from scipy.sparse import csr_matrix, dok_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics as skm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, reciprocal_rank
data_dir = "/home/anakonda/data/latest_ml-100k/ml-latest-small/"

In [61]:
ratings = read_csv("ratings")
movies = read_csv("movies")
tags = read_csv('tags')
movies["movie_id"] = movies["movieId"].astype("category").cat.codes.copy() + 1
last_movie_id = movies["movie_id"].max()
last_user_id = ratings["userId"].max()
ratings = ratings.apply(process_movie_id,axis=1)
tags = tags.apply(process_movie_id,axis = 1)

In [416]:
mean_rating = ratings['rating'].mean()
mean_rating

3.543608255669773

In [684]:
movies.tail()

Unnamed: 0,movieId,title,genres,movie_id,genres_list
9120,162672,Mohenjo Daro (2016),Adventure|Drama|Romance,9121,adventure drama romance
9121,163056,Shin Godzilla (2016),Action|Adventure|Fantasy|Sci-Fi,9122,action adventure fantasy scifi
9122,163949,The Beatles: Eight Days a Week - The Touring Y...,Documentary,9123,documentary
9123,164977,The Gay Desperado (1936),Comedy,9124,comedy
9124,164979,"Women of '69, Unboxed",Documentary,9125,documentary


Приведем теги в единый вид - удалим все спецсимволы, даже проигнорируем слишком длинные теги

In [94]:
tags_v2 = tags.apply(process_tag, axis = 1)

we will ignore this tag: play_enough_video_games_and_you_can_become_an_nsa_agent
we will ignore this tag: interesting_concept__bad_execution
we will ignore this tag: sufficiently_explodey_to_be_good
we will ignore this tag: id_like_to_live_in_this_movie
we will ignore this tag: that_fat_nerd_is_just_annoying
we will ignore this tag: as_historicaly_correct_as_germany_winning_ww2
we will ignore this tag: but_still_a_fun_movie
we will ignore this tag: the_rocks_finest_work_need_i_say_more
we will ignore this tag: try_not_to_mistake_this_for_an_episode_of_alias
we will ignore this tag: why_the_terrorists_hate_us
we will ignore this tag: liked_the_other_two_better
we will ignore this tag: no_desire_to_see_this
we will ignore this tag: i_loved_it_seen_it_five_times_already
we will ignore this tag: based_on_a_tv_show
we will ignore this tag: based_on_a_true_story
we will ignore this tag: villain_nonexistent_or_not_needed_for_good_story
we will ignore this tag: nudity_full_frontal__notable
we 

Unnamed: 0,userId,timestamp,movie_id,tags_list
0,15,1138537770,305,sandraboringbullock
1,15,1193435061,1518,dentist
2,15,1170560997,5167,cambodia
3,15,1170626366,6119,russian
4,15,1141391765,6179,forgettable


In [165]:
tags_v3 = tags_v2.groupby('movie_id')['tags_list'].apply(lambda x: x.sum())
tmp_df = []
for i in tags_v3.index:
    tmp_df.append([i,tags_v3[i]])


In [166]:
tags_v4 = pd.DataFrame(tmp_df,columns=['movie_id','tags'])

In [275]:
tags_v4.count()

movie_id    689
tags        689
dtype: int64

In [177]:
cv = CountVectorizer()
cv.fit(tags_v4["tags"])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [555]:
c = last_movie_id 
tags_dok = dok_matrix((c ,len(cv.vocabulary_)))
tmp = 0
for i in range(c):
    #print(i)
    t = tags_v4[tags_v4['movie_id'] == i]['tags']
    if t.empty: 
        continue;
    t=t.values[0]
    #print("process_tag:{}".format(t))
    for j in t.split(' '):
        j = j.strip()
        if j.isalpha():
            j1 = cv.vocabulary_[j]
            if tags_dok[i,j1]:
                tmp += 1
            tags_dok[i,j1] = 1
            
            #print("add_tag:{}".format(j) )
    

In [556]:
tags_dok

<9125x536 sparse matrix of type '<class 'numpy.float64'>'
	with 979 stored elements in Dictionary Of Keys format>

In [290]:
from  sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer

In [557]:
tags_dok.shape

(9125, 536)

In [558]:
tags_normalizer = Normalizer(norm='l1')
tags_dok_n = tags_normalizer.fit_transform(tags_dok)

In [559]:
tags_distance = pairwise_distances(tags_dok_n,metric='cosine')

In [560]:
tags_distance.shape

(9125, 9125)

In [561]:
tags_distance_dok = dok_matrix(tags_distance.shape)

r = 0
for i in tags_distance:
    for j in i.argsort()[:10]:
        if r == j:
            continue
        if i[j] == 1:
            break
        tags_distance_dok[r,j] = i[j]
    r+=1

In [562]:
tag_distance_csr = tags_distance_dok.tocsr()

In [563]:
print(last_movie_id, last_user_id)
ratings.head()

9125 671


Unnamed: 0,userId,rating,timestamp,movie_id
0,1.0,2.5,1260759000.0,31
1,1.0,3.0,1260759000.0,834
2,1.0,3.0,1260759000.0,860
3,1.0,2.0,1260759000.0,907
4,1.0,4.0,1260759000.0,932


### Разреженная матрица user/item

In [565]:
ratings_csr = csr_matrix(
    (
        (((ratings["rating"] - mean_rating) > 0).astype(int) ).tolist(),
            (
                ratings.userId.tolist(),
                ratings.movie_id.tolist()
            )
    ),
    shape=(last_user_id +1 , last_movie_id)
)

In [566]:
ratings_csr[1,932]

1

In [568]:
rating_normalizer = Normalizer()
ratings_csr_n = rating_normalizer.fit_transform(ratings_csr)

### Учитывая количество данных в тегах, рассчитывать на то, что предсказание по ним будет хоть как-то адекватным, не приходится. 

Просто выведем данные по пользователю. Метрики попробуем посчитать позже.

In [1166]:
userId = 3
some_user = ratings_csr_n[userId]
some_user_pred = some_user.dot(tag_distance_csr)
films = some_user_pred.data.argsort()[-10:]
print("Просмотренные фильмы")
watched = []
for i in ratings[ratings['userId'] == userId]['movie_id']:
    watched.append(movies[movies['movie_id']  == i]['title'].values[0])
watched.sort()
print(watched)
print("Рекомендованные фильмы")
recomended = []
for i in films:
    movie_id = some_user_pred.indices[i]
    recomended.append(movies[movies['movie_id']  == movie_id]['title'].values[0])
recomended.sort()
print(recomended)

Просмотренные фильмы
['Aladdin (1992)', 'American Beauty (1999)', 'Batman (1989)', 'Beauty and the Beast (1991)', 'Big Daddy (1999)', 'Bound (1996)', 'Bowling for Columbine (2002)', 'Braveheart (1995)', 'Daria: Is It Fall Yet? (2000)', 'Dark Knight, The (2008)', 'Encino Man (1992)', 'Eternal Sunshine of the Spotless Mind (2004)', 'Fahrenheit 9/11 (2004)', 'Fear and Loathing in Las Vegas (1998)', 'Fight Club (1999)', 'Finding Nemo (2003)', 'Flags of Our Fathers (2006)', 'Flintstones, The (1994)', 'Forrest Gump (1994)', 'Frequency (2000)', 'Fried Green Tomatoes (1991)', 'Ghostbusters (a.k.a. Ghost Busters) (1984)', 'Happiness (1998)', 'Harold and Maude (1971)', 'Heavenly Creatures (1994)', 'Indian in the Cupboard, The (1995)', 'Letters from Iwo Jima (2006)', 'Lord of the Rings: The Return of the King, The (2003)', 'Major Payne (1995)', 'Men in Black (a.k.a. MIB) (1997)', 'Pet Sematary (1989)', 'Princess Bride, The (1987)', 'Pulp Fiction (1994)', 'Requiem for a Dream (2000)', 'Saving Priv

In [1070]:
ratings_csr = csr_matrix(
    (
        (((ratings["rating"] - mean_rating) > 0).astype(int) ).tolist(),
        #ratings["rating"].tolist(),
            (
                ratings.userId.tolist(),
                ratings.movie_id.tolist()
            )
    ),
    shape=(last_user_id + 1, last_movie_id)
)

In [1124]:
# Разобъем на обучающую и тестовые выборки
# В матрице у нас только единицы и нули.
test_indices = np.random.choice(
    range(ratings_csr.nnz),
    replace=False,
    size=int(ratings_csr.nnz * 0.5)
).tolist()

train_csr = ratings_csr.copy()
train_csr.data[test_indices] = 0


test_csr = ratings_csr.copy()
test_csr.data[:] = 0
test_csr.data[test_indices] = ratings_csr.data[test_indices]
test_csr.eliminate_zeros()
train_csr.eliminate_zeros()
train_csr_n = rating_normalizer.fit_transform(train_csr)
print("total: {}, train: {}, test: {}".format(ratings_csr.nnz, train_csr.nnz, test_csr.nnz))


total: 100004, train: 25714, test: 25854


### Метрика precision @ K

In [1177]:
metric_acc = 0
K = 5
for i in range(last_user_id):
    userId = i + 1
    some_user = train_csr_n[userId]
    some_user_pred = some_user.dot(tag_distance_csr)
    pred_v = np.array(some_user_pred.todense().tolist()[0])
    true_v = test_csr[userId]
    metric_acc += my_precision(true_v, pred_v, K)
metric_acc / last_user_id

0.068256333830104376

### Метрика average precision @ K

In [1179]:
metric_acc = 0
K = 5
for i in range(last_user_id):
    userId = i + 1
    some_user = train_csr_n[userId]
    some_user_pred = some_user.dot(tag_distance_csr)
    pred_v = np.array(some_user_pred.todense().tolist()[0])
    true_v = test_csr[userId]
    metric_acc += avg_precision(true_v, pred_v, K)
metric_acc / last_user_id


0.035837059115747624

### Метрика mean reciprocal rank

In [1182]:
metric_acc = 0
for i in range(last_user_id):
    userId = i + 1
    some_user = train_csr_n[userId]
    some_user_pred = some_user.dot(tag_distance_csr)
    pred_v = np.array(some_user_pred.todense().tolist()[0])
    true_v = test_csr[userId]
    metric_acc += my_reciprocal_rank(true_v, pred_v)
metric_acc / last_user_id

0.1782670739342147

In [831]:
t1 = np.array([0,0,1,0,1,1,0,0,0,1,0,1,0,0,0,0])
t2 = np.array([1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1])
skm.accuracy_score(t1,t2,normalize=False)

9

In [1183]:
# обучаем абы какую модель
model = LightFM(loss="bpr")
model.fit(
    train_csr,
    num_threads=4,
    epochs=3,
)
print_model_metrics(model)

MRR: train 0.32, test 0.30.
Precision@5: train 0.18, test 0.15.
Precision@10: train 0.14, test 0.12.
Precision@15: train 0.12, test 0.10.
Precision@20: train 0.11, test 0.10.


In [1186]:
metric_acc = 0
K = 5
for i in range(last_user_id):
    userId = i + 1
    predict = model.predict(userId, np.array(range(last_movie_id)))
    true_v = test_csr[userId]
    metric_acc += avg_precision(true_v, predict, 5)
metric_acc / last_user_id


0.067491306507700022

In [1189]:
userId = 3
predict = model.predict(userId, np.array(range(last_movie_id)))

for i in predict.argsort()[::-1][:10]:
    name = movies[movies['movie_id'] == i]['title'].values[0]
    print("film: {}, relevance: {}".format(name, predict[i]))

film: American Beauty (1999), relevance: -0.13082583248615265
film: Forrest Gump (1994), relevance: -0.13947059214115143
film: Godfather, The (1972), relevance: -0.14498448371887207
film: Silence of the Lambs, The (1991), relevance: -0.15327821671962738
film: Star Wars: Episode IV - A New Hope (1977), relevance: -0.163940891623497
film: Pulp Fiction (1994), relevance: -0.17496216297149658
film: Almost Famous (2000), relevance: -0.19754557311534882
film: Some Like It Hot (1959), relevance: -0.20041882991790771
film: Toy Story (1995), relevance: -0.20245549082756042
film: Matrix, The (1999), relevance: -0.20641151070594788


In [1073]:
def my_precision(true_csr, pred_dense, K):
    indices = pred_dense.argsort()[::-1][:K]
    num_el = K
    #num_el = len(false_csr.indices)
    acc = 0
    for i in indices:
        if i in true_csr.indices:
            acc += 1
    return acc/num_el

In [1032]:
def avg_precision(true_csr, pred_csr, K):
    indices = pred_csr.argsort()[::-1][:K]
    num_el = K
    acc = 0
    j = 1
    for i in indices:
        #print("test_pred_ind{}".format(i))
        if i in true_csr.indices:
            acc += 1 / j
        j += 1
    return acc / num_el

In [1181]:
def my_reciprocal_rank(true_scr, pred_dense):
    indices = pred_dense.argsort()[::-1]
    rv = 0
    j = 1
    for i in indices:
        if i in true_scr.indices:
            return 1 / j
        j += 1
    return 0

In [1074]:
shape = 3
t1 = csr_matrix(
    ( [1,1,1],
        ( 
            [0,0,0],
            [0,1,2],
        )
    ),
    shape = (1, 10),
    dtype = np.int64,
)
                    # 0  1  2   3  4   5  6   7  8  9
predict = np.array([  0, 0.1, 0.2,0,0.6, 0.7,0.7, 0, 0, 0])


In [1078]:
my_precision(t1, predict, 5)

0.4

In [1066]:
avg_precision(t1,predict,4)

0.0625

In [1067]:
reciprocal_rank(t1, predict)

0.25

In [1000]:
t1 = np.array([0,1,2,3,4,5,6,7,8,9])

In [1012]:
t1.argsort()[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [590]:
# добавляем к movie_id ещё и информацию о тэгах
from scipy.sparse import hstack, identity

features = hstack([
    identity(tags_csr.shape[0]),
    tags_csr
])
sparse_info(features)

Размерности матрицы: (9125, 9661)
Ненулевых элементов в матрице: 10104
Доля ненулевых элементов: 0.00011461418809987339
Среднее значение ненулевых элементов: 1.0
Максимальное значение ненулевых элементов: 1.0
Минимальное значение ненулевых элементов: 1.0


In [676]:
# Модель с item-features в виде тегов
model = LightFM(loss="bpr")
model.fit(
    train_csr,
    num_threads=4,
    item_features = features,
    epochs=3,
    
)

train_mrr = reciprocal_rank(model, train_csr, item_features=features).mean()
test_mrr = reciprocal_rank(model, test_csr, item_features=features).mean()
print('MRR: train %.2f, test %.2f.' % (train_mrr, test_mrr))
for k in [5, 10, 15, 20]:
    train_precision = precision_at_k(model, train_csr, item_features=features, k=k).mean()
    test_precision = precision_at_k(model, test_csr, item_features=features, k=k).mean()
    print('Precision@%d: train %.2f, test %.2f.' % (k, train_precision, test_precision))

MRR: train 0.51, test 0.17.
Precision@5: train 0.28, test 0.07.
Precision@10: train 0.23, test 0.07.
Precision@15: train 0.20, test 0.07.
Precision@20: train 0.18, test 0.06.


In [522]:
process_genres(movies.iloc[0]['genres'])

' Adventure Animation Children Comedy Fantasy'

In [536]:
movies['genres_list'] = movies.genres.apply(process_genres)

In [548]:
movies.sort_values('movie_id',inplace=True)

In [538]:
cv_genres = CountVectorizer()
cv_genres.fit(movies['genres_list'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [539]:
cv_genres.vocabulary_

{'action': 0,
 'adventure': 1,
 'animation': 2,
 'children': 3,
 'comedy': 4,
 'crime': 5,
 'documentary': 6,
 'drama': 7,
 'fantasy': 8,
 'filmnoir': 9,
 'horror': 10,
 'imax': 11,
 'musical': 12,
 'mystery': 13,
 'none': 14,
 'romance': 15,
 'scifi': 16,
 'thriller': 17,
 'war': 18,
 'western': 19}

In [581]:
genres_features = cv_genres.transform(movies['genres_list'])

In [582]:
genres_features.shape

(9125, 20)

In [639]:
tags_genres_features = hstack([
    identity(tags_csr.shape[0]),
    tags_csr,
    genres_features,
])
sparse_info(tags_genres_features)

Размерности матрицы: (9125, 9681)
Ненулевых элементов в матрице: 30444
Доля ненулевых элементов: 0.0003446264608122392
Среднее значение ненулевых элементов: 1.0
Максимальное значение ненулевых элементов: 1.0
Минимальное значение ненулевых элементов: 1.0


In [1191]:
# Модель с item-features в виде тегов и жанров
model = LightFM(loss="bpr")
model.fit(
    train_csr,
    num_threads=4,
    item_features = tags_genres_features,
    epochs=3,
    
)

train_mrr = reciprocal_rank(model, train_csr, item_features=tags_genres_features).mean()
test_mrr = reciprocal_rank(model, test_csr, item_features=tags_genres_features).mean()
print('MRR: train %.2f, test %.2f.' % (train_mrr, test_mrr))
for k in [5, 10, 15, 20]:
    train_precision = precision_at_k(model, train_csr, item_features=tags_genres_features, k=k).mean()
    test_precision = precision_at_k(model, test_csr, item_features=tags_genres_features, k=k).mean()
    print('Precision@%d: train %.2f, test %.2f.' % (k, train_precision, test_precision))

MRR: train 0.44, test 0.23.
Precision@5: train 0.18, test 0.09.
Precision@10: train 0.14, test 0.08.
Precision@15: train 0.11, test 0.07.
Precision@20: train 0.10, test 0.07.


In [1192]:
metric_acc = 0
K = 5
for i in range(last_user_id):
    userId = i + 1
    predict = model.predict(userId, np.array(range(last_movie_id)))
    true_v = test_csr[userId]
    metric_acc += avg_precision(true_v, predict, 5)
metric_acc / last_user_id

0.064480874316939954

In [681]:
predict = model.predict(34, np.array(range(last_movie_id)), item_features = tags_genres_features)
for i in predict.argsort()[-20:]:
    name = movies[movies['movie_id'] == i]['title'].values[0]
    print("film: {}, relevance: {}".format(name, predict[i]))

film: Fantasia (1940), relevance: -1.328951120376587
film: Blue Velvet (1986), relevance: -1.325260877609253
film: Body (2015), relevance: -1.3218634128570557
film: General, The (1926), relevance: -1.318421483039856
film: L.A. Confidential (1997), relevance: -1.3178725242614746
film: Witness (1985), relevance: -1.3168854713439941
film: Chinatown (1974), relevance: -1.3052057027816772
film: Tales from the Darkside: The Movie (1990), relevance: -1.2997162342071533
film: Brazil (1985), relevance: -1.2980616092681885
film: View to a Kill, A (1985), relevance: -1.291365623474121
film: Exorcist, The (1973), relevance: -1.2886149883270264
film: Wag the Dog (1997), relevance: -1.2834126949310303
film: Fargo (1996), relevance: -1.2634375095367432
film: Charlie Wilson's War (2007), relevance: -1.2551583051681519
film: Saddest Music in the World, The (2003), relevance: -1.2548884153366089
film: Officer and a Gentleman, An (1982), relevance: -1.2409006357192993
film: Shakespeare in Love (1998), re

In [682]:
len(model.item_embeddings[0])

10

In [602]:
ratings.head()

Unnamed: 0,userId,rating,timestamp,movie_id
0,1.0,2.5,1260759000.0,31
1,1.0,3.0,1260759000.0,834
2,1.0,3.0,1260759000.0,860
3,1.0,2.0,1260759000.0,907
4,1.0,4.0,1260759000.0,932


# Library - все функции описаны здесь

In [540]:
def print_model_metrics(model):
    train_mrr = reciprocal_rank(model, train_csr).mean()
    test_mrr = reciprocal_rank(model, test_csr).mean()
    print('MRR: train %.2f, test %.2f.' % (train_mrr, test_mrr))
    for k in [5, 10, 15, 20]:
        train_precision = precision_at_k(model, train_csr, k=k).mean()
        test_precision = precision_at_k(model, test_csr, k=k).mean()
        print('Precision@%d: train %.2f, test %.2f.' % (k, train_precision, test_precision))

In [541]:
def read_csv(filename: str):
    data = pd.read_csv(path.join(data_dir, filename + ".csv"))
    return data



def sparse_info(sparse_matrix: csr_matrix):
    print("Размерности матрицы: {}".format(sparse_matrix.shape))
    print("Ненулевых элементов в матрице: {}".format(sparse_matrix.nnz))
    print("Доля ненулевых элементов: {}"
          .format(sparse_matrix.nnz / sparse_matrix.shape[0] / sparse_matrix.shape[1])
    )
    print("Среднее значение ненулевых элементов: {}".format(sparse_matrix.data.mean()))
    print("Максимальное значение ненулевых элементов: {}".format(sparse_matrix.data.max()))
    print("Минимальное значение ненулевых элементов: {}".format(sparse_matrix.data.min()))


In [542]:
def process_movie_id(s):
    old_id = s['movieId']
    new_id =  movies[movies['movieId'] == old_id]['movie_id'].values[0]
    new_s = s.copy()
    del new_s['movieId']
    new_s['movie_id'] = new_id
    return new_s

In [543]:
def process_tag(tag_line):
    """
    Обработка тэгов. Правила:
    - Приводим все в нижний регистр
    - Тэги, перечисленные через запятую считаем разными тегами
    - Многословные тэги считаем одинм тегом, но отбрасываем те, что длинее 4 слов,
        так как мы работаем с тэгами, а не с описаниями. 
        Для контроля - выведем то, что отбрасываем в лог.
    - Все спец-символы - удаляем. 

    TODO: можно выделять то, что в скобках как отдельные тэги, 
         отфильтровывая всякий мусор вида 'movie(s)'
    TODO: в тегах есть строки вида r:violence, типа ограничения, какое-то "расширение" тэгов. 
       Пока их мало, обработаем по общим правилам.
    """
    tag = tag_line['tag']
    new_tag_line = tag_line.copy()
    list_of_tags = ""
    for s in tag.split(','):
        s = s.strip()
        s = s.lower()   
        s = re.sub('\s+','_',s)
        s = re.sub('\W','',s)
        
        if (len(s.split('_')) > 4):
            print("we will ignore this tag: {}".format(s))
            #list_of_tags.append('system_none_tags')
            continue  
        s = re.sub('_+','',s)
        
        list_of_tags += ' ' + s
    new_tag_line['tags_list'] = list_of_tags
    del new_tag_line['tag']
    return new_tag_line

In [544]:
def process_genres(genres):
    """
    Обработка жанров:
    * если жанра нет, так и укажем
    * удалим все дефисы

    Функция возвращает список  (list) жанров.
    """
    list_of_genres = ""
    for word in genres.split('|'):
        l = word.replace('(no genres listed)','none').replace('-','')
        l = l.lower()
        list_of_genres += ' ' + l
    return list_of_genres