In [1]:
import json
import numpy as np
import pandas as pd

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import SVD
from surprise import BaselineOnly
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ML Pre processing
from surprise.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hyperparameter tuning
from surprise.model_selection import GridSearchCV


In [2]:
train = pd.read_json('./ratings.jsonl', lines=True)
df_movies = pd.read_json('./content.jsonl', lines=True)

In [3]:
train.head(5)

Unnamed: 0,UserId,ItemId,Timestamp,Rating
0,c4ca4238a0,91766eac45,2013-10-05 22:00:50,8
1,c81e728d9d,5c739554f7,2013-08-17 16:26:38,9
2,c81e728d9d,48f6d7ce7c,2013-08-17 13:28:27,8
3,c81e728d9d,e9318d627a,2013-06-15 15:38:09,1
4,a87ff679a2,17e6357973,2014-01-31 23:27:59,8


In [4]:
df_movies.head(6)


Unnamed: 0,ItemId,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
0,c9f0f895fb,Edison Kinetoscopic Record of a Sneeze,1894,,09 Jan 1894,1 min,"Documentary, Short",William K.L. Dickson,,Fred Ott,...,movie,,,,,True,,,,
1,d3d9446802,Leaving the Factory,1895,Not Rated,22 Mar 1895,1 min,"Documentary, Short",Louis Lumière,,,...,movie,,,,,True,,,,
2,c20ad4d76f,The Arrival of a Train,1896,Not Rated,25 Jan 1896,1 min,"Documentary, Short","Auguste Lumière, Louis Lumière",,"Madeleine Koehler, Marcel Koehler, Mrs. August...",...,movie,,,,,True,,,,
3,8e296a067a,The Oxford and Cambridge University Boat Race,1895,,,,"Short, News, Sport",Birt Acres,,,...,movie,,,,,True,,,,
4,54229abfcf,The House of the Devil,1896,Not Rated,24 Dec 1896,3 min,"Short, Horror",Georges Méliès,Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",...,movie,,,,,True,,,,
5,1afa34a7f9,Une nuit terrible,1896,Not Rated,,1 min,"Short, Comedy, Horror",Georges Méliès,,Georges Méliès,...,movie,,,,,True,,,,


In [5]:
df_train = train.copy()
df_merge1 = df_train.merge(df_movies, on='ItemId')
df_merge1.head()

Unnamed: 0,UserId,ItemId,Timestamp,Rating,Title,Year,Rated,Released,Runtime,Genre,...,Type,DVD,BoxOffice,Production,Website,Response,totalSeasons,Season,Episode,seriesID
0,c4ca4238a0,91766eac45,2013-10-05 22:00:50,8,Species,1995,R,07 Jul 1995,108 min,"Action, Horror, Sci-Fi",...,movie,17 Jul 2001,"$60,074,103",,,True,,,,
1,f0dd4a99fb,91766eac45,2014-03-08 17:37:07,7,Species,1995,R,07 Jul 1995,108 min,"Action, Horror, Sci-Fi",...,movie,17 Jul 2001,"$60,074,103",,,True,,,,
2,e11943a603,91766eac45,2013-06-20 04:34:03,8,Species,1995,R,07 Jul 1995,108 min,"Action, Horror, Sci-Fi",...,movie,17 Jul 2001,"$60,074,103",,,True,,,,
3,a2b15837ed,91766eac45,2017-02-19 15:13:57,7,Species,1995,R,07 Jul 1995,108 min,"Action, Horror, Sci-Fi",...,movie,17 Jul 2001,"$60,074,103",,,True,,,,
4,4cea2358d3,91766eac45,2016-05-02 20:44:55,7,Species,1995,R,07 Jul 1995,108 min,"Action, Horror, Sci-Fi",...,movie,17 Jul 2001,"$60,074,103",,,True,,,,


In [6]:
df_merge1.columns

Index(['UserId', 'ItemId', 'Timestamp', 'Rating', 'Title', 'Year', 'Rated',
       'Released', 'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot',
       'Language', 'Country', 'Awards', 'Poster', 'Ratings', 'Metascore',
       'imdbRating', 'imdbVotes', 'Type', 'DVD', 'BoxOffice', 'Production',
       'Website', 'Response', 'totalSeasons', 'Season', 'Episode', 'seriesID'],
      dtype='object')

In [7]:
df_merge1.rename(columns={'rating_x': 'Rating', 'rating_y': 'numRatings'}, inplace=True)
num_ratings = pd.DataFrame(df_merge1.groupby('ItemId').count()['Rating']).reset_index()
df_merge1 = pd.merge(left=df_merge1, right=num_ratings, on='ItemId')
df_merge1.rename(columns={'rating_x': 'rating', 'rating_y': 'numRatings'}, inplace=True)

In [8]:
df_1 = df_movies[['ItemId', 'Actors', 'Director', 'Genre', 'Title']]
df_1['Actors'] = df_1.Actors.astype(str)
# df_1['plot_keywords'] = df_1.plot_keywords.astype(str) # get keywords from the 'Plot' field
df_1['Genre'] = df_1.Genre.astype(str)
df_1['Director'] = df_1.Director.astype(str)


# Removing spaces between names
df_1['Director'] = df_1['Director'].apply(lambda x: ' '.join(x.replace(' ', '').lower() for x in x.split(', ')))
df_1['Actors'] = df_1['Actors'].apply(lambda x: ' '.join(x.replace(' ', '').lower() for x in x.split(', ')))

# Discarding the pipes between the plot keywords' and getting only the first five words
# df_1['plot_keywords'] = df_1['plot_keywords'].map(lambda x: x.split('|')) # 'Plot' field but only the keywords
# df_1['plot_keywords'] = df_1['plot_keywords'].apply(lambda x: " ".join(x)) #  'Plot' field but only the keywords

# Discarding the pipes between the genres 
df_1['Genre'] = df_1['Genre'].map(lambda x: x.lower().replace(', ', ' '))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Actors'] = df_1.Actors.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Genre'] = df_1.Genre.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['Director'] = df_1.Director.astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [9]:
df_features = df_1[['Actors', 'Director', 'Genre', 'ItemId']]


indices = {row['ItemId']: index for (index, row) in df_features.iterrows()}


In [10]:
df_features = df_1[['Actors', 'Director', 'Genre', 'ItemId']]
df_1['combined_features'] = df_features['Actors'] +' '+ df_features['Director'] +' '+ df_features['Genre']

cv = CountVectorizer()
# cv_matrix = cv.fit_transform(df_1['combined_features'].sample(frac=.9))
cv_matrix = cv.fit_transform(df_1['combined_features'])
print("================= HERE")

#now we obtain the cosine similarity matrix from the cv matrix
sim_score = cosine_similarity(cv_matrix, cv_matrix)
print("================= HERE2")

# df_1.set_index('ItemId', inplace = True)

# dictionary with ItemId -> Index in the similarity matrix 
indices = {row['ItemId']: index for (index, row) in df_features.iterrows()}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_1['combined_features'] = df_features['Actors'] +' '+ df_features['Director'] +' '+ df_features['Genre']




In [11]:
def get_similar_by_items(user_rated_items, target_item):
    sims = []
    for item_id in user_rated_items:
        sims.append(sim_score[indices[item_id]][indices[target_item]])

    return pd.DataFrame({'ItemId': user_rated_items, 'Similarity': sims})




def recommendations(title, n, sim_score=sim_score):
    recommended_movies = []

    # gettin the index of the movie that matches the title
    idx = indices[indices == title].index[0]

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(sim_score[idx]).sort_values(ascending = False)

    # getting the indexes of the 10 most similar movies
    top_n_indexes = list(score_series.iloc[1:n+1].index)

    # populating the list with the titles of the best n matching movies
    for i in top_n_indexes:
        recommended_movies.append(list(df_1.index)[i])
        
    return recommended_movies
# recommendations('Species', 15, sim_score)

In [37]:
def get_rated_items(user_id):
    rated_items = df_train[df_train['UserId'] == user_id][['ItemId', 'Rating']]
    return rated_items


# llist = [
# 'c9f0f895fb',
# 'd3d9446802',
# 'c20ad4d76f',
# '8e296a067a',
# '54229abfcf'
# ]

# df_similarity = get_similar_by_items(llist, '1afa34a7f9')
# rated_items = get_rated_items('a87ff679a2')


# rated_items['ItemId'].tolist()

def predict(user_id, item_id):
    related_items = get_rated_items(user_id)
    similarities = get_similar_by_items(related_items['ItemId'].tolist(), item_id)
    merged_df = related_items.merge(similarities, on='ItemId')
    # print(similarities)
    # print(related_items)
    
    if(merged_df.shape[0] != 0):
        user_mean = 0.0
        for (_, row) in merged_df.iterrows():
            user_mean += (row['Rating']*row['Similarity'])

        return user_mean/(merged_df.shape[0])
    else:
        return 0.


# predict('0006246bee','01d2404d4c')

In [41]:
df_to_predict = pd.read_csv('targets.csv', encoding='latin-1', sep=',')
user_index = 1
# user_movies = df_to_predict.iloc[user_index*100: (user_index+1)*100]
# print(user_movies.iloc[0]['UserId'])

output_file = open('out.csv','w')
print('UserId,ItemId', file=output_file)
while((user_index*100)+1 < df_to_predict.shape[0]):
    # break
    user_movies = df_to_predict.iloc[(user_index-1)*100: user_index*100]
    user_index += 1
    user_ranking = []
    uid = user_movies.iloc[0]['UserId']
    for row in user_movies.itertuples():
        user_ranking.append((predict(row.UserId, row.ItemId), row.ItemId))

    user_ranking.sort(reverse=True)
    for(rank, item_id) in user_ranking:
        print('{},{}'.format(uid,item_id), file=output_file)
    # break
# user_movies

KeyboardInterrupt: 

In [None]:
# df_1.loc['Species']
indices.head()

0           Edison Kinetoscopic Record of a Sneeze
1                              Leaving the Factory
2                           The Arrival of a Train
3    The Oxford and Cambridge University Boat Race
4                           The House of the Devil
Name: Title, dtype: object

In [None]:
df_1['Actors'].head()

Title
Edison Kinetoscopic Record of a Sneeze                                                     fredott
Leaving the Factory                                                                            n/a
The Arrival of a Train                           madeleinekoehler marcelkoehler mrs.augustelumiere
The Oxford and Cambridge University Boat Race                                                  n/a
The House of the Devil                              jehanned'alcy jules-eugènelegris georgesméliès
Name: Actors, dtype: object