# Imports


In [3]:
import pandas as pd
import plotly.express as px
from sklearn.feature_extraction.text import TfidfVectorizer
import wikipedia
import numpy as np
from tqdm.notebook import tqdm
from tabulate import tabulate
import re
import os

from sklearn.metrics.pairwise import linear_kernel
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise import KNNBasic
from surprise import NormalPredictor
from surprise.model_selection import train_test_split
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_selection import VarianceThreshold
from surprise.model_selection import LeaveOneOut
import requests
from io import BytesIO
import nltk
from nltk.tag.stanford import StanfordNERTagger
import plotly.io as pio
import ipyplot

In [4]:
BASE_FOLDER = '..'
pio.renderers.default = "notebook_connected"
tqdm.pandas()



# EDA


3 Datasets, movies.dat, ratings.dat, users.dat, that are needed to be explored

## Movies

In [5]:
movies_df = pd.read_csv(
    f"{BASE_FOLDER}/data/raw/movies.dat",
    delimiter="::",
    engine="python",
    header=None,
    names=["movie_name", "genre"],
    encoding="ISO-8859-1"
)
movies_df.head()

Unnamed: 0,movie_name,genre
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


Check the distribution of movies by genres. Explode genres column, turn it to a list, hence some movies will be listed twice since they have multiple genres

In [6]:
movies_df['genre'] = movies_df['genre'].apply(lambda x: x.split('|'))
movies_df_exploded = movies_df.explode('genre')
px.histogram(movies_df_exploded, x='genre', height=400, title='Movie count by genre').update_xaxes(categoryorder="total descending")


Dramas and Comedies are top movies genres, while westerns and noirs are for small groups of people. Drama can be western and noir, but not every western and noir is a drama or comedy. 

In [7]:
movies_df['year'] = movies_df['movie_name'].apply(lambda movie_name: re.search('\((\d*)\)', movie_name).groups(1)[0])
movie_count_by_year = px.histogram(movies_df, x='year', height=400, title='Movie count by year').update_xaxes(categoryorder="total descending")
movie_count_by_year

Obviously, the recent the year is - the more movies was produced due to progress

## Users

Lets take a look at occupations 

In [8]:
readme_text = np.array(open(f'{BASE_FOLDER}/data/raw/README').read().splitlines())
start_index = np.flatnonzero(np.core.defchararray.find(readme_text,'Occupation is chosen')!=-1)[0]
end_index = np.flatnonzero(np.core.defchararray.find(readme_text,'MOVIES FILE DESCRIPTION')!=-1)[0]
occupation_list = [x.split('"')[1] for x in readme_text[start_index:end_index][2:-1].tolist()]
occupation_dict = dict(zip(range(len(occupation_list)), occupation_list))

users_df = pd.read_csv(f'{BASE_FOLDER}/data/raw/users.dat',
                       delimiter='::', engine='python', header=None,
                       names=['user_id', 'gender', 'age', 'occupation', 'zip_code'])
users_df['occupation'] = users_df['occupation'].replace(occupation_dict)
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,2460
4,5,M,25,writer,55455


## Ratings 

In [9]:
ratings_df = pd.read_csv(f'{BASE_FOLDER}/data/raw/ratings.dat',
                         delimiter='::', engine='python', header=None,
                         names=['user_id', 'movie_id', 'rating', 'time'])
ratings_df.head()

Unnamed: 0,user_id,movie_id,rating,time
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Rank movie genres by their average rating

In [10]:
rating_by_genre_df = ratings_df.join(movies_df_exploded, on='movie_id').\
            groupby('genre').agg({'rating': ['mean', 'count']}).sort_values(('rating', 'mean')).reset_index()
rating_by_genre_df.columns = ['_'.join(col).strip() for col in rating_by_genre_df.columns.values]
px.bar(rating_by_genre_df, x='genre_', y='rating_mean', height=300)

It seems that people that are watching noirs are appreciating them the most since it is most rare genre...

## Combine

Check the differences between male and female ratings

In [11]:
combined_ratings_df = pd.merge(pd.merge(movies_df_exploded.rename_axis('movie_id'), ratings_df, on='movie_id'), users_df, on='user_id')
combined_ratings_data = combined_ratings_df.groupby(['genre', 'gender']).agg({'rating': ['mean', 'count']}).reset_index()
combined_ratings_data.columns = [' '.join(col).strip() for col in combined_ratings_data.columns.values]

combined_ratings_data.loc[combined_ratings_data['gender'] == 'F', 'rating count'] /= len(combined_ratings_df[combined_ratings_df['gender'] == 'F'])
combined_ratings_data.loc[combined_ratings_data['gender'] == 'M', 'rating count'] /= len(combined_ratings_df[combined_ratings_df['gender'] == 'M'])

px.bar(combined_ratings_data, x='genre', y='rating count', color='gender', barmode='group')


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '[0.09022863 0.05402254 0.02415518 0.0421337  0.19028258 0.03249812
 0.00383447 0.19400242 0.01723139 0.00830538 0.02892653 0.02669305
 0.01971787 0.09941356 0.05415694 0.07967    0.02785525 0.0068724 ]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.



# Preprocessing


We will get data from the movies using wikipedia API (python package)

In [12]:
def get_wikipedia_page_name(raw_name):
    names = wikipedia.search(raw_name)
    if len(names) == 0:
      return ''
    else:
      return names[0]

def get_movie_plot(page_name):
    try:
      try:
        movie_page_content = str(wikipedia.page(page_name, auto_suggest=False).content)
      except wikipedia.DisambiguationError as e:
        for option in e.options:
          if 'film' in option:
            movie_page_content = str(wikipedia.page(option, auto_suggest=False).content)
        return ''
    except (wikipedia.PageError, KeyError):
      return ''
    re_groups = re.search("Plot ==(.*?)=+ [A-Z]", str(movie_page_content).replace('\n', ''))
    if re_groups:
      return re_groups.group(1)
    else:
      return ''

movies_df['wikipedia_page_name'] = movies_df['movie_name'].progress_apply(lambda name: get_wikipedia_page_name(name))
movies_df['movie_plot'] = movies_df['wikipedia_page_name'].progress_apply(lambda page_name: get_movie_plot(page_name))
print(f'There are {movies_df["movie_plot"].isna().sum()} NaN movie plots')
movies_df[['movie_name', 'movie_plot']].head()

  0%|          | 0/3883 [00:00<?, ?it/s]


KeyboardInterrupt



Around 600 movies didn't have any plot on Wikipedia, this is not a big deal, so we will simply drop them out

In [None]:
movies_df = movies_df[~pd.isna(movies_df['movie_plot'])]
ratings_df = ratings_df[ratings_df['movie_id'].isin(movies_df.index)]

# Recommendation problem

Lets select test user that we will conduct our stdies on and then evaluate them

## Intro

In [15]:
users_df.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,2460
4,5,M,25,writer,55455


In [23]:
users_df.iloc[0,]

user_id                  1
gender                   F
age                      1
occupation    K-12 student
zip_code             48067
Name: 0, dtype: object

We will also compare one movie during all reccomendation experiments and see what are the closest movies to it in embeddings.

Let's conduc two dataset splits: 

1. Standard: used for rating prediction
2. Leave-one-out cross validation split: used for hit-rate prediction, hit-rate is a suitable metric to evaluate effectiveness and accuracy of the system. Take one movie per user out of dataset and then use them as eval dataset. All remaining user's movies will be trainset, then hit-rate will be high each time top recommendated movie will be this one we lefr out.

In [12]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

unique_users = ratings_df['user_id'].unique()
selected_users = np.random.choice(unique_users, size=len(unique_users) // 2, replace=False)
filtered_ratings_df = ratings_df[ratings_df['user_id'].isin(selected_users)]
filtered_data = Dataset.load_from_df(filtered_ratings_df[['user_id', 'movie_id', 'rating']], reader)

trainset, testset = train_test_split(filtered_data, test_size=.25)
LOOCV = LeaveOneOut(n_splits=1, random_state=1)
train_loocv, test_loocv = list(LOOCV.split(filtered_data))[0]

For each method we will calculate:

1. RMSE on test set
2. hit-rate on leave-one-out cross validation set
3. Top predictions for user #1 (case study)
4. Top-10 most similar movies to Toy Story (1995)

## Functions and Tools


In [13]:
from collections import defaultdict

def GetTopN(predictions, n=10, minimumRating=4.0):
  topN = defaultdict(list)

  for userID, movieID, actualRating, estimatedRating, _ in predictions:
    if (estimatedRating >= minimumRating):
        topN[userID].append((movieID, estimatedRating))

  for userID, ratings in topN.items():
    ratings.sort(key=lambda x: x[1], reverse=True)
    topN[userID] = ratings[:n]

  return topN

def HitRate(topNPredicted, leftOutPredictions):
  hits = 0
  total = 0

  # For each left-out rating
  for leftOut in leftOutPredictions:
    userID = leftOut[0]
    leftOutMovieID = leftOut[1]
    # Is it in the predicted top 10 for this user?
    hit = False
    for movieID, predictedRating in topNPredicted[userID]:
      if leftOutMovieID == movieID:
        hit = True
        break
    if (hit) :
      hits += 1

    total += 1

  # Compute overall precision
  return hits/total

def get_hitrate_results(algo, train_loocv, test_loocv):
  algo.fit(train_loocv)
  left_out_predictions = algo.test(test_loocv)
  loocv_anti_testset = train_loocv.build_anti_testset()
  all_predictions = algo.test(loocv_anti_testset)
  top_n_predicted = GetTopN(all_predictions)
  hitrate = HitRate(top_n_predicted, left_out_predictions)
  print(f'HitRate: {hitrate}')
  return all_predictions

def get_algo_results(algo, trainset, testset):
  algo.fit(trainset)
  predictions = algo.test(testset)
  accuracy.rmse(predictions)


def get_most_similar_movies(movies_df, movie_embeddings, trainset, target_movie_id, top_k=10):
    inner_movie_id = trainset.to_inner_iid(target_movie_id)
    sims = cosine_similarity(movie_embeddings, movie_embeddings)
    target_movie_sims_sorted = [trainset.to_raw_iid(x) for x in np.argsort(sims[inner_movie_id])[::-1]]
    most_similar_movies = movies_df.loc[target_movie_sims_sorted].iloc[:top_k]
    return most_similar_movies


def filter_predictions_for_user(predictions, user_id, movies_df, top_k=10):
    top_preds = sorted([pred for pred in predictions if pred.uid == user_id], key=lambda pred: pred.est, reverse=True)[:top_k]
    movie_ids = [pred.iid for pred in top_preds]
    relevant_movies = movies_df.loc[movie_ids]
    relevant_movies['rating'] = [pred.est for pred in top_preds]
    return relevant_movies


def get_algorithm_report(algo_class, trainset, testset, train_loocv, test_loocv, movies_df, target_movie_id=1, target_user_id=1, top_k=10, algo_args=[], algo_kwargs={}, calc_most_similar=True):
    algo_inst = algo_class(*algo_args, **algo_kwargs)
    get_algo_results(algo_inst, trainset, testset)
    algo_inst_for_hitrate = algo_class(*algo_args, **algo_kwargs)
    all_predictions = get_hitrate_results(algo_inst_for_hitrate, train_loocv, test_loocv)
    if calc_most_similar:
        if hasattr(algo_inst_for_hitrate, 'qi'):
            sims = algo_inst_for_hitrate.qi
        else:
            sims = algo_inst_for_hitrate.sim
        most_similar_movies = get_most_similar_movies(movies_df, sims, train_loocv, target_movie_id, top_k=top_k);
        print(f'Most similar movies to {movies_df.loc[target_movie_id].movie_name}:')
        print(tabulate(most_similar_movies.head(top_k)[['movie_name', 'genre']], headers='keys'))
    predictions_for_user = filter_predictions_for_user(all_predictions, target_user_id, movies_df)
    print(f'Top predictions for user {target_user_id}:')
    print(tabulate(predictions_for_user.head(top_k)[['movie_name', 'genre']], headers='keys'))


class SVDWithTqdm(SVD):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def test(self, testset, verbose=False):
        # The ratings are translated back to their original scale.
        predictions = [self.predict(uid,
                                    iid,
                                    r_ui_trans,
                                    verbose=verbose)
                        for (uid, iid, r_ui_trans) in tqdm(testset, desc='making predictions')]
        return predictions


class KNNBasicWithTqdm(KNNBasic):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def test(self, testset, verbose=False):
        # The ratings are translated back to their original scale.
        predictions = [self.predict(uid,
                                    iid,
                                    r_ui_trans,
                                    verbose=verbose)
                        for (uid, iid, r_ui_trans) in tqdm(testset, desc='making predictions')]
        return predictions

Lets obtain baseline for random recommendation system so we can later compaer how much our system is better. 

(Hit rate is a time complex computation process)


In [27]:
normal_predictor = NormalPredictor()
get_algo_results(normal_predictor, trainset, testset)
get_hitrate_results(normal_predictor, train_loocv, test_loocv)    

RMSE: 1.5063
HitRate: 0.015231788079470199


[Prediction(uid=1, iid=1357, r_ui=3.58086904741548, est=4.4901512848370855, details={'was_impossible': False}),
 Prediction(uid=1, iid=3068, r_ui=3.58086904741548, est=3.8008886416455208, details={'was_impossible': False}),
 Prediction(uid=1, iid=1537, r_ui=3.58086904741548, est=4.033431976155324, details={'was_impossible': False}),
 Prediction(uid=1, iid=647, r_ui=3.58086904741548, est=4.708134922900214, details={'was_impossible': False}),
 Prediction(uid=1, iid=2194, r_ui=3.58086904741548, est=3.3609757095729464, details={'was_impossible': False}),
 Prediction(uid=1, iid=648, r_ui=3.58086904741548, est=2.667904255548537, details={'was_impossible': False}),
 Prediction(uid=1, iid=2268, r_ui=3.58086904741548, est=3.0663215394251555, details={'was_impossible': False}),
 Prediction(uid=1, iid=2628, r_ui=3.58086904741548, est=3.5352561958769617, details={'was_impossible': False}),
 Prediction(uid=1, iid=1103, r_ui=3.58086904741548, est=2.306425039892555, details={'was_impossible': False})

## Collaborative filtering for movies recommendation

### KNN

In [14]:
algo_kwargs = dict(k=50, sim_options={'name': 'pearson', 'user_based': True, 'verbose' : True})
get_algorithm_report(KNNBasicWithTqdm, trainset, testset, train_loocv, test_loocv, movies_df,
                     target_movie_id='movie_1', target_user_id='user_1', top_k=10,
                    algo_kwargs=algo_kwargs, calc_most_similar=False)

Computing the pearson similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/123759 [00:00<?, ?it/s]

RMSE: 0.9694
Computing the pearson similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/3020 [00:00<?, ?it/s]

making predictions:   0%|          | 0/10461524 [00:00<?, ?it/s]

HitRate: 0.00033112582781456954
Top predictions for user user_1:
movie_name    genre
------------  -------


The RMSE is better than random but the hit rate is worse. We can also see that the top predictions for user_1 seem even further away than those that were given by the SVD algorithm. In user-based KNN there is no notion of similarity between items, so we won’t calculate the most similar movies to Toy Story (1995) for this algorithm.

In [15]:
algo_kwargs = dict(k=50, sim_options={'name': 'pearson', 'user_based': False, 'verbose' : True})
get_algorithm_report(KNNBasicWithTqdm, trainset, testset, train_loocv, test_loocv, movies_df,
                     target_movie_id='movie_1', target_user_id='user_1', top_k=10,
                     algo_kwargs=algo_kwargs, calc_most_similar=True)


Computing the pearson similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/123759 [00:00<?, ?it/s]

RMSE: 1.0220
Computing the pearson similarity matrix...
Done computing similarity matrix.


making predictions:   0%|          | 0/3020 [00:00<?, ?it/s]

making predictions:   0%|          | 0/10461524 [00:00<?, ?it/s]

HitRate: 0.0


ValueError: Item movie_1 is not part of the trainset.

## Recommending movies with content-based filtering

For the content-based filtering we will use KNN-based algorithms in three approaches (two of them item-based and one user-based):

Movie plots (item-based): Create a vector representation of all of the movies based on the plot descriptions. We will do this by first stemming all of the words in the plot description and then applying TF-IDF to vectorize each document. The similarity matrices we will generate will be based on:
a. Using the complete TF-IDF matrix

b. Using the TF-IDF matrix after feature selection

c. Using the TF-IDF matrix after feature selection and after removing peoples’ names

Movie genres (item-based): **** We will use the movie genres as the only source for recommendations and see how that goes.
User age+gender (user-based): We will use user data as features for our KNN predictor.

In [None]:
from surprise import AlgoBase, KNNBasic
from surprise.prediction_algorithms.knns import SymmetricAlgo

class CustomSimKNNAlgorithm(KNNBasic):
  def __init__(self, sim_options, k=40, min_k=1):
    SymmetricAlgo.__init__(self)
    self.sim_options = sim_options
    self.k = k
    self.min_k = min_k

  def fit(self, trainset, similarities): 
    AlgoBase.fit(self, trainset)
    self.sim = similarities
    ub = self.sim_options['user_based']
    self.n_x = self.trainset.n_users if ub else self.trainset.n_items
    self.n_y = self.trainset.n_items if ub else self.trainset.n_users
    self.xr = self.trainset.ur if ub else self.trainset.ir
    self.yr = self.trainset.ir if ub else self.trainset.ur
  
  def test(self, testset, verbose=False):
    # The ratings are translated back to their original scale.
    predictions = [self.predict(uid,
                                iid,
                                r_ui_trans,
                                verbose=verbose)
                    for (uid, iid, r_ui_trans) in tqdm(testset, desc='making predictions')]
    return predictions

### Movie plots

In [None]:
stemmer = SnowballStemmer('english')
movies_df['movie_plot'] = movies_df['movie_plot'].apply(lambda x:' '.join([stemmer.stem(y) for y in x.split()]))

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_df['movie_plot'])

tfidf_df = pd.DataFrame(
    tfidf_matrix.todense(),
    columns=tfidf.get_feature_names()
)

Now, get the results for approach #1 (using the complete TF-IDF matrix). Note that we need a different cosine similarity matrix for the regular trainset and the leave-one-out trainset which is used to calculate hit-rate because they contain different inner ids for each movie.

In [None]:
item_matrix_all_words_trainset_loocv = get_item_matrix_with_inner_ids(tfidf_matrix.todense(), movies_df, train_loocv)
cosine_sim_all_words_trainset_loocv = cosine_similarity(item_matrix_all_words_trainset_loocv,
                                                        item_matrix_all_words_trainset_loocv)
item_matrix_all_words_trainset = get_item_matrix_with_inner_ids(tfidf_matrix.todense(), movies_df, trainset)
cosine_sim_all_words_trainset = cosine_similarity(item_matrix_all_words_trainset, item_matrix_all_words_trainset)

get_algorithm_report(CustomSimKNNAlgorithm, trainset, testset, train_loocv, test_loocv, movies_df,
                     target_movie_id='movie_1', target_user_id='user_1', top_k=10,
                     algo_kwargs_trainset=dict(similarities=cosine_sim_all_words_trainset, sim_options={'user_based': False}),
                     algo_kwargs_trainset_loocv=dict(similarities=cosine_sim_all_words_trainset_loocv, sim_options={'user_based': False}))


Worse than collaborative filtering, without a doubt. We also see that the similarity matrix doesn’t really generate meaningful relationships between movies (the only children’s movie which is similar to Toy Story is Toy Story 2). Let’s see if reducing the number of features helps.