### 1. Import libraries

In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter



### 2. Load datasets 

The datasets we arae about to load is the result of hours of cleaning wich you will find in our data directory in our repo. 

In [2]:
ratings = pd.read_csv('ratings.csv')
movies_df = pd.read_csv('movies_merged.csv')

### 3. Understanding ou dataset 

#### Ratings dataframe

In [3]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [4]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [5]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [6]:
ratings.count()

userId       26024289
movieId      26024289
rating       26024289
timestamp    26024289
dtype: int64

#### Movies dataframe

In [7]:
movies_df.head()

Unnamed: 0,movieId,Title,genres,released,Actors,Director
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995,"Tom Hanks, Tim Allen, Don Rickles, Jim Varney",John Lasseter
1,2,Jumanji,Adventure|Children|Fantasy,1995,"Robin Williams, Jonathan Hyde, Kirsten Dunst, ...",Joe Johnston
2,3,Grumpier Old Men,Comedy|Romance,1995,"Walter Matthau, Jack Lemmon, Sophia Loren, Ann...",Howard Deutch
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995,"Whitney Houston, Angela Bassett, Loretta Devin...",Forest Whitaker
4,5,Father of the Bride Part II,Comedy,1995,"Steve Martin, Diane Keaton, Martin Short, Kimb...",Charles Shyer


In [8]:
movies_df.columns

Index(['movieId', 'Title', 'genres', 'released', 'Actors', 'Director'], dtype='object')

In [9]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31779 entries, 0 to 31778
Data columns (total 6 columns):
movieId     31779 non-null int64
Title       31779 non-null object
genres      31779 non-null object
released    31779 non-null int64
Actors      31779 non-null object
Director    31779 non-null object
dtypes: int64(2), object(4)
memory usage: 1.5+ MB


### 4. Data transformation 

Now that we understood our data, we are going to transform our data in order to archieve efficiency. As data scientists we have to always think ahead and predict what (continue to write)

#### Counting number of ratings per user 

In [10]:
user_ratings_grp = pd.DataFrame(ratings.groupby('userId')['rating'].count())
user_ratings_grp = user_ratings_grp.rename(columns = {'rating': 'rating_ct'})
user_ratings_grp.reset_index(inplace=True)
user_ratings_grp.head()

Unnamed: 0,userId,rating_ct
0,1,27
1,2,22
2,3,10
3,4,62
4,5,26


#### Find average of ratings per user 

In [11]:
avg_nratings = user_ratings_grp['rating_ct'].mean()
avg_nratings

96.06745393065974

#### Qualifying user's rating by setting a cut line of 20 of the mean 

In [12]:
top_user_ratings = user_ratings_grp[(user_ratings_grp['rating_ct'] >= 85) & (user_ratings_grp['rating_ct'] <= 105)]
top_user_ratings.shape


(11910, 2)

In [13]:
top_user_ratings.head()

Unnamed: 0,userId,rating_ct
59,60,105
75,76,105
88,89,100
106,107,87
114,115,105


#### Joining newly created table to our original ratings table (top_user_ratings + ratings)

In [14]:
new_ratings = pd.merge(top_user_ratings, ratings, how = 'inner', on = 'userId')
new_ratings.head()

Unnamed: 0,userId,rating_ct,movieId,rating,timestamp
0,60,105,112,2.5,1136304271
1,60,105,153,2.5,1136306947
2,60,105,163,4.0,1136304313
3,60,105,165,4.0,1136306968
4,60,105,168,0.5,1136304366


#### Dropping timestamp column 

In [15]:
new_ratings = new_ratings.drop(columns=['timestamp'])
new_ratings.head()

Unnamed: 0,userId,rating_ct,movieId,rating
0,60,105,112,2.5
1,60,105,153,2.5
2,60,105,163,4.0
3,60,105,165,4.0
4,60,105,168,0.5


In [16]:
new_ratings.shape

(1133047, 4)

In [17]:
new_ratings.count()

userId       1133047
rating_ct    1133047
movieId      1133047
rating       1133047
dtype: int64

In [18]:
new_ratings.to_csv('new_ratings.csv', index=False)


#### Removing spaces from actors and director names

In [19]:
def clean_data(x):
    if isinstance(x, list):
        return[str.lower(i.replace(" ", "")) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [20]:
credits = ['Actors', 'Director']

for credit in credits:
    movies_df[credit] = movies_df[credit].apply(clean_data)

In [21]:
# remove | from genres

movies_df['genres'] = movies_df['genres'].str.replace("|", " ")

In [22]:
movies_df.head()

Unnamed: 0,movieId,Title,genres,released,Actors,Director
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,"tomhanks,timallen,donrickles,jimvarney",johnlasseter
1,2,Jumanji,Adventure Children Fantasy,1995,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston
2,3,Grumpier Old Men,Comedy Romance,1995,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch
3,4,Waiting to Exhale,Comedy Drama Romance,1995,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker
4,5,Father of the Bride Part II,Comedy,1995,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer


#### Combining data 

In [23]:
def create_soup(x):
    return  x['Actors'].replace(',',' ') + ' ' + x['Director'] + ' ' + ' '.join(x['genres'])


In [24]:
movies_df['bag_of_words'] = movies_df.apply(create_soup, axis=1)

In [25]:
movies_df.head()

Unnamed: 0,movieId,Title,genres,released,Actors,Director,bag_of_words
0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,"tomhanks,timallen,donrickles,jimvarney",johnlasseter,tomhanks timallen donrickles jimvarney johnlas...
1,2,Jumanji,Adventure Children Fantasy,1995,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston,robinwilliams jonathanhyde kirstendunst bradle...
2,3,Grumpier Old Men,Comedy Romance,1995,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch,waltermatthau jacklemmon sophialoren ann-margr...
3,4,Waiting to Exhale,Comedy Drama Romance,1995,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker,whitneyhouston angelabassett lorettadevine lel...
4,5,Father of the Bride Part II,Comedy,1995,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer,stevemartin dianekeaton martinshort kimberlywi...


In [26]:
movies_bow = movies_df[['Title', 'movieId', 'bag_of_words']]
movies_bow.to_csv('movies_bow.csv', index=False)
movies_bow.head()

Unnamed: 0,Title,movieId,bag_of_words
0,Toy Story,1,tomhanks timallen donrickles jimvarney johnlas...
1,Jumanji,2,robinwilliams jonathanhyde kirstendunst bradle...
2,Grumpier Old Men,3,waltermatthau jacklemmon sophialoren ann-margr...
3,Waiting to Exhale,4,whitneyhouston angelabassett lorettadevine lel...
4,Father of the Bride Part II,5,stevemartin dianekeaton martinshort kimberlywi...


In [27]:
movies_bow = pd.read_csv('movies_bow.csv')
movies_bow.head()

Unnamed: 0,Title,movieId,bag_of_words
0,Toy Story,1,tomhanks timallen donrickles jimvarney johnlas...
1,Jumanji,2,robinwilliams jonathanhyde kirstendunst bradle...
2,Grumpier Old Men,3,waltermatthau jacklemmon sophialoren ann-margr...
3,Waiting to Exhale,4,whitneyhouston angelabassett lorettadevine lel...
4,Father of the Bride Part II,5,stevemartin dianekeaton martinshort kimberlywi...


### 5.1. Building a recommender system 
#### Importing sklearn moules

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#### Initializing and generating the count matrix 

In [29]:
count = CountVectorizer()
count_matrix = count.fit_transform(movies_bow['bag_of_words'])

#### Generating the cosine similarity matrix

In [30]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)


In [31]:
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])

def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:26]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_bow['Title'].iloc[movie_indices]
                        

In [32]:
get_recommendations('Rumble in the Bronx')

25690                   Kung Fu Yoga
5236                       Who Am I?
12179                           1911
4345                     Crime Story
23732         Jackie Chan: My Stunts
924                       Supercop 2
1532                       Mr. Magoo
1833                       Rush Hour
2064                      Black Mask
2551                   Shanghai Noon
3034                     Rush Hour 2
4826     Around the World in 80 Days
5123                     City Hunter
5948              China Strike Force
6611                     Thunderbolt
7172                     Rush Hour 3
7638                   July Rhapsody
17276                Personal Tailor
18289                   Dragon Blade
24323                      Skiptrace
24836                Railroad Tigers
27902                  The Foreigner
29297         Earth: One Amazing Day
1675                    Mr. Nice Guy
3791                Shanghai Knights
Name: Title, dtype: object

### 5.2. CF based recommender system 

In [33]:
reader = Reader()

In [34]:
data = Dataset.load_from_df(new_ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

In [35]:
svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])



Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 0.8737
MAE:  0.6675
------------
Fold 2
RMSE: 0.8761
MAE:  0.6693
------------
Fold 3
RMSE: 0.8755
MAE:  0.6680
------------
Fold 4
RMSE: 0.8724
MAE:  0.6656
------------
Fold 5
RMSE: 0.8765
MAE:  0.6687
------------
------------
Mean RMSE: 0.8748
Mean MAE : 0.6678
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [0.8736678410779543,
                             0.876106547309535,
                             0.8755342608804385,
                             0.8723684660765281,
                             0.8765228682862168],
                            'mae': [0.6674660669470335,
                             0.6693259159372866,
                             0.6679865479362211,
                             0.6655883588593249,
                             0.6687269780677663]})

In [36]:
trainset = data.build_full_trainset()
svd.train(trainset)



<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a1f85fef0>

In [37]:
new_ratings[new_ratings['userId'] == 163904]

Unnamed: 0,userId,rating_ct,movieId,rating
687663,163904,105,47,5.0
687664,163904,105,50,4.5
687665,163904,105,111,4.5
687666,163904,105,260,4.0
687667,163904,105,296,5.0
687668,163904,105,527,4.5
687669,163904,105,541,3.0
687670,163904,105,593,4.5
687671,163904,105,608,5.0
687672,163904,105,750,5.0


In [40]:
svd.predict(163904, 302)

Prediction(uid=163904, iid=302, r_ui=None, est=4.104373499792094, details={'was_impossible': False})

In [40]:
mvd = pd.merge(new_ratings, movies_df, on='movieId')
mvd.head()

Unnamed: 0,userId,rating_ct,movieId,rating,Title,genres,released,Actors,Director,bag_of_words
0,60,105,112,2.5,Rumble in the Bronx,Action Adventure Comedy Crime,1995,"jackiechan,anitamui,françoiseyip,billtung",stanleytong,jackiechan anitamui françoiseyip billtung stan...
1,123,99,112,3.0,Rumble in the Bronx,Action Adventure Comedy Crime,1995,"jackiechan,anitamui,françoiseyip,billtung",stanleytong,jackiechan anitamui françoiseyip billtung stan...
2,131,87,112,2.0,Rumble in the Bronx,Action Adventure Comedy Crime,1995,"jackiechan,anitamui,françoiseyip,billtung",stanleytong,jackiechan anitamui françoiseyip billtung stan...
3,595,91,112,5.0,Rumble in the Bronx,Action Adventure Comedy Crime,1995,"jackiechan,anitamui,françoiseyip,billtung",stanleytong,jackiechan anitamui françoiseyip billtung stan...
4,627,85,112,3.0,Rumble in the Bronx,Action Adventure Comedy Crime,1995,"jackiechan,anitamui,françoiseyip,billtung",stanleytong,jackiechan anitamui françoiseyip billtung stan...


In [41]:
mvd.shape

(774593, 10)

In [42]:
mvd.count()

userId          774593
rating_ct       774593
movieId         774593
rating          774593
Title           774593
genres          774593
released        774593
Actors          774593
Director        774593
bag_of_words    774593
dtype: int64

In [43]:
movies_map = movies_df.copy()
movies_map.reset_index(inplace=True)
movies_map.head()

Unnamed: 0,index,movieId,Title,genres,released,Actors,Director,bag_of_words
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,"tomhanks,timallen,donrickles,jimvarney",johnlasseter,tomhanks timallen donrickles jimvarney johnlas...
1,1,2,Jumanji,Adventure Children Fantasy,1995,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston,robinwilliams jonathanhyde kirstendunst bradle...
2,2,3,Grumpier Old Men,Comedy Romance,1995,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch,waltermatthau jacklemmon sophialoren ann-margr...
3,3,4,Waiting to Exhale,Comedy Drama Romance,1995,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker,whitneyhouston angelabassett lorettadevine lel...
4,4,5,Father of the Bride Part II,Comedy,1995,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer,stevemartin dianekeaton martinshort kimberlywi...


In [44]:
movies_map.shape

(31779, 8)

In [45]:
movies_map = movies_map.rename(columns={'index': 'id'})
movies_map.head()

Unnamed: 0,id,movieId,Title,genres,released,Actors,Director,bag_of_words
0,0,1,Toy Story,Adventure Animation Children Comedy Fantasy,1995,"tomhanks,timallen,donrickles,jimvarney",johnlasseter,tomhanks timallen donrickles jimvarney johnlas...
1,1,2,Jumanji,Adventure Children Fantasy,1995,"robinwilliams,jonathanhyde,kirstendunst,bradle...",joejohnston,robinwilliams jonathanhyde kirstendunst bradle...
2,2,3,Grumpier Old Men,Comedy Romance,1995,"waltermatthau,jacklemmon,sophialoren,ann-margret",howarddeutch,waltermatthau jacklemmon sophialoren ann-margr...
3,3,4,Waiting to Exhale,Comedy Drama Romance,1995,"whitneyhouston,angelabassett,lorettadevine,lel...",forestwhitaker,whitneyhouston angelabassett lorettadevine lel...
4,4,5,Father of the Bride Part II,Comedy,1995,"stevemartin,dianekeaton,martinshort,kimberlywi...",charlesshyer,stevemartin dianekeaton martinshort kimberlywi...


### Hybrid recommendation system 

In [46]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [47]:
indices = pd.Series(movies_bow.index, index = movies_bow['Title'])


In [48]:
def hybrid(userId, title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[int(idx)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    movies = movies_map.iloc[movie_indices][['Title', 'id']]
    movies['est'] = movies['id'].apply(lambda x: svd.predict(userId, movies_map.loc[x]['movieId']).est)
    movies = movies.sort_values('est', ascending=False)
    top10 = movies['Title'][:11]
    return top10

In [49]:
hybrid(1, 'Rumble in the Bronx')

2064                 Black Mask
18289              Dragon Blade
5236                  Who Am I?
7638              July Rhapsody
12179                      1911
4345                Crime Story
23732    Jackie Chan: My Stunts
29297    Earth: One Amazing Day
27902             The Foreigner
24836           Railroad Tigers
17276           Personal Tailor
Name: Title, dtype: object