In [749]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from thefuzz import process

from sklearn.impute import KNNImputer 
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

In [501]:
# Load the user-submitted ratings and the film catalogue. In both files the
# first CSV column is used as the DataFrame index.
ratings, movies = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./data/ml-latest-small/ratings.csv',
                     './data/ml-latest-small/movies.csv')
)

#### Data wrangling

In [541]:
# Peek at one randomly chosen row of the ratings table
ratings.sample()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
545,,,,,,,,,,,...,,,,,,,,,,


In [542]:
# Peek at one randomly chosen row of the movies table
movies.sample()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
165483,Joe Rogan: Triggered (2016),Comedy


In [503]:
# Move the index (loaded from the first CSV column) back into a regular
# column so it survives the merge below. Rebinding instead of inplace=True
# keeps the step a plain expression with no hidden mutation.
ratings = ratings.reset_index()

In [504]:
# Attach title/genres to every rating by joining on movieId (default inner
# join). movies carries movieId as its index because it was loaded with
# index_col=0; pandas resolves `on` against index level names as well.
ratings = pd.merge(ratings, movies, on='movieId')

In [505]:
# Reshape to a user x film matrix: one row per userId, one column per title,
# cells hold the rating (NaN where the user did not rate the film).
ratings = ratings.pivot_table(index='userId', columns='title', values='rating')

In [747]:
# Restrict the pool to films with at least 20 non-missing ratings
# (count() tallies non-NaN cells per column).
ratings_20 = ratings.loc[:, ratings.count().ge(20)]

In [748]:
# Compare shapes before/after the >=20-ratings filter; the recorded output
# shows the movie pool shrinking from 9719 to 1297 columns
ratings.shape, ratings_20.shape

((610, 9719), (610, 1297))

### Baseline recommender - Most popular films

In [745]:
# Baseline: the 10 films with the highest mean rating among films that have
# at least 20 ratings, as a {title: mean} dict.
# NOTE(review): mean() skips NaN, so this cell must run before ratings_20 is
# zero-filled further down; the recorded output (max ~2.3) looks like it was
# produced after zero-filling, when unrated films counted as 0.
recommended_films = dict(
    ratings_20
    .mean()
    .round(2)
    .sort_values(ascending=False)
    .head(10)
)

In [746]:
# Show the baseline top-10 as {title: mean rating}
recommended_films

{'Shawshank Redemption, The (1994)': 2.3,
 'Forrest Gump (1994)': 2.24,
 'Pulp Fiction (1994)': 2.11,
 'Matrix, The (1999)': 1.91,
 'Silence of the Lambs, The (1991)': 1.9,
 'Star Wars: Episode IV - A New Hope (1977)': 1.74,
 'Braveheart (1995)': 1.56,
 'Fight Club (1999)': 1.52,
 "Schindler's List (1993)": 1.52,
 'Jurassic Park (1993)': 1.46}

### Recommender #1 - Cosine similarity

#### Input user query

In [521]:
# CHECK - fuzzy-match an imprecise query against the known movie titles.
# process.extract returns a list of (title, score) candidates, as shown in
# the recorded output below.
process.extract(query='Toy stry', choices=ratings.columns)
# Alternative kept for reference: take only the single best-matching title
#process.extractOne(query='Toy stry', choices=ratings.columns)[0]

[('Toy Soldiers (1991)', 86),
 ('Toy Story (1995)', 86),
 ('Toy Story 2 (1999)', 86),
 ('Toy Story 3 (2010)', 86),
 ('Toy, The (1982)', 86)]

In [520]:
# The user's favourite films; every one is implicitly given the top rating 5
query = dict.fromkeys(
    ['Aladdin (1992)', 'Twister (1996)', 'Up (2009)', 'Apocalypse Now (1979)'],
    5,
)

In [522]:
# Turn the favourites dict into a one-row frame labelled 'query' whose columns
# match ratings_20. Titles absent from the dict come out as NaN, and dict keys
# that are not in ratings_20.columns are left out of the frame.
# NOTE: this rebinds `query` from a dict to a DataFrame.
query = pd.DataFrame(query, columns=ratings_20.columns, index=['query'])

In [523]:
# Append the query row to the ratings table. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used instead. Any 'query' row left over from a
# previous run of this cell is dropped first, so re-running the cell does not
# stack duplicate query rows.
ratings_20 = pd.concat([ratings_20.drop(index='query', errors='ignore'), query])
# Unrated films become 0 so that cosine similarity can be computed (no NaN)
ratings_20 = ratings_20.fillna(0)

In [543]:
# Show the last two rows to confirm the 'query' row was appended
ratings_20.tail(2)

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),13 Going on 30 (2004),"13th Warrior, The (1999)",1408 (2007),2001: A Space Odyssey (1968),2012 (2009),...,Young Frankenstein (1974),Young Guns (1988),Zack and Miri Make a Porno (2008),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016),eXistenZ (1999),xXx (2002),¡Three Amigos! (1986)
610,3.5,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.5,3.0,...,0.0,0.0,3.5,5.0,3.5,4.0,4.0,0.0,2.0,0.0
query,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [525]:
# User-user cosine similarity for neighbourhood collaborative filtering.
# Rows of ratings_20 are users (plus the query row), so the result is a
# square matrix labelled by user on both axes.
cos_sim = pd.DataFrame(
    data=cosine_similarity(ratings_20),
    index=ratings_20.index,
    columns=ratings_20.index,
)

In [750]:
# Inspect the last rows of the similarity matrix, including the 'query' row
cos_sim.tail()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,602,603,604,605,606,607,608,609,610,query
607,0.321129,0.017039,0.041512,0.190457,0.185104,0.212785,0.229597,0.215017,0.017139,0.015923,...,0.247331,0.340445,0.156332,0.176618,0.253751,1.0,0.340524,0.210129,0.231955,0.054348
608,0.358892,0.06017,0.045089,0.213545,0.162385,0.254013,0.390761,0.21515,0.135898,0.116717,...,0.24137,0.3901,0.195952,0.248642,0.438898,0.340524,1.0,0.168918,0.481534,0.054118
609,0.127343,0.037881,0.0,0.032404,0.330324,0.341099,0.117024,0.514204,0.0,0.034631,...,0.445307,0.091049,0.30724,0.151563,0.115167,0.210129,0.168918,1.0,0.094918,0.0
610,0.23202,0.170663,0.062651,0.179662,0.093203,0.09602,0.291709,0.11492,0.113121,0.21788,...,0.138098,0.299828,0.081049,0.216617,0.395595,0.231955,0.481534,0.094918,1.0,0.045736
query,0.058092,0.0,0.0,0.044699,0.083478,0.103033,0.086367,0.0,0.0,0.127386,...,0.0,0.034253,0.04859,0.045497,0.027762,0.054348,0.054118,0.0,0.045736,1.0


In [527]:
# Titles the query user has not rated (stored as 0 in the 'query' row)
unseen_movies = [
    title
    for title, score in ratings_20.loc['query'].items()
    if score == 0
]

In [528]:
# List the 5 users most similar to the query user. The query's own entry is
# removed by label instead of by skipping position 0, so the result stays
# correct even if the self-similarity does not happen to sort first.
neighbors_top5 = list(
    cos_sim['query']
    .drop(labels='query')
    .sort_values(ascending=False)
    .index[:5]
)
neighbors_top5

[538, 31, 544, 243, 344]

In [529]:
# Predict the query user's rating for every unseen film as the
# similarity-weighted average of the ratings given by the nearest neighbours:
#     sum(rating * similarity) / sum(similarity)
# Only neighbours who actually rated the film contribute.
predicted_ratings_movies = []

for movie in unseen_movies:
    num = 0.0
    den = 0.0
    # Iterate over the neighbour list built in the previous cell
    # (the original referenced an undefined name `neighbors`).
    for user in neighbors_top5:
        rating = ratings_20.loc[user, movie]
        # A stored 0 means "not rated" (missing values were zero-filled)
        if rating > 0:
            similarity = cos_sim.loc['query', user]
            num += rating * similarity
            den += similarity

    # No neighbour rated the film, or their similarities sum to zero:
    # fall back to 0 explicitly instead of relying on a caught exception
    # (numpy floats would yield NaN rather than raise).
    predicted_ratings = num / den if den > 0 else 0

    predicted_ratings_movies.append([predicted_ratings, movie])

In [None]:
# CHECK - alternative to dividing by zero issue - change values in cos_sim matrix so that zeros are 0.001 for example

In [544]:
# Tabulate the [predicted rating, title] pairs and show the 10 highest
recs = pd.DataFrame(predicted_ratings_movies, columns=['rating', 'movie'])
recs.sort_values('rating', ascending=False).head(10)

Unnamed: 0,rating,movie
393,5.0,Fantasia (1940)
1041,5.0,Sleepless in Seattle (1993)
61,5.0,"American President, The (1995)"
1023,5.0,Shrek 2 (2004)
1022,5.0,Shrek (2001)
1021,5.0,Showgirls (1995)
102,5.0,Back to the Future Part III (1990)
800,5.0,Mystery Science Theater 3000: The Movie (1996)
1013,5.0,"Shawshank Redemption, The (1994)"
343,5.0,Dr. Horrible's Sing-Along Blog (2008)


### Recommender #2 - Non-negative matrix factorization (NMF)

#### Create matrix

"Model assumes  𝑅∼𝑃𝑄  where  𝑄  is a matrix that has every movie classified according to components and  𝑃  shows the user preferences for these components."

In [655]:
# Build a fresh ratings frame restricted to films with at least 30
# non-missing ratings (taken from the un-filled `ratings` pivot).
ratings_30 = ratings.loc[:, ratings.count().ge(30)]
# Alternative considered: column-mean imputation instead of KNN
#ratings_30 = ratings_30.fillna(ratings_30.mean())

In [656]:
# Fill the missing ratings with a KNN imputer (2 nearest neighbours), then
# re-wrap the returned array with the original user/title labels.
imputer = KNNImputer(n_neighbors=2)
ratings_30 = pd.DataFrame(
    data=imputer.fit_transform(ratings_30),
    index=ratings_30.index,
    columns=ratings_30.columns,
)

In [657]:
# Inspect the first three users of the imputed ratings matrix
ratings_30.head(3)

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.5,5.0,4.5,2.75,4.75,5.0,4.5,3.75,4.5,4.0,...,2.25,3.25,3.5,3.75,3.0,5.0,4.75,4.0,4.75,4.5
2,4.5,3.25,3.25,3.5,4.25,3.5,2.75,3.5,2.75,3.75,...,4.0,3.25,3.25,3.75,3.5,4.0,3.5,3.0,3.0,4.0
3,2.25,4.5,2.0,3.0,2.25,3.75,3.75,3.25,2.75,2.0,...,4.25,3.5,5.0,3.5,0.5,3.5,2.5,4.0,4.0,4.5


In [658]:
# Configure the factorization: n_components is the number of hidden (latent)
# features; random_state pins the random initialisation for reproducibility.
nmf = NMF(
    n_components=30,
    init='random',
    random_state=10,
)

In [659]:
# Fit the factorization on the imputed ratings; the learned Q matrix is then
# available as nmf.components_
nmf.fit(ratings_30)



NMF(init='random', n_components=30, random_state=10)

In [660]:
# Get Q as a raw array (one row per latent feature, one column per movie)
nmf.components_

array([[0.41622506, 0.01022921, 0.19435719, ..., 0.        , 0.03552236,
        0.33391917],
       [0.14070192, 0.26989408, 0.14777445, ..., 0.17969762, 0.17440408,
        0.2777335 ],
       [0.41588616, 0.24550968, 0.08649841, ..., 0.48442134, 0.366476  ,
        0.23894951],
       ...,
       [0.3632064 , 0.        , 0.46605304, ..., 0.26222289, 0.02712847,
        0.46924914],
       [0.73901066, 0.72050502, 0.47120536, ..., 0.01309514, 0.59264228,
        0.29948758],
       [0.30392739, 0.76692869, 0.50312252, ..., 0.51448359, 0.19005063,
        0.13693872]])

In [661]:
# Get movie weights per latent feature (Q): one row per feature, one column
# per movie. The row labels f1..fN are derived from the fitted model, so they
# stay in sync if n_components is changed.
Q = pd.DataFrame(nmf.components_,
                 columns=ratings_30.columns,
                 index=[f'f{k}' for k in range(1, len(nmf.components_) + 1)])
Q.head(3)

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
f1,0.416225,0.010229,0.194357,0.067587,0.291561,0.240221,0.160462,0.114907,0.01163,0.360524,...,0.0,0.133768,0.146827,0.168938,0.197211,0.365769,0.226067,0.0,0.035522,0.333919
f2,0.140702,0.269894,0.147774,0.286795,0.038072,0.0,0.02884,0.117852,0.021485,0.22906,...,0.0,0.347368,0.220798,0.131113,0.285063,0.124013,0.0,0.179698,0.174404,0.277733
f3,0.415886,0.24551,0.086498,0.298536,0.413838,0.006045,0.402667,0.24306,0.601103,0.371642,...,0.873933,0.38996,0.361512,0.500293,0.0,0.640914,0.666441,0.484421,0.366476,0.23895


#### Create user-feature matrix (P)

In [662]:
# Get P - each user's weight for every latent NMF feature, derived from their
# (imputed) movie ratings. These features are learned components, not the
# MovieLens genres. Column labels f1..fN are derived from the fitted model so
# they stay in sync if n_components is changed.
P = pd.DataFrame(nmf.transform(ratings_30),
                 columns=[f'f{k}' for k in range(1, len(nmf.components_) + 1)],
                 index=ratings_30.index)
P.head(3)



Unnamed: 0_level_0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f21,f22,f23,f24,f25,f26,f27,f28,f29,f30
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.164705,1.179415,1.179321,0.0,0.392548,0.18167,0.163804,0.06819,0.527197,0.965633,...,0.0,0.048259,0.725243,0.594515,0.0,0.359766,0.082881,0.365485,0.509693,0.0
2,0.973909,0.905709,0.705719,0.892225,0.134915,0.049371,0.477994,0.538322,0.226188,0.449396,...,0.545857,0.366758,0.226646,0.583,0.453794,0.264078,0.304257,0.0,0.226778,0.0
3,0.0,0.0,0.61989,0.755967,0.383858,0.343631,0.0,0.536926,0.0,1.206645,...,0.440148,1.278765,0.736956,0.0,0.0,0.648057,1.036309,0.0,0.0,0.0


#### Create reconstructed matrix R

In [663]:
# Reconstruct the ratings matrix as the product P . Q and label it with the
# same users (rows) and titles (columns) as ratings_30.
recommendations_reconstructed = pd.DataFrame(
    data=np.dot(P.to_numpy(), Q.to_numpy()),
    index=ratings_30.index,
    columns=ratings_30.columns,
)

In [664]:
# Inspect the reconstructed ratings for the first three users
recommendations_reconstructed.head(3)

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.518453,4.327383,3.648536,3.91138,4.425452,4.77009,4.458027,3.806929,3.906494,4.223837,...,3.720128,3.461846,4.097356,4.330515,3.812573,4.550309,4.329197,3.241258,4.069784,4.604912
2,3.962784,3.659666,2.998038,3.375015,3.955055,3.833246,3.790746,3.324752,3.170908,3.907534,...,3.797196,3.645805,3.853511,3.587495,3.443462,3.742861,3.972858,3.229494,3.402846,3.78137
3,2.846602,3.628741,2.370867,3.377242,3.283612,3.848126,3.403793,3.332322,2.57242,2.739323,...,3.682794,3.594785,4.572638,3.449835,1.521187,3.50494,3.151594,3.939838,3.474495,3.802648


In [665]:
# Element-wise absolute gap between the imputed ratings and their
# reconstruction - the factorization aims to keep this small
(ratings_30 - recommendations_reconstructed).abs().round(4).head(3)

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0185,0.6726,0.8515,1.1614,0.3245,0.2299,0.042,0.0569,0.5935,0.2238,...,1.4701,0.2118,0.5974,0.5805,0.8126,0.4497,0.4208,0.7587,0.6802,0.1049
2,0.5372,0.4097,0.252,0.125,0.2949,0.3332,1.0407,0.1752,0.4209,0.1575,...,0.2028,0.3958,0.6035,0.1625,0.0565,0.2571,0.4729,0.2295,0.4028,0.2186
3,0.5966,0.8713,0.3709,0.3772,1.0336,0.0981,0.3462,0.0823,0.1776,0.7393,...,0.5672,0.0948,0.4274,0.0502,1.0212,0.0049,0.6516,0.0602,0.5255,0.6974


In [666]:
# Get the overall reconstruction error reported by the fitted model
# (per the scikit-learn docs, the Frobenius norm of X - WH for the default loss)
nmf.reconstruction_err_

395.44097363725353

#### Get user query

In [700]:
# Explicit ratings supplied by the user, as title -> score
query_1 = dict([
    ('Aladdin (1992)', 2),
    ('Twister (1996)', 3),
    ('Up (2009)', 4.5),
    ('Apocalypse Now (1979)', 5),
])

In [701]:
# Expand the user's ratings into a one-row frame labelled 'query' with the
# same columns as ratings_30; films the user did not rate are filled with 0.
# NOTE: this rebinds `query_1` from a dict to a DataFrame.
query_1 = pd.DataFrame(query_1, 
                         columns=ratings_30.columns, 
                         index=['query']).fillna(0)

In [702]:
# Display the one-row query frame
query_1

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
query,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [706]:
# Keep only the columns (films) for which the user supplied a rating, i.e.
# whose column total in the one-row query frame is positive
watched_filter = query_1.loc[:, query_1.sum().gt(0)]

In [708]:
# Project the query user onto the latent features learned by the fitted NMF
# NOTE(review): unrated films are passed as 0 here, whereas the model was
# fitted on KNN-imputed ratings; this presumably explains the very small
# weights and predicted scores in the recorded outputs - TODO confirm.
P_query = nmf.transform(query_1)

In [709]:
# Display the query user's latent-feature weights
P_query

array([[0.        , 0.        , 0.        , 0.        , 0.01282094,
        0.        , 0.        , 0.        , 0.00712388, 0.        ,
        0.        , 0.        , 0.        , 0.020343  , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.00065098,
        0.        , 0.03298579, 0.        , 0.        , 0.        ]])

In [710]:
# Predicted scores for the query user: feature weights (P_query) times the
# movie-feature matrix (Q)
R_query = np.dot(P_query, Q)

In [711]:
# Label the predicted scores: a single 'query' row, one column per title
recommendations_query = pd.DataFrame(
    data=R_query,
    index=['query'],
    columns=ratings_30.columns,
)

In [712]:
# Display the predicted scores for the query user
recommendations_query

title,(500) Days of Summer (2009),10 Things I Hate About You (1999),101 Dalmatians (1996),101 Dalmatians (One Hundred and One Dalmatians) (1961),12 Angry Men (1957),2001: A Space Odyssey (1968),28 Days Later (2002),300 (2007),"40-Year-Old Virgin, The (2005)",50 First Dates (2004),...,X-Men: First Class (2011),X-Men: The Last Stand (2006),X2: X-Men United (2003),Yes Man (2008),You've Got Mail (1998),Young Frankenstein (1974),Zodiac (2007),Zombieland (2009),Zoolander (2001),Zootopia (2016)
query,0.040718,0.04041,0.023324,0.028092,0.010785,0.036549,0.041932,0.025376,0.015835,0.025707,...,0.01055,0.021598,0.026236,0.021504,0.015755,0.002228,0.050322,0.01714,0.007642,0.02632


In [740]:
# Remove the films the user already rated so only unseen titles remain
recs_unseen = recommendations_query.drop(columns=watched_filter.columns.tolist())

In [741]:
# Top 10 unseen films by predicted score, as a {title: score} dict.
# NOTE: this rebinds `recommended_films`, replacing the baseline result.
recommended_films = dict(
    recs_unseen
    .sum()
    .round(2)
    .sort_values(ascending=False)
    .head(10)
)

In [742]:
# Show the NMF-based top-10 as {title: predicted score}
recommended_films

{'Godfather, The (1972)': 0.07,
 'Shrek (2001)': 0.07,
 'Star Trek (2009)': 0.06,
 'Slumdog Millionaire (2008)': 0.06,
 'Big Fish (2003)': 0.06,
 'Harry Potter and the Chamber of Secrets (2002)': 0.06,
 'Lord of the Rings: The Fellowship of the Ring, The (2001)': 0.06,
 'Apollo 13 (1995)': 0.05,
 'Pirates of the Caribbean: The Curse of the Black Pearl (2003)': 0.05,
 'Silence of the Lambs, The (1991)': 0.05}

#### One-hot-encode movie genres

In [381]:
def ohe_genres(movies):
    """
    One-hot-encode the 'genres' column of the MovieLens 'movies' DataFrame.

    Parameters
    ----------
    movies : pd.DataFrame
        Must contain a 'title' column and a 'genres' column whose values are
        '|'-separated genre names (e.g. 'Adventure|Children|Fantasy').
        The frame is not modified.

    Returns
    -------
    pd.DataFrame
        The former index of `movies` as a regular column (via reset_index),
        then 'title', then one 0/1 column per genre in alphabetical order.
        Films labelled '(no genres listed)' get 0 in every genre column.
    """
    # Split on the literal '|' and build the indicator columns in one step.
    # This avoids the row-wise chained assignment into the caller's frame and
    # the whitespace split that broke '(no genres listed)' into three tokens.
    genre_flags = movies['genres'].str.get_dummies(sep='|')

    # The placeholder is not a real genre; tolerate data where it never occurs
    genre_flags = genre_flags.drop(columns=['(no genres listed)'], errors='ignore')

    # Index-aligned join keeps one row per film; then expose the index as a column
    return movies[['title']].join(genre_flags).reset_index()

In [382]:
# One-hot-encode the genres of every film in `movies`.
# NOTE(review): this rebinds the name `ohe_genres` from the function to its
# result, so re-running this cell without first re-running the definition
# cell fails because the name no longer refers to a callable.
ohe_genres = ohe_genres(movies)

In [535]:
# Inspect the first rows of the one-hot-encoded genre table
ohe_genres.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
