In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_df = pd.read_csv('movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [4]:
ratings_df = pd.read_csv('ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [6]:
genres_list = 'Adventure|Animation|Children|Comedy|Fantasy'.split('|')
print(genres_list)

['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy']


In [7]:
movies_df['genres'] = movies_df['genres'].str.split('|')
movies_df.head(1)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"


In [8]:
movies_df_encoded = movies_df.copy(deep=True)
genres_cols = []

for index, row in movies_df_encoded.iterrows():
    genres_cols.append(index)
    for g in row['genres']:
        movies_df_encoded.at[index, g] = 1

movies_df_encoded.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1.0,1.0,1.0,1.0,1.0,,,...,,,,,,,,,,
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1.0,,1.0,,1.0,,,...,,,,,,,,,,
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",,,,1.0,,1.0,,...,,,,,,,,,,
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",,,,1.0,,1.0,1.0,...,,,,,,,,,,
4,5,Father of the Bride Part II (1995),[Comedy],,,,1.0,,,,...,,,,,,,,,,


In [9]:
movies_df_encoded.fillna(0, inplace=True)
movies_df_encoded.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),[Comedy],0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
ratings_df.drop('timestamp', axis=1, inplace=True)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
john_doe_ratings = [
    {'title': 'Get Shorty (1995)', 'rating': 4},
    {'title': 'Seven (a.k.a. Se7en) (1995)', 'rating': 5},
    {'title': 'Angels and Insects (1995)', 'rating': 2},
    {'title': 'While You Were Sleeping (1995)', 'rating': 3},
    {'title': 'Celluloid Closet, The (1995)', 'rating': 4},
    {'title': 'Twister (1996)', 'rating': 5},
    {'title': 'Play It Again, Sam (1972)', 'rating': 3},
    {'title': 'Last Samurai, The (2003)', 'rating': 5}
]
john_doe_df = pd.DataFrame(john_doe_ratings)
john_doe_df.head()

Unnamed: 0,title,rating
0,Get Shorty (1995),4
1,Seven (a.k.a. Se7en) (1995),5
2,Angels and Insects (1995),2
3,While You Were Sleeping (1995),3
4,"Celluloid Closet, The (1995)",4


In [12]:
john_doe_ids = movies_df[movies_df['title'].isin(john_doe_df['title'])]
john_doe_ids.head()

Unnamed: 0,movieId,title,genres
20,21,Get Shorty (1995),"[Comedy, Crime, Thriller]"
43,47,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]"
76,85,Angels and Insects (1995),"[Drama, Romance]"
297,339,While You Were Sleeping (1995),"[Comedy, Romance]"
501,581,"Celluloid Closet, The (1995)",[Documentary]


In [13]:
john_doe_df = pd.merge(john_doe_df, john_doe_ids, on='title')
john_doe_df.head()

Unnamed: 0,title,rating,movieId,genres
0,Get Shorty (1995),4,21,"[Comedy, Crime, Thriller]"
1,Seven (a.k.a. Se7en) (1995),5,47,"[Mystery, Thriller]"
2,Angels and Insects (1995),2,85,"[Drama, Romance]"
3,While You Were Sleeping (1995),3,339,"[Comedy, Romance]"
4,"Celluloid Closet, The (1995)",4,581,[Documentary]


In [15]:
john_doe_rating_genres = movies_df_encoded[movies_df_encoded.movieId.isin(john_doe_df.movieId)]
john_doe_rating_genres.head()

Unnamed: 0,movieId,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
20,21,Get Shorty (1995),"[Comedy, Crime, Thriller]",0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,47,Seven (a.k.a. Se7en) (1995),"[Mystery, Thriller]",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,85,Angels and Insects (1995),"[Drama, Romance]",0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,339,While You Were Sleeping (1995),"[Comedy, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501,581,"Celluloid Closet, The (1995)",[Documentary],0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [16]:
john_doe_df.drop(columns=['genres'], inplace=True)
john_doe_df.head()

Unnamed: 0,title,rating,movieId
0,Get Shorty (1995),4,21
1,Seven (a.k.a. Se7en) (1995),5,47
2,Angels and Insects (1995),2,85
3,While You Were Sleeping (1995),3,339
4,"Celluloid Closet, The (1995)",4,581


In [17]:
john_doe_rating_genres.drop(columns=['movieId', 'title','genres'], inplace=True)
john_doe_rating_genres.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  john_doe_rating_genres.drop(columns=['movieId', 'title','genres'], inplace=True)


Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
20,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
76,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
297,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [18]:
john_doe_rating_genres.reset_index(drop=True, inplace=True)
john_doe_rating_genres.head()

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [20]:
print(john_doe_df.shape)
print(john_doe_rating_genres.shape)

(8, 3)
(8, 20)


In [22]:
john_doe_profile = john_doe_rating_genres.T.dot(john_doe_df['rating'])
print(john_doe_profile)

Adventure             10.0
Animation              0.0
Children               0.0
Comedy                10.0
Fantasy                0.0
Romance               13.0
Drama                  7.0
Action                10.0
Crime                  4.0
Thriller              14.0
Horror                 0.0
Mystery                5.0
Sci-Fi                 0.0
War                    5.0
Musical                0.0
Documentary            4.0
IMAX                   0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64


In [23]:
movies_df_encoded = movies_df_encoded.set_index('movieId')
movies_df_encoded.head()

Unnamed: 0_level_0,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Jumanji (1995),"[Adventure, Children, Fantasy]",1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Grumpier Old Men (1995),"[Comedy, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Father of the Bride Part II (1995),[Comedy],0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
movies_df_encoded = movies_df_encoded.drop(columns=['title', 'genres'])
movies_df_encoded.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
print(movies_df_encoded.shape)
print(john_doe_profile.shape)

(9742, 20)
(20,)


In [26]:
john_doe_recommend = movies_df_encoded.dot(john_doe_profile) / john_doe_profile.sum()
john_doe_recommend.head(10)

movieId
1     0.243902
2     0.121951
3     0.280488
4     0.365854
5     0.121951
6     0.341463
7     0.280488
8     0.121951
9     0.121951
10    0.414634
dtype: float64

In [27]:
print(len(john_doe_recommend))

9742


In [28]:
john_doe_recommend.sort_values(ascending=False).head(10)

movieId
4956     0.780488
31367    0.743902
81132    0.731707
459      0.707317
4719     0.707317
380      0.695122
6564     0.695122
26236    0.670732
71999    0.658537
4800     0.658537
dtype: float64

In [29]:
movies_df_reset = movies_df.set_index('movieId', drop=True)
movies_df_reset.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
2,Jumanji (1995),"[Adventure, Children, Fantasy]"
3,Grumpier Old Men (1995),"[Comedy, Romance]"
4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
5,Father of the Bride Part II (1995),[Comedy]


In [31]:
top_recommend_scores = john_doe_recommend.sort_values(ascending=False).head(10)
top_recommend_movies = movies_df_reset.loc[top_recommend_scores.index]

print(top_recommend_movies[['title']])

                                                     title
movieId                                                   
4956                                 Stunt Man, The (1980)
31367                                    Chase, The (1994)
81132                                        Rubber (2010)
459                                    Getaway, The (1994)
4719                                  Osmosis Jones (2001)
380                                       True Lies (1994)
6564     Lara Croft Tomb Raider: The Cradle of Life (2003)
26236    White Sun of the Desert, The (Beloe solntse pu...
71999            Aelita: The Queen of Mars (Aelita) (1924)
4800                           King Solomon's Mines (1937)
