In [48]:
import pandas as pd
import numpy as np


In [49]:
user_ratings_df = pd.read_csv('user_ratings.csv')
user_ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [50]:
# Get the counts of occurrences of each movie title
movie_popularity = user_ratings_df["title"].value_counts()

# Inspect the most common values
print(movie_popularity.head().index)

Index(['Forrest Gump (1994)', 'Shawshank Redemption, The (1994)',
       'Pulp Fiction (1994)', 'Silence of the Lambs, The (1991)',
       'Matrix, The (1999)'],
      dtype='object')


Improved non-personalized recommendations

In [51]:
# Find the mean of the ratings given to each title
average_rating_df = user_ratings_df[["title", "rating"]].groupby('title').mean()

# Order the entries by highest average rating to lowest
sorted_average_ratings = average_rating_df.sort_values(by='rating', ascending=False)

# Inspect the top movies
print(sorted_average_ratings.head())

                                     rating
title                                      
Gena the Crocodile (1969)               5.0
True Stories (1986)                     5.0
Cosmic Scrat-tastrophe (2015)           5.0
Love and Pigeons (1985)                 5.0
Red Sorghum (Hong gao liang) (1987)     5.0


Combining popularity and reviews


In [52]:
# Create a list of only movies appearing > 50 times in the dataset
movie_popularity = user_ratings_df["title"].value_counts()
popular_movies = movie_popularity[movie_popularity > 50].index

# Use this popular_movies list to filter the original DataFrame
popular_movies_rankings =  user_ratings_df[user_ratings_df["title"].isin(popular_movies)]

# Find the average rating given to these frequently watched films
popular_movies_average_rankings = popular_movies_rankings[["title", "rating"]].groupby('title').mean()
print(popular_movies_average_rankings.sort_values(by="rating", ascending=False).head())

                                                      rating
title                                                       
Shawshank Redemption, The (1994)                    4.429022
Godfather, The (1972)                               4.289062
Fight Club (1999)                                   4.272936
Cool Hand Luke (1967)                               4.271930
Dr. Strangelove or: How I Learned to Stop Worry...  4.268041


Find all pairs of movies

**The course modified the dataframe to 192 rows, where the original is too many. so skip this code snippet**

In [53]:
from itertools import permutations

# Create the function to find all permutations
def find_movie_pairs(x):
  pairs = pd.DataFrame(list(permutations(x.values, 2)),
                       columns=['movie_a', 'movie_b'])
  return pairs

# Apply the function to the title column and reset the index
# movie_combinations = user_ratings_df.groupby('userId')['title'].apply(find_movie_pairs)

# print(movie_combinations)

In [54]:
# Calculate how often each item in movie_a occurs with the items in movie_b
combination_counts = movie_combinations.groupby(['movie_a', 'movie_b']).size()

# Convert the results to a DataFrame and reset the index
combination_counts_df = combination_counts.to_frame(name='size').reset_index()
print(combination_counts_df.head())

      movie_a                                     movie_b  size
0  '71 (2014)                 (500) Days of Summer (2009)     1
1  '71 (2014)                  10 Cloverfield Lane (2016)     1
2  '71 (2014)                            127 Hours (2010)     1
3  '71 (2014)  13 Assassins (Jûsan-nin no shikaku) (2010)     1
4  '71 (2014)                             13 Hours (2016)     1


In [55]:
# import matplotlib.pyplot as plt

# # Sort the counts from highest to lowest
# combination_counts_df.sort_values('size', ascending=False, inplace=True)

# # Find the movies most frequently watched by people who watched Thor
# thor_df = combination_counts_df[combination_counts_df['movie_a'] == 'Thor']

# # Plot the results
# thor_df.plot.bar(x="movie_b")
# plt.show()

Creating content based data

In [69]:
movie_genre_df = pd.read_csv('movies.csv')
movie_genre_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [70]:
movie_genre_df['name'] = movie_genre_df['title'].str[:-7]
movie_genre_df.drop('title',axis = 1, inplace=True)
# movie_genre_df.dropna('name', inplace=True)
movie_genre_df.head(10)

Unnamed: 0,movieId,genres,name
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story
1,2,Adventure|Children|Fantasy,Jumanji
2,3,Comedy|Romance,Grumpier Old Men
3,4,Comedy|Drama|Romance,Waiting to Exhale
4,5,Comedy,Father of the Bride Part II
5,6,Action|Crime|Thriller,Heat
6,7,Comedy|Romance,Sabrina
7,8,Adventure|Children,Tom and Huck
8,9,Action,Sudden Death
9,10,Action|Adventure|Thriller,GoldenEye


In [79]:
def splitwords(x):

    genres = x.split('|')

    # Extract the first word
    return genres[0]

movie_genre_df['genre_list'] = movie_genre_df['genres'].apply(splitwords)

movie_genre_df.head(5)

Unnamed: 0,movieId,genres,name,genre_list
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,Adventure
1,2,Adventure|Children|Fantasy,Jumanji,Adventure
2,3,Comedy|Romance,Grumpier Old Men,Comedy
3,4,Comedy|Drama|Romance,Waiting to Exhale,Comedy
4,5,Comedy,Father of the Bride Part II,Comedy


In [80]:
movie_cross_table

genre_list,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
'71,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
'Hellboy': The Seeds of Creation,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
'Round Midnight,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
'Salem's Lot,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
xXx,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
xXx: State of the Union,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
¡Three Amigos!,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [81]:
# Select only the rows with values in the name column equal to Toy Story
toy_story_genres = movie_genre_df[movie_genre_df['name'] == 'Toy Story']

# Create cross-tabulated DataFrame from name and genre_list columns
movie_cross_table = pd.crosstab(movie_genre_df['name'], movie_genre_df['genre_list'])

print(movie_cross_table.index)

# Select only the rows with Toy Story as the index
toy_story_genres_ct = movie_cross_table[movie_cross_table.index == 'Toy Story']
print(toy_story_genres_ct)

Index(['', ''71', ''Hellboy': The Seeds of Creation', ''Round Midnight',
       ''Salem's Lot', ''Til There Was You', ''Tis the Season for Love',
       ''burbs, The', ''night Mother', '(500) Days of Summer',
       ...
       'Zulu', '[REC]', '[REC]²', '[REC]³ 3 Génesis',
       'anohana: The Flower We Saw That Day - The Movie', 'eXistenZ', 'xXx',
       'xXx: State of the Union', '¡Three Amigos!',
       'À nous la liberté (Freedom for Us)'],
      dtype='object', name='name', length=9460)
genre_list  (no genres listed)  Action  Adventure  Animation  Children  \
name                                                                     
Toy Story                    0       0          1          0         0   

genre_list  Comedy  Crime  Documentary  Drama  Fantasy  Film-Noir  Horror  \
name                                                                        
Toy Story        0      0            0      0        0          0       0   

genre_list  Musical  Mystery  Romance  Sci-Fi  T

Comparing individual movies with Jaccard similarity


In [82]:
# Import numpy and the distance metric
import numpy as np
from sklearn.metrics import jaccard_score

# Extract just the rows containing GoldenEye and Toy Story
goldeneye_values = movie_cross_table.loc['GoldenEye'].values
toy_story_values = movie_cross_table.loc['Toy Story'].values

# Find the similarity between GoldenEye and Toy Story
print(jaccard_score(goldeneye_values, toy_story_values))

# Repeat for GoldenEye and Skyfall
skyfall_values = movie_cross_table.loc['Skyfall'].values
print(jaccard_score(goldeneye_values, skyfall_values))

0.0
1.0


In [83]:
# Import functions from scipy
from scipy.spatial.distance import pdist, squareform

# Calculate all pairwise distances
jaccard_distances = pdist(movie_cross_table.values, metric='jaccard')

# Convert the distances to a square matrix
jaccard_similarity_array = 1 - squareform(jaccard_distances)

# Wrap the array in a pandas DataFrame
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=movie_cross_table.index, columns=movie_cross_table.index)

# Print the top 5 rows of the DataFrame
print(jaccard_similarity_df.head())

name                                   '71  'Hellboy': The Seeds of Creation  \
name                                                                           
                                  1.0  0.0                               0.0   
'71                               0.0  1.0                               1.0   
'Hellboy': The Seeds of Creation  0.0  1.0                               1.0   
'Round Midnight                   0.0  0.0                               0.0   
'Salem's Lot                      0.0  0.0                               0.0   

name                              'Round Midnight  'Salem's Lot  \
name                                                              
                                              0.0           0.0   
'71                                           0.0           0.0   
'Hellboy': The Seeds of Creation              0.0           0.0   
'Round Midnight                               1.0           1.0   
'Salem's Lot                         

In [84]:
# Wrap the preloaded array in a DataFrame
jaccard_similarity_df = pd.DataFrame(jaccard_similarity_array, index=movie_cross_table.index, columns=movie_cross_table.index)

# Find the values for the movie Thor
jaccard_similarity_series = jaccard_similarity_df.loc['Thor']

# Sort these values from highest to lowest
ordered_similarities = jaccard_similarity_series.sort_values(ascending = False)

# Print the results
print(ordered_similarities)

name
Free Fire                             1.0
The Professional: Golgo 13            1.0
The Mummy                             1.0
Sheena                                1.0
Green Lantern: First Flight           1.0
                                     ... 
Goodbye Girl, The                     0.0
Goodbye Charlie                       0.0
Good bye, Lenin!                      0.0
Good Year, A                          0.0
À nous la liberté (Freedom for Us)    0.0
Name: Thor, Length: 9460, dtype: float64


Analyze the similarity of movies based on the description.


                             Title                                               Plot
0   Ace Ventura: When Nature Calls  In the Himalayas, after a failed rescue missio...
1      Dracula: Dead and Loving It  Solicitor Thomas Renfield travels all the way ...
2      Father of the Bride Part II  The film begins five years after the events of...
3                       Four Rooms  The film is set on New Year's Eve, and starts ...
4                 Grumpier Old Men  The feud between Max (Walter Matthau) and John...
5                          Jumanji  In 1869, near Brantford, New Hampshire, two br...
6                     Sudden Death  Darren McCord (Jean-Claude Van Damme) is a Fre...
7                     Tom and Huck  The movie opens with Injun Joe (Eric Schweig) ...
8                        Toy Story  In a world where toys are living things who pr...
9                Waiting to Exhale  "Friends are the People who let you be yoursel...
10                       GoldenEye  In 1986, at Arkhangelsk, MI6 agents James Bond...
11                         Skyfall  MI6 agents James Bond and Eve Moneypenny pursu...

In [85]:
# this code doesn't run because the df_plot is not defined. The structure is provided above

from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer object to the vectorizer variable
vectorizer = TfidfVectorizer(min_df=2, max_df=0.7)

# Fit and transform the plot column
vectorized_data = vectorizer.fit_transform(df_plots['Plot'])

# Look at the features generated
print(vectorizer.get_feature_names())

NameError: name 'df_plots' is not defined

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate the vectorizer object and transform the plot column
vectorizer = TfidfVectorizer(max_df=0.7, min_df=2)
vectorized_data = vectorizer.fit_transform(df_plots['Plot']) 

# Create Dataframe from TF-IDFarray
tfidf_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())

# Assign the movie titles to the index and inspect
tfidf_df.index = df_plots['Title']
print(tfidf_df.head())

In [None]:
# Import cosine_similarity measure
from sklearn.metrics.pairwise import cosine_similarity

# Create the array of cosine similarity values
cosine_similarity_array = cosine_similarity(tfidf_summary_df)

# Wrap the array in a pandas DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_summary_df.index, columns=tfidf_summary_df.index)

# Print the top 5 rows of the DataFrame
print(cosine_similarity_df.head())

In [None]:
# Wrap the preloaded array in a DataFrame
cosine_similarity_df = pd.DataFrame(cosine_similarity_array, index=tfidf_summary_df.index, columns=tfidf_summary_df.index)

# Find the values for the movie Rio
cosine_similarity_series = cosine_similarity_df.loc['Rio']

# Sort these values highest to lowest
ordered_similarities = cosine_similarity_series.sort_values(ascending = False)

# Print the results
print(ordered_similarities)

User provile building

The tfidf_summary_df you have been working on in the last few exercises has been loaded for you. This contains a row per movie with their titles as the index and a column for each feature containing their respective TF-IDF score.

In [None]:
list_of_movies_enjoyed = ['Captain America: The First Avenger', 'Green Lantern', 'The Avengers']

# Create a subset of only the movies the user has enjoyed
movies_enjoyed_df = tfidf_summary_df.reindex(list_of_movies_enjoyed)

# Inspect the DataFrame
print(movies_enjoyed_df)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Find subset of tfidf_df that does not include movies in list_of_movies_enjoyed
tfidf_subset_df = tfidf_df.drop(list_of_movies_enjoyed, axis=0)

# Calculate the cosine_similarity and wrap it in a DataFrame
similarity_array = cosine_similarity(user_prof.values.reshape(1, -1), tfidf_subset_df)
similarity_df = pd.DataFrame(similarity_array.T, index=tfidf_subset_df.index, columns=["similarity_score"])

# Sort the values from high to low by the values in the similarity_score
sorted_similarity_df = similarity_df.sort_values(by="similarity_score", ascending=False)

# Inspect the most similar to the user preferences
print(sorted_similarity_df.head())

In [None]:
# <script.py> output:
#                                     similarity_score
#     Title                                           
#     21 Jump Street                             0.362
#     Thor                                       0.266
#     X-Men: First Class                         0.264
#     Transformers: Dark of the Moon             0.224
#     Beastly                                    0.180

# As you can see, the top recommendations are all action-packed blockbusters, similar to those previously enjoyed by the user.

## Collaborative Filtering

In [87]:
user_ratings = pd.read_csv('user_ratings.csv')
user_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [89]:
# Fill in missing values with 0
user_ratings_table_filled = user_ratings.fillna(0)

# Inspect the result
print(user_ratings_table_filled)

        userId  movieId  rating   timestamp                             title  \
0            1        1     4.0   964982703                  Toy Story (1995)   
1            5        1     4.0   847434962                  Toy Story (1995)   
2            7        1     4.5  1106635946                  Toy Story (1995)   
3           15        1     2.5  1510577970                  Toy Story (1995)   
4           17        1     4.5  1305696483                  Toy Story (1995)   
...        ...      ...     ...         ...                               ...   
100831     610   160341     2.5  1479545749                  Bloodmoon (1997)   
100832     610   160527     4.5  1479544998  Sympathy for the Underdog (1971)   
100833     610   160836     3.0  1493844794                     Hazard (2005)   
100834     610   163937     3.5  1493848789                Blair Witch (2016)   
100835     610   163981     3.5  1493850155                         31 (2016)   

                           

In [90]:
# Chapter3

# Get the average rating for each user 
avg_ratings = user_ratings_table.mean(axis=1)

# Center each users ratings around 0
user_ratings_table_centered = user_ratings_table.sub(avg_ratings, axis=0)

# Fill in the missing data with 0s
user_ratings_table_normed = user_ratings_table_centered.fillna(0)

NameError: name 'user_ratings_table' is not defined

In [91]:
from sklearn.metrics.pairwise import cosine_similarity

# Assign the arrays to variables
sw_IV = movie_ratings_centered.loc['Star Wars: Episode IV - A New Hope (1977)', :].values.reshape(1, -1)
sw_V = movie_ratings_centered.loc['Star Wars: Episode V - The Empire Strikes Back (1980)', :].values.reshape(1, -1)

# Find the similarity between two Star Wars movies
similarity_A = cosine_similarity(sw_IV, sw_V)
print(similarity_A)

NameError: name 'movie_ratings_centered' is not defined