In [None]:
!pip install surprise



In [None]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import json

In [None]:
# Mount Google Drive (Colab only) so the dataset files read below from
# /content/drive/Shareddrives/... are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the four Netflix rating files and stack them into one dataframe.
# Only the first two columns are read, so the date column is skipped.
# Rows whose Rating is missing are kept here: the cleaning cell uses them
# as the boundaries between movies.
netflix_dir = '/content/drive/Shareddrives/COMSC341_data/netflix'
df1, df2, df3, df4 = (
    pd.read_csv(
        f'{netflix_dir}/combined_data_{part}.txt',
        header=None,
        names=['Cust_Id', 'Rating'],
        usecols=[0, 1],
    ).astype({'Rating': float})
    for part in range(1, 5)
)

# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat
# builds the combined frame in one pass instead of three incremental copies.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

  df = df1.append(df2, ignore_index=True)


In [None]:
# data cleaning
# Goal: attach a movie id to every rating row. Rows with a missing Rating
# mark the start of a new movie; the k-th such row starts movie id k.

df_nan = pd.DataFrame(pd.isnull(df['Rating']))
df_nan = df_nan[df_nan['Rating'] == True]
df_nan = df_nan.reset_index()

# Row positions of the movie boundary rows, in file order.
header_positions = df_nan['index'].to_numpy()

# Each movie's ratings run from just after its boundary row up to the next
# boundary row (or the end of the frame for the last movie).
segment_ends = np.append(header_positions[1:], len(df))
ratings_per_movie = segment_ends - header_positions - 1

# One vectorised repeat replaces np.append inside a Python loop, which
# re-copied the whole growing array once per movie.
# The ids are integers here; the next cell casts with .astype(int) anyway.
movie_id = len(header_positions)  # id of the last movie, as before
movie_np = np.repeat(np.arange(1, movie_id + 1), ratings_per_movie)

# Every rated row must have received exactly one movie id; a mismatch means
# the data does not start with a boundary row.
n_rated_rows = int(df['Rating'].notna().sum())
if len(movie_np) != n_rated_rows:
    raise ValueError(
        f"Built {len(movie_np)} movie ids for {n_rated_rows} rating rows; "
        "expected the first row of the data to be a movie boundary row."
    )


In [None]:
# Drop the movie boundary rows (missing Rating) so only real ratings remain.
df = df.loc[df['Rating'].notna()]

# movie_np holds one movie id per remaining row, in the same order.
df['Movie_Id'] = movie_np.astype(int)
df['Cust_Id'] = df['Cust_Id'].astype(int)

# Show every 5,000,000th row as a quick sanity check.
print('-Dataset examples-')
print(df.iloc[::5000000])

-Dataset examples-
          Cust_Id  Rating  Movie_Id
1         1488844     3.0         1
5000996    501954     2.0       996
10001962   404654     5.0      1962
15002876   886608     2.0      2876
20003825  1193835     2.0      3825
25004661  1899206     3.0      4661
30005496   154804     4.0      5496
35006274  2078749     5.0      6274
40007057   450763     5.0      7057
45007991   102092     3.0      7991
50009023   220298     5.0      9023


In [None]:
f = ['count','mean']


def _summarize_ratings(key):
    """Summarise ratings grouped by `key` and find the sparsely rated groups.

    Returns the count/mean summary, the 70th-percentile rating count
    (rounded), and the index of groups whose count is below that benchmark.
    """
    summary = df.groupby(key)['Rating'].agg(f)
    summary.index = summary.index.map(int)
    benchmark = round(summary['count'].quantile(0.7), 0)
    below_benchmark = summary[summary['count'] < benchmark].index
    return summary, benchmark, below_benchmark


# Movies with too few ratings.
df_movie_summary, movie_benchmark, drop_movie_list = _summarize_ratings('Movie_Id')

print('Movie minimum times of review: {}'.format(movie_benchmark))

# Customers with too few ratings.
df_cust_summary, cust_benchmark, drop_cust_list = _summarize_ratings('Cust_Id')

print('Customer minimum times of review: {}'.format(cust_benchmark))

Movie minimum times of review: 1932.0
Customer minimum times of review: 108.0


In [None]:
print('Original Shape: {}'.format(df.shape))

# Keep a rating only if both its movie and its customer passed the benchmarks.
keep_movie = ~df['Movie_Id'].isin(drop_movie_list)
keep_customer = ~df['Cust_Id'].isin(drop_cust_list)
df = df[keep_movie & keep_customer]

print('After Trim Shape: {}'.format(df.shape))
print('-Data Examples-')
print(df.iloc[::5000000])

Original Shape: (51031355, 3)
After Trim Shape: (36695974, 3)
-Data Examples-
          Cust_Id  Rating  Movie_Id
696        712664     5.0         3
6967414   1772050     5.0      1401
13917453   439970     4.0      2662
20846898  1179317     5.0      3925
27821028  2556129     4.0      5137
34868052  1637761     4.0      6247
41920351  1566168     3.0      7399
48773721     6173     3.0      8782


In [None]:
# Customer x movie matrix of ratings (one row per Cust_Id, one column per Movie_Id).
df_p = df.pivot_table(values='Rating', index='Cust_Id', columns='Movie_Id')

print(df_p.shape)

In [None]:
# Finding the overlapping set of movies between the netflix and movielens dataset

# Load the data
df_movielens = pd.read_csv('/content/drive/Shareddrives/COMSC341_data/movielens/movies_metadata.csv', usecols=['original_title', 'overview', 'genres'])
df_title = pd.read_csv('/content/drive/Shareddrives/COMSC341_data/netflix/movie_titles.csv', encoding="ISO-8859-1", header=None, on_bad_lines='skip', names=['Movie_Id', 'Year', 'Name'])
# Boolean field: True when the movie is in both the netflix and movielens dataset
df_title["Valid"] = False

# Perform case-folding on movie titles so the comparison is case-insensitive
df_movielens['original_title'] = df_movielens['original_title'].str.lower()
df_title['Name'] = df_title['Name'].str.lower()

# Process the genre column: turn the single-quoted list-of-dicts text into
# JSON, parse it, and keep just the genre names per movie
df_movielens['genres'] = df_movielens['genres'].str.replace("'", "\"")
df_movielens['genres'] = df_movielens['genres'].apply(lambda x: json.loads(x) if pd.notnull(x) else [])
df_movielens['genre_name'] = df_movielens['genres'].apply(lambda genres: [genre['name'] for genre in genres])

# Create a DataFrame with matching titles
matching_titles_df = df_title[df_title['Name'].isin(df_movielens['original_title'])]

# Set the 'valid' column to True for matching titles
df_title.loc[matching_titles_df.index, 'Valid'] = True

# Attach the MovieLens overview to each Netflix title with a single lookup.
# When several MovieLens rows share a title the first one wins, which is what
# the previous row-by-row scan (`.values[0]`) selected, without rescanning
# the whole MovieLens frame once per Netflix title.
overview_by_title = (
    df_movielens
    .dropna(subset=['original_title'])
    .drop_duplicates(subset='original_title', keep='first')
    .set_index('original_title')['overview']
)
df_title['overview'] = df_title['Name'].map(overview_by_title)

total_movies = len(df_title)
num_valid_movies = df_title['Valid'].sum()

# Check the number of movies we're looking into
print("Total number of movies:", total_movies)
print("Number of movies that are both in Netflix set and movielens set:", num_valid_movies)

# When recommender is pulling movies, if the field is false, it should not be included
valid_movie_ids = df_title.loc[df_title['Valid'], 'Movie_Id']
valid_movie_ids = valid_movie_ids[valid_movie_ids.isin(df_p.columns)]  # Filter only valid movie IDs present in df_p columns

# Ratings matrix restricted to the movies found in both datasets.
# The former join against the title-indexed overview series is gone: df_p is
# indexed by Cust_Id, so nothing could match and it only added an all-NaN
# 'overview' column to the ratings. Overviews live in df_title instead.
df_p_filtered = df_p[valid_movie_ids]

print(df_p_filtered)

In [None]:
# Titles flagged as present in both datasets; recommend() resolves names to movie ids here.
final_df = df_title.loc[df_title['Valid'].eq(True)]

In [None]:
# Baseline model - Pearson's correlation model
def recommend(movie_title, min_count):
    """Return up to 20 movies whose ratings correlate best with `movie_title`.

    Parameters
    ----------
    movie_title : str
        Title to look up (case-insensitive) in `final_df`.
    min_count : int or float
        Minimum number of ratings (the `count` column of `df_movie_summary`)
        a movie needs in order to be returned. With 0 nothing is filtered.

    Returns
    -------
    pandas.DataFrame
        Columns Movie_Id, PearsonR, Name, count, mean, overview, indexed by
        movie id and sorted by descending correlation. An empty DataFrame is
        returned (after printing a message) when the title is unknown or has
        no column in `df_p_filtered`.
    """
    movie_title = movie_title.lower()
    matches = final_df.loc[final_df['Name'] == movie_title, 'Movie_Id']
    # Unknown titles used to crash on `.values[0]`; report them instead.
    if matches.empty or matches.iloc[0] not in df_p_filtered.columns:
        print("Movie not found in the database")
        return pd.DataFrame()
    movie_id = matches.iloc[0]

    # Correlate rating columns only; tolerate a stray non-rating 'overview'
    # column if the ratings matrix happens to carry one.
    ratings = df_p_filtered
    if 'overview' in ratings.columns:
        ratings = ratings.drop(columns='overview')

    corr_target = (
        ratings.corrwith(ratings[movie_id])
        .dropna()
        .sort_values(ascending=False)
        .to_frame('PearsonR')
    )
    corr_target.index = corr_target.index.map(int)
    corr_target.index.name = None

    # Join on the movie id itself. df_title's own index is a row position,
    # not a movie id, so joining on it attached the wrong title and overview.
    title_info = (
        df_title
        .drop_duplicates(subset='Movie_Id')
        .set_index('Movie_Id')
        .reindex(columns=['Name', 'overview'])
    )
    corr_target = corr_target.join(title_info).join(df_movie_summary[['count', 'mean']])
    corr_target['Movie_Id'] = corr_target.index
    corr_target = corr_target[['Movie_Id', 'PearsonR', 'Name', 'count', 'mean', 'overview']]

    # Honour the previously ignored min_count argument.
    recommended_movies_df = corr_target[corr_target['count'] >= min_count]
    return recommended_movies_df.head(20)

In [None]:
# Our improved recommender system

In [None]:
!pip install openai==0.28

Collecting openai==0.28
  Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/76.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━[0m [32m71.7/76.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.28.0


In [None]:
import os

import openai

# Set the OPENAI_API_KEY environment variable before running the notebook.
# Reading the key from the environment keeps the secret out of the saved
# notebook; when the variable is unset the key stays empty, as before.
openai.api_key = os.environ.get('OPENAI_API_KEY', '')

# Use ChatGPT to get an adjective that describes the "vibe" of the movie based on its summary
def chat_with_gpt(movie_summary):
    """Ask gpt-3.5-turbo for one word describing the vibe of a movie summary.

    Parameters
    ----------
    movie_summary : str
        Plot summary / overview of the movie.

    Returns
    -------
    str
        The model's reply with surrounding whitespace stripped.

    Errors raised by `openai.ChatCompletion.create` propagate to the caller.
    """
    prompt = "Given text is a summary of a movie, can you come up with one word, it can be anything, preferrably adjective, that reflects the movie the best based on vibes? With heavy emphasis on vibes."
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
        # The summary is input to the model, so it is sent as a user message.
        # Sending it as "assistant" presented it as the model's own earlier reply.
        {"role": "user", "content": movie_summary},
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
    )
    return response['choices'][0]['message']['content'].strip()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from nltk.tokenize import word_tokenize
import time

In [None]:
# Using GloVe model for Word2vec and then calculating the cosine similarity

# Models already loaded in this session, keyed by file path.
_GLOVE_MODEL_CACHE = {}


def load_glove_model(glove_model_path="/content/drive/Shareddrives/COMSC341_data/glove.6B/glove.6B.300d.txt"):
    """Load GloVe vectors as gensim KeyedVectors, reusing an earlier load.

    Parameters
    ----------
    glove_model_path : str, optional
        Path to a header-less GloVe text file. Defaults to the shared-drive
        copy of glove.6B.300d.txt.

    Returns
    -------
    The object returned by `KeyedVectors.load_word2vec_format`. Repeated
    calls with the same path return the same object instead of re-reading
    the file.
    """
    if glove_model_path not in _GLOVE_MODEL_CACHE:
        _GLOVE_MODEL_CACHE[glove_model_path] = KeyedVectors.load_word2vec_format(
            glove_model_path, binary=False, no_header=True
        )
    return _GLOVE_MODEL_CACHE[glove_model_path]

def _first_word_token(text):
    """Return the first token of `text` that contains a letter or digit, else None."""
    for token in word_tokenize(text.lower()):
        if any(ch.isalnum() for ch in token):
            return token
    return None


def calc_cosine_similarity(word1, word2, glove_model):
    """Cosine similarity between the first word of `word1` and of `word2`.

    Both inputs are lower-cased and tokenised. The first token containing a
    letter or digit is used, so a leading quote or punctuation mark is not
    mistaken for the word.

    Parameters
    ----------
    word1, word2 : str
        Words (or short phrases) to compare.
    glove_model
        Object providing `similarity(a, b)` and raising KeyError for
        out-of-vocabulary words.

    Returns
    -------
    The value returned by `glove_model.similarity`.

    Raises
    ------
    ValueError
        If either input contains no usable word, or a word is not in the
        model's vocabulary.
    """
    token1 = _first_word_token(word1)
    token2 = _first_word_token(word2)
    # Empty or punctuation-only input used to surface as an IndexError.
    if token1 is None or token2 is None:
        raise ValueError(f"No usable word found in {word1!r} and/or {word2!r}")
    try:
        # similarity() performs the vocabulary lookup itself, so a separate
        # vector fetch is not needed.
        return glove_model.similarity(token1, token2)
    except KeyError as e:
        raise ValueError(f"One or both words not in vocabulary: {e}") from e

In [None]:
# Creating a vector space based on the sentiment tag and the word that chatGPT gave
def _vibe_word_with_retry(overview, max_retries, retry_wait):
    """Ask ChatGPT for a vibe word; on rate limits wait and retry. None if retries run out."""
    for attempt in range(max_retries + 1):
        try:
            return chat_with_gpt(overview)
        except openai.error.RateLimitError:
            if attempt == max_retries:
                break
            print(f"Rate limit reached. Waiting for {retry_wait} seconds and then retrying.")
            time.sleep(retry_wait)
    return None


def vector_space(baseline_df_filtered, your_vibe_word, user_movie, weight=0.3, max_retries=3, retry_wait=21):
    """Score each candidate movie against the requested vibe word.

    For every row, ChatGPT supplies one word for the row's `overview`; the
    score is the GloVe similarity between that word and `your_vibe_word`,
    plus `weight` when the candidate shares at least one genre with
    `user_movie` (genres come from `df_movielens['genre_name']`).

    Parameters
    ----------
    baseline_df_filtered : pandas.DataFrame
        Candidates with `Name` and `overview` columns. The `cosine_score`
        column is written in place.
    your_vibe_word : str
        Word describing the vibe the user asked for.
    user_movie : str
        Title the user entered; compared case-insensitively.
    weight : float, optional
        Bonus added for a shared genre (default 0.3).
    max_retries : int, optional
        Extra attempts per movie after a rate-limit error (default 3).
    retry_wait : int or float, optional
        Seconds to wait before each retry (default 21).

    Returns
    -------
    pandas.DataFrame
        The same `baseline_df_filtered` object. Movies that could not be
        scored keep whatever `cosine_score` they already had.
    """
    glove_model = load_glove_model()

    # The user's genres do not change per row, so look them up once.
    # Titles in df_movielens are lower-cased, so the query must be as well.
    user_rows = df_movielens[df_movielens['original_title'] == user_movie.lower()]
    user_genre_lists = user_rows['genre_name'].tolist()
    user_genres = user_genre_lists[0] if user_genre_lists else []

    for index, row in baseline_df_filtered.iterrows():
        # A rate limit used to skip the movie despite the "retrying" message.
        chatgpt_word = _vibe_word_with_retry(row['overview'], max_retries, retry_wait)
        if chatgpt_word is None:
            print(f"Skipping '{row['Name']}': still rate limited after {max_retries} retries.")
            continue

        try:
            cosine_sim = calc_cosine_similarity(your_vibe_word, chatgpt_word, glove_model)
        except ValueError as e:
            # An unusable or out-of-vocabulary word skips this movie only,
            # instead of aborting the whole run.
            print(f"Skipping '{row['Name']}': {e}")
            continue

        # Genre filtering
        current_genre_lists = df_movielens[df_movielens['original_title'] == row["Name"]]['genre_name'].tolist()
        if user_genres and current_genre_lists:
            current_genres = current_genre_lists[0]
            if any(genre in current_genres for genre in user_genres):
                cosine_sim += weight

        baseline_df_filtered.at[index, 'cosine_score'] = cosine_sim
    return baseline_df_filtered

In [None]:
# Download NLTK's 'punkt' tokenizer data. calc_cosine_similarity tokenizes
# with word_tokenize — presumably this is the data it needs (TODO confirm).
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Movie Recommender System

# Menu choice -> word describing the requested vibe.
_VIBE_WORDS = {
    "1": "exciting",
    "2": "romantic",
    "3": "comfort",
    "4": "angsty",
    "5": "silly",
}


def main():
    """Interactive entry point: ask for a movie and a vibe, then show recommendations.

    Shows the top 5 baseline (Pearson correlation) recommendations, then the
    top 5 after re-ranking by `cosine_score` from `vector_space`. Returns
    None. If `recommend` finds nothing, the function stops after the message
    printed by `recommend`.
    """
    print(" ----- MOVIE RECOMMENDER SYSTEM ----- ")
    movie_name = input("Enter a movie name that matches what you want to watch : ")
    print("What describes the vibe that you're going for?")
    print("1. Something exciting that will make my heart race")
    print("2. Because who needs logic when you have love?")
    print("3. Something cozy, something heartwarming")
    print("4. Screaming, crying, throwing up")
    print("5. Something silly, goof that will make me laugh")

    # Ask again on an invalid choice; continuing with an empty vibe word would
    # fail later when the word is tokenised.
    your_vibe_word = _VIBE_WORDS.get(input("Your choice (1-5): ").strip())
    while your_vibe_word is None:
        print("ERROR: Please enter a number between 1-5")
        your_vibe_word = _VIBE_WORDS.get(input("Your choice (1-5): ").strip())
    print("your_vibe_word : " + your_vibe_word)

    baseline_df = recommend(movie_name, 0)
    if baseline_df.empty:
        # Nothing to re-rank; an empty frame has no 'overview' column.
        return

    # Display the movies that are being recommended by the baseline model
    print("Recommendations based on the Pearson's correlation model")
    display(baseline_df.head(5))

    # Only movies with an overview can be sent to ChatGPT. Copy the filtered
    # rows so adding a column does not trigger SettingWithCopyWarning.
    baseline_df_filtered = baseline_df[baseline_df['overview'].notna()].copy()
    baseline_df_filtered['cosine_score'] = 0.0
    new_df = vector_space(baseline_df_filtered, your_vibe_word, movie_name)

    # Sorting the output dataframe by its cosine similarities and then displaying the top 5
    result = new_df.sort_values(by='cosine_score', ascending=False)
    print("Recommendations based on our improved model")
    display(result.head(5))

In [None]:
# Start the interactive recommender when this cell is executed as the main module.
if __name__ == "__main__":
  main()

 ----- MOVIE RECOMMENDER SYSTEM ----- 
Enter a movie name that matches what you want to watch : screamers
What describes the vibe that you're going for?
1. Something exciting that will make my heart race
2. <THERE WILL BE A COUPLE MORE OF THESE>
Your choice: 1
your_vibe_word : exciting


  similar_to_target = df_p_filtered.corrwith(target)


Recommendations based on the Pearson's correlation model


Unnamed: 0,Movie_Id,PearsonR,Name,count,mean,overview
16,17,1.0,7 seconds,2699,3.098555,When an experienced thief accidentally makes o...
4515,4603,1.0,on our merry way,2699,3.098555,Oliver Pease gets a dose of courage from his w...
4442,4529,0.680614,how's your news?,2659,2.945844,
8910,9073,0.583576,dawg,2214,3.512647,"To inherit a million dollars, an egotistical m..."
748,760,0.576678,pooh's heffalump movie,3637,3.741545,Who or what exactly is a Heffalump? The lovabl...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  baseline_df_filtered['cosine_score'] = 0.0


Rate limit reached. Waiting for 20 seconds and then retrying.
Rate limit reached. Waiting for 20 seconds and then retrying.
Rate limit reached. Waiting for 20 seconds and then retrying.
Rate limit reached. Waiting for 20 seconds and then retrying.
Rate limit reached. Waiting for 20 seconds and then retrying.
Recommendations based on our improved model


Unnamed: 0,Movie_Id,PearsonR,Name,count,mean,overview,cosine_score
1160,1184,0.51646,the wizard of oz,3734,3.400375,Young Dorothy finds herself in a magical world...,0.354264
16,17,1.0,7 seconds,2699,3.098555,When an experienced thief accidentally makes o...,0.280415
3559,3629,0.575212,the boys club,2368,3.589527,Three teenage boys in small-town Southern Onta...,0.280415
2467,2510,0.545701,joe the king,2396,2.548831,A destitute 14 year old struggles to keep his ...,0.272254
4684,4774,0.51501,moscow on the hudson,2297,3.057466,A Russian circus visits the US. A clown wants ...,0.24742
