In [None]:
import pandas as pd
import utilities.data_preprocessing as dp

# importing data into dataframes
movies_df = pd.read_csv("data/movies.csv")
tags_df = pd.read_csv("data/tags.csv")
links_df = pd.read_csv("data/links.csv")
ratings_df = pd.read_csv("data/ratings.csv")
movie_details_df = pd.read_csv("data/movie_details.csv")

### Fetch details of every movie **!!!DO NOT RUN IF FILE IS PRESENT!!!**

In [15]:
import utilities.data_preprocessing as dp
import importlib
importlib.reload(dp)

movie_list = movies_df['movieId'].tolist()

dp.process_movie_list(movie_list, "access_token.json", "data/movie_details.csv")

Movie details saved to data/movie_details.csv incrementally.


### *movies.csv* data file preprocessing

In [16]:
# checking data types
movies_df["movieId"] = dp.check_data_type(movies_df["movieId"], int)
movies_df["title"] = dp.check_data_type(movies_df["title"], str)
movies_df["genres"] = dp.check_data_type(movies_df["genres"], str)

movies_df = movies_df.dropna()

In [17]:
# making data more informative and convenient work with
movies_df["year"] = movies_df["title"].apply(lambda value: dp.extract_movie_year(value))
movies_df["title"] = movies_df["title"].apply(lambda value: dp.clean_movie_title(value))
movies_df["genres"] = movies_df["genres"].apply(lambda value: dp.extract_movie_genres(value))

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


### *tags.csv* data file preprocessing

In [18]:
# checking data types
tags_df["userId"] = dp.check_data_type(tags_df["userId"], int)
tags_df["movieId"] = dp.check_data_type(tags_df["movieId"], int)
tags_df["tag"] = dp.check_data_type(tags_df["tag"], str)
tags_df["timestamp"] = dp.check_data_type(tags_df["timestamp"], int)

tags_df = tags_df.dropna()

In [19]:
# making data more informative and convenient work with
tags_df["tag"] = tags_df["tag"].apply(lambda value: value.lower())

tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,kevin kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


### *links.csv* data file preprocessing

In [21]:
# checking data types
links_df["movieId"] = dp.check_data_type(links_df["movieId"], int)
links_df["imdbId"] = dp.check_data_type(links_df["imdbId"], int)
links_df["tmdbId"] = dp.check_data_type(links_df["tmdbId"], int)

links_df = links_df.dropna()

### *ratings.csv* data file preprocessing

In [22]:
# checking data types
ratings_df["userId"] = dp.check_data_type(ratings_df["userId"], int)
ratings_df["movieId"] = dp.check_data_type(ratings_df["movieId"], int)
ratings_df["rating"] = dp.check_data_type(ratings_df["rating"], float)
ratings_df["timestamp"] = dp.check_data_type(ratings_df["timestamp"], float)

links_df = links_df.dropna()

### Combine movies and their tags

In [20]:
movie_tags_dict = tags_df.groupby('movieId')["tag"].unique().apply(lambda group: group.tolist()).to_dict()
movies_df['tags'] = movies_df['movieId'].apply(lambda id: movie_tags_dict.get(id, []))

movies_df.head()

Unnamed: 0,movieId,title,genres,year,tags
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,"[children, disney, animation, pixar, funny, tu..."
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,"[robin williams, fantasy, time travel, animals..."
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0,"[comedinha de velhinhos engraãƒâ§ada, comedinh..."
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,"[characters, slurs, based on novel or book, ch..."
4,5,Father of the Bride Part II,[Comedy],1995.0,"[fantasy, pregnancy, remake, family, steve mar..."


### *movie_details.csv* file preprocessing

In [23]:
import utilities.data_preprocessing as dp
import pandas as pd
import importlib
import spacy
import nltk
from nltk.corpus import stopwords
importlib.reload(dp)

nlp = spacy.load("en_core_web_md")
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

movies_df = pd.merge(movies_df, movie_details_df, on="movieId", how="inner",suffixes=('_left', '_right'))

movies_df = movies_df.drop(columns=['title_right', 'genres_left'])
movies_df = movies_df.rename(columns={'title_left': 'title', 'genres_right': 'genres'})


movies_df['overview'] = movies_df['overview'].apply(
   lambda x: dp.preprocess_text(x, nlp, stop_words)
)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/naudotojas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Solution 1: Recommendations based on cosine similarity

### Combine all features into a single one

In [24]:
import utilities.data_preprocessing as dp
import pandas as pd
import importlib
importlib.reload(dp)

def combine_features(row):
   features = [
      str(row['title']), 
      str(row['release_date']),
      str(row['genres']), 
      str(row['production_companies']), 
      str(row['overview']), 
      " ".join(row['tags'])
   ]
   return " ".join(features)

movies_df['combined_features'] = movies_df.apply(combine_features, axis=1)
movies_df = movies_df.dropna(subset=['combined_features'])

print(movies_df[['movieId', 'title', 'combined_features']].head())

movies_df.to_csv("data/movies_with_combined_features.csv", index=False)

   movieId                        title  \
0        2                      Jumanji   
1        3             Grumpier Old Men   
2        5  Father of the Bride Part II   
3        6                         Heat   
4        8                 Tom and Huck   

                                   combined_features  
0  Jumanji 1988-10-21 Comedy, Drama, Romance, Cri...  
1  Grumpier Old Men 1986-10-17 Comedy, Drama, Rom...  
2  Father of the Bride Part II 1995-12-09 Comedy ...  
3  Heat 1993-10-15 Action, Crime, Thriller Largo ...  
4  Tom and Huck 2006-01-01 Documentary inLoops ti...  


## Vectorization of the combined features

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=6666)

movies_combined_df = pd.read_csv("data/movies_with_combined_features.csv")

vectorized_data = tfidf.fit_transform(movies_combined_df['combined_features'].values)
vectorized_dataframe = pd.DataFrame(vectorized_data.toarray(), index=movies_combined_df['combined_features'].index.tolist())
vectorized_dataframe.shape

(50404, 6666)

## Reducing the dimension of the vectorized data

In [27]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=3333)

reduced_data = svd.fit_transform(vectorized_dataframe)
reduced_data.shape

(50404, 3333)

# Cosine similarity recommendation

In [29]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(reduced_data)

In [35]:
def recommendation(movie_title):
   id_of_movie = movies_combined_df[movies_df['title'].str.contains(movie_title, case=False, na=False)].index[0]
   
   print("Input movie:")
   print(f"{'ID':<6} | {'Title':<50}")
   print(f"{'-'*6} | {'-'*50}")
   print(f"{movies_combined_df.iloc[id_of_movie].movieId:<6} | {movies_combined_df.iloc[id_of_movie].title:<50}")
   
   distances = similarity[id_of_movie]
   movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:10]
   
   print("\nRecommended Movies:")
   print(f"{'ID':<6} | {'Title':<50}")
   print(f"{'-'*6} | {'-'*50}")
   for i in movie_list:
      print(f"{movies_combined_df.iloc[i[0]].movieId:<6} | {movies_combined_df.iloc[i[0]].title:<50}")

recommendation("Star trek")


Input movie:
ID     | Title                                             
------ | --------------------------------------------------
329    | Star Trek: Generations                            

Recommended Movies:
ID     | Title                                             
------ | --------------------------------------------------
1375   | Star Trek III: The Search for Spock               
1374   | Star Trek II: The Wrath of Khan                   
1373   | Star Trek V: The Final Frontier                   
1371   | Star Trek: The Motion Picture                     
1372   | Star Trek VI: The Undiscovered Country            
2393   | Star Trek: Insurrection                           
1376   | Star Trek IV: The Voyage Home                     
166528 | Rogue One: A Star Wars Story                      
122912 | Avengers: Infinity War - Part I                   
