In [5]:
import pandas as pd
import utilities.data_preprocessing as dp

In [7]:
import pandas as pd
import utilities.data_preprocessing as dp

# Read the four source CSV files, one dataframe per file (same order as listed).
movies_df, tags_df, links_df, ratings_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/movies.csv",
        "data/tags.csv",
        "data/links.csv",
        "data/ratings.csv",
    )
)

### *movies.csv* data file preprocessing

In [8]:
# Validate every column of movies_df against the type it is expected to hold.
# NOTE(review): presumably check_data_type marks non-conforming values as missing so
# that the dropna() below discards them — confirm in utilities.data_preprocessing.
for column_name, expected_type in (("movieId", int), ("title", str), ("genres", str)):
    movies_df[column_name] = dp.check_data_type(movies_df[column_name], expected_type)

# keep only the rows without a missing value in any column
movies_df = movies_df.dropna()

In [9]:
# Make the data more informative and more convenient to work with.
# The year is derived from the title *before* the title column is overwritten
# (presumably clean_movie_title strips the year suffix — confirm in the helper).
movies_df["year"] = movies_df["title"].apply(dp.extract_movie_year)
movies_df["title"] = movies_df["title"].apply(dp.clean_movie_title)
movies_df["genres"] = movies_df["genres"].apply(dp.extract_movie_genres)

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


### *tags.csv* data file preprocessing

In [10]:
# Validate every column of tags_df against the type it is expected to hold.
# NOTE(review): presumably check_data_type marks non-conforming values as missing so
# that the dropna() below discards them — confirm in utilities.data_preprocessing.
tags_expected_types = (
    ("userId", int),
    ("movieId", int),
    ("tag", str),
    ("timestamp", int),
)
for column_name, expected_type in tags_expected_types:
    tags_df[column_name] = dp.check_data_type(tags_df[column_name], expected_type)

# keep only the rows without a missing value in any column
tags_df = tags_df.dropna()

In [11]:
# Make the data more informative and more convenient to work with:
# lower-case every tag so that tags differing only in case become identical.
# The vectorized .str accessor replaces the per-element lambda; unlike the lambda
# it does not raise on a stray non-string value (it yields NaN for it instead).
tags_df["tag"] = tags_df["tag"].str.lower()

tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,kevin kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


### *links.csv* data file preprocessing

In [12]:
# Validate every column of links_df; all three are expected to hold integers.
# NOTE(review): presumably check_data_type marks non-conforming values as missing so
# that the dropna() below discards them — confirm in utilities.data_preprocessing.
for column_name in ("movieId", "imdbId", "tmdbId"):
    links_df[column_name] = dp.check_data_type(links_df[column_name], int)

# keep only the rows without a missing value in any column
links_df = links_df.dropna()

### *ratings.csv* data file preprocessing

In [13]:
# checking data types
# NOTE(review): tags.csv validates its timestamp column as int while this cell
# uses float — confirm which of the two is intended.
ratings_df["userId"] = dp.check_data_type(ratings_df["userId"], int)
ratings_df["movieId"] = dp.check_data_type(ratings_df["movieId"], int)
ratings_df["rating"] = dp.check_data_type(ratings_df["rating"], float)
ratings_df["timestamp"] = dp.check_data_type(ratings_df["timestamp"], float)

# ratings_df is the frame validated above, so it is the one whose incomplete rows
# must be dropped (links_df is already cleaned in its own section).
ratings_df = ratings_df.dropna()

### Combine movies and their tags

In [14]:
# Build a {movieId: [distinct tags]} lookup from the tag table ...
unique_tags_per_movie = tags_df.groupby('movieId')["tag"].unique()
movie_tags_dict = unique_tags_per_movie.apply(lambda tag_array: tag_array.tolist()).to_dict()

# ... and attach it to every movie; movies without any tag get an empty list.
movies_df['tags'] = movies_df['movieId'].apply(
    lambda movie_id: movie_tags_dict.get(movie_id, [])
)

movies_df.head()

Unnamed: 0,movieId,title,genres,year,tags
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,"[children, disney, animation, pixar, funny, tu..."
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,"[robin williams, fantasy, time travel, animals..."
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0,"[comedinha de velhinhos engraãƒâ§ada, comedinh..."
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,"[characters, slurs, based on novel or book, ch..."
4,5,Father of the Bride Part II,[Comedy],1995.0,"[fantasy, pregnancy, remake, family, steve mar..."


### Fetch the details of every movie

In [50]:
import utilities.data_preprocessing as dp
import importlib
import json
# re-import the helper module so that edits made since the first import are picked up
importlib.reload(dp)

# the access token is kept in a local JSON file under the "access_token" key
with open("access_token.json", 'r') as file:
    data = json.load(file)
access_token = data.get("access_token")

# try the details helper on a single id (19) before processing the whole list
response = dp.get_movie_details(19, access_token)

print(response)

{'adult': False, 'backdrop_path': '/eeMoFKxjjiCi6iep2GEZtSAMYIr.jpg', 'belongs_to_collection': None, 'budget': 5300000, 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 878, 'name': 'Science Fiction'}], 'homepage': '', 'id': 19, 'imdb_id': 'tt0017136', 'origin_country': ['DE'], 'original_language': 'de', 'original_title': 'Metropolis', 'overview': "In a futuristic city sharply divided between the rich and the poor, the son of the city's mastermind meets a prophet who predicts the coming of a savior to mediate their differences.", 'popularity': 28.489, 'poster_path': '/vZIJxGnjcswPCAa52jhbl01FQkV.jpg', 'production_companies': [{'id': 12372, 'logo_path': '/lcQScIQXHQXuLMLFQt0loJl5gWJ.png', 'name': 'UFA', 'origin_country': 'DE'}], 'production_countries': [{'iso_3166_1': 'DE', 'name': 'Germany'}], 'release_date': '1927-02-06', 'revenue': 1350322, 'runtime': 148, 'spoken_languages': [{'english_name': 'No Language', 'iso_639_1': 'xx', 'name': 'No Language'}], 'status': 'Released', 'tagline': '

## **!!!DO NOT RUN IF `data/movie_details.csv` IS ALREADY PRESENT!!!**

In [12]:
import utilities.data_preprocessing as dp
import importlib
from pathlib import Path

# re-import the helper module so that edits made since the first import are picked up
importlib.reload(dp)

movie_list = movies_df['movieId'].tolist()

# Processing the full movie list must not be repeated once its output exists, so the
# call is guarded by the presence of the output file instead of relying on the reader
# to skip this cell. Delete the file to force a fresh run.
movie_details_csv = "data/movie_details.csv"

if Path(movie_details_csv).exists():
    print(f"{movie_details_csv} already exists - skipping the fetch.")
else:
    dp.process_movie_list(movie_list, "access_token.json", movie_details_csv)

Movie details saved to data/movie_details.csv incrementally.


# Solution 1: Recommendations based on cosine similarity

### Combine all features into a single one

In [4]:
import utilities.data_preprocessing as dp
import pandas as pd
import importlib
# re-import the helper module so that edits made since the first import are picked up
importlib.reload(dp)

# this section starts again from the files on disk
movies_df = pd.read_csv("data/movies.csv")
movie_details_df = pd.read_csv("data/movie_details.csv")
tags_df = pd.read_csv("data/tags.csv")

# tags: validate the tag column, drop rows with any missing value, lower-case
tags_df["tag"] = dp.check_data_type(tags_df["tag"], str)
tags_df = tags_df.dropna()
tags_df["tag"] = tags_df["tag"].map(lambda tag: tag.lower())

# {movieId: [distinct tags]}; movies without tags get an empty list
unique_tags_per_movie = tags_df.groupby('movieId')["tag"].unique()
movie_tags_dict = unique_tags_per_movie.apply(lambda tag_array: tag_array.tolist()).to_dict()
movies_df['tags'] = movies_df['movieId'].apply(
    lambda movie_id: movie_tags_dict.get(movie_id, [])
)

# Inner join: only movies present in both files survive. Of the two clashing column
# pairs, keep the title from movies.csv and the genres from movie_details.csv.
movies_df = (
    movies_df
    .merge(movie_details_df, on="movieId", how="inner", suffixes=('_left', '_right'))
    .drop(columns=['title_right', 'genres_left'])
    .rename(columns={'title_left': 'title', 'genres_right': 'genres'})
)


def combine_features(row):
    """Concatenate the textual features of one movie into a single string.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'title', 'genres', 'production_companies', 'overview'
        and 'tags' (an iterable of strings).

    Returns
    -------
    str
        The available scalar features followed by the space-joined tags, all
        separated by single spaces.

    Notes
    -----
    Missing scalar values (None / NaN) are skipped. Stringifying them would
    inject the literal words "nan" / "None" into the text that is vectorized
    afterwards.
    """
    scalar_fields = ('title', 'genres', 'production_companies', 'overview')

    parts = []
    for field in scalar_fields:
        value = row[field]
        # pd.isna returns an array for list-likes, so only test true scalars
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        parts.append(str(value))

    # tags are always appended (an empty list contributes an empty string)
    parts.append(" ".join(row['tags']))
    return " ".join(parts)

# one combined text per movie, built row by row; rows whose combined text is
# missing are dropped
movies_df = (
    movies_df
    .assign(combined_features=movies_df.apply(combine_features, axis=1))
    .dropna(subset=['combined_features'])
)

# quick look at the result (print is used because this is not the cell's last statement)
preview_columns = ['movieId', 'title', 'combined_features']
print(movies_df[preview_columns].head())

# persist the enriched frame
movies_df.to_csv("data/movies_with_combined_features.csv", index=False)

   movieId                               title  \
0        2                      Jumanji (1995)   
1        3             Grumpier Old Men (1995)   
2        5  Father of the Bride Part II (1995)   
3        6                         Heat (1995)   
4        8                 Tom and Huck (1995)   

                                   combined_features  
0  Jumanji (1995) Comedy, Drama, Romance, Crime V...  
1  Grumpier Old Men (1995) Comedy, Drama, Romance...  
2  Father of the Bride Part II (1995) Comedy Mira...  
3  Heat (1995) Action, Crime, Thriller Largo Ente...  
4  Tom and Huck (1995) Documentary inLoops Timo N...  


## Vectorization of the combined features

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the combined text, capped at 6666 vocabulary terms
tfidf = TfidfVectorizer(max_features=6666)
vectorized_data = tfidf.fit_transform(movies_df['combined_features'].values)

# NOTE(review): toarray() turns the sparse TF-IDF matrix into a dense one
# (rows x 6666 floats); consider keeping it sparse if memory becomes a problem.
vectorized_dataframe = pd.DataFrame(
    vectorized_data.toarray(),
    index=movies_df.index.tolist(),
)
vectorized_dataframe.shape

(50404, 6666)

## Reducing the dimension of the vectorized data

In [6]:
from sklearn.decomposition import TruncatedSVD
# Halve the 6666 TF-IDF dimensions. TruncatedSVD uses a randomized solver by
# default, so the seed is fixed to make the reduced matrix (and therefore the
# recommendations derived from it) reproducible across runs.
svd = TruncatedSVD(n_components=3333, random_state=42)

reduced_data = svd.fit_transform(vectorized_dataframe)
reduced_data.shape

(50404, 3333)

# Cosine similarity recommendation

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of reduced_data: a square matrix with
# one row and one column per movie, addressed by row position.
# NOTE(review): the matrix grows quadratically with the number of movies — check
# the memory footprint before running this on the full dataset.
similarity = cosine_similarity(reduced_data)

def recommendation(movie_title, top_n=9):
    """Print the titles of the movies most similar to the first matching title.

    Parameters
    ----------
    movie_title : str
        Text to look for in ``movies_df['title']``. The match is a
        case-insensitive *literal* substring match, so titles containing
        characters such as parentheses can be searched as-is.
    top_n : int, default 9
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no title contains ``movie_title``.
    """
    matches = movies_df['title'].str.contains(movie_title, case=False, regex=False, na=False)
    if not matches.any():
        raise IndexError(f"No movie title contains {movie_title!r}")

    # `similarity` is addressed by row position, so use the position of the first
    # match rather than its index label (the two differ once rows were dropped).
    query_position = int(matches.to_numpy().argmax())
    distances = similarity[query_position]

    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)
    # exclude the query movie explicitly instead of assuming it sorts first
    neighbours = [position for position, _ in ranked if position != query_position][:top_n]

    for position in neighbours:
        print(movies_df.iloc[position].title)

recommendation("Star trek")

Star Trek III: The Search for Spock (1984)
Star Trek II: The Wrath of Khan (1982)
Star Trek V: The Final Frontier (1989)
Star Trek: The Motion Picture (1979)
Star Trek VI: The Undiscovered Country (1991)
Star Trek: Insurrection (1998)
Star Trek IV: The Voyage Home (1986)
Rogue One: A Star Wars Story (2016)
Avengers: Infinity War - Part I (2018)
