In [1]:
import pandas as pd
import utilities.data_preprocessing as dp

In [2]:
import pandas as pd
import utilities.data_preprocessing as dp

# importing data into dataframes
movies_df = pd.read_csv("data/movies.csv")
tags_df = pd.read_csv("data/tags.csv")
links_df = pd.read_csv("data/links.csv")
ratings_df = pd.read_csv("data/ratings.csv")

### *movies.csv* data file preprocessing

In [3]:
# checking data types
movies_df["movieId"] = dp.check_data_type(movies_df["movieId"], int)
movies_df["title"] = dp.check_data_type(movies_df["title"], str)
movies_df["genres"] = dp.check_data_type(movies_df["genres"], str)

movies_df = movies_df.dropna()

In [4]:
# making data more informative and convenient work with
movies_df["year"] = movies_df["title"].apply(lambda value: dp.extract_movie_year(value))
movies_df["title"] = movies_df["title"].apply(lambda value: dp.clean_movie_title(value))
movies_df["genres"] = movies_df["genres"].apply(lambda value: dp.extract_movie_genres(value))

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II,[Comedy],1995.0


### *tags.csv* data file preprocessing

In [5]:
# checking data types
tags_df["userId"] = dp.check_data_type(tags_df["userId"], int)
tags_df["movieId"] = dp.check_data_type(tags_df["movieId"], int)
tags_df["tag"] = dp.check_data_type(tags_df["tag"], str)
tags_df["timestamp"] = dp.check_data_type(tags_df["timestamp"], int)

tags_df = tags_df.dropna()

In [6]:
# making data more informative and convenient work with
tags_df["tag"] = tags_df["tag"].apply(lambda value: value.lower())

tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,22,26479,kevin kline,1583038886
1,22,79592,misogyny,1581476297
2,22,247150,acrophobia,1622483469
3,34,2174,music,1249808064
4,34,2174,weird,1249808102


### *links.csv* data file preprocessing

In [7]:
# checking data types
links_df["movieId"] = dp.check_data_type(links_df["movieId"], int)
links_df["imdbId"] = dp.check_data_type(links_df["imdbId"], int)
links_df["tmdbId"] = dp.check_data_type(links_df["tmdbId"], int)

links_df = links_df.dropna()

### *ratings.csv* data file preprocessing

In [8]:
# checking data types
ratings_df["userId"] = dp.check_data_type(ratings_df["userId"], int)
ratings_df["movieId"] = dp.check_data_type(ratings_df["movieId"], int)
ratings_df["rating"] = dp.check_data_type(ratings_df["rating"], float)
ratings_df["timestamp"] = dp.check_data_type(ratings_df["timestamp"], float)

links_df = links_df.dropna()

### Combine movies and their tags

In [9]:
movie_tags_dict = tags_df.groupby('movieId')["tag"].unique().apply(lambda group: group.tolist()).to_dict()
movies_df['tags'] = movies_df['movieId'].apply(lambda id: movie_tags_dict.get(id, []))

movies_df.head()

Unnamed: 0,movieId,title,genres,year,tags
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0,"[children, disney, animation, pixar, funny, tu..."
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995.0,"[robin williams, fantasy, time travel, animals..."
2,3,Grumpier Old Men,"[Comedy, Romance]",1995.0,"[comedinha de velhinhos engraãƒâ§ada, comedinh..."
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995.0,"[characters, slurs, based on novel or book, ch..."
4,5,Father of the Bride Part II,[Comedy],1995.0,"[fantasy, pregnancy, remake, family, steve mar..."


### Fetch the details of every movie

In [10]:
import utilities.data_preprocessing as dp
import importlib
import json
importlib.reload(dp)

with open("access_token.json", 'r') as file:
        data = json.load(file)
        access_token = data.get("access_token")

response = dp.get_movie_details(19,access_token)

print(response)

{'adult': False, 'backdrop_path': '/eeMoFKxjjiCi6iep2GEZtSAMYIr.jpg', 'belongs_to_collection': None, 'budget': 5300000, 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 878, 'name': 'Science Fiction'}], 'homepage': '', 'id': 19, 'imdb_id': 'tt0017136', 'origin_country': ['DE'], 'original_language': 'de', 'original_title': 'Metropolis', 'overview': "In a futuristic city sharply divided between the rich and the poor, the son of the city's mastermind meets a prophet who predicts the coming of a savior to mediate their differences.", 'popularity': 19.965, 'poster_path': '/vZIJxGnjcswPCAa52jhbl01FQkV.jpg', 'production_companies': [{'id': 12372, 'logo_path': '/lcQScIQXHQXuLMLFQt0loJl5gWJ.png', 'name': 'UFA', 'origin_country': 'DE'}], 'production_countries': [{'iso_3166_1': 'DE', 'name': 'Germany'}], 'release_date': '1927-02-06', 'revenue': 1350322, 'runtime': 148, 'spoken_languages': [{'english_name': 'No Language', 'iso_639_1': 'xx', 'name': 'No Language'}], 'status': 'Released', 'tagline': '

## **!!!DO NOT RUN IF FILE IS PRESENT!!!**

In [12]:
import utilities.data_preprocessing as dp
import importlib
importlib.reload(dp)

movie_list = movies_df['movieId'].tolist()

dp.process_movie_list(movie_list, "access_token.json", "data/movie_details.csv")

Movie details saved to data/movie_details.csv incrementally.


# Solution 1: Recommendations based on cosine similarity

### Combine all features into a single one

In [11]:
import utilities.data_preprocessing as dp
import pandas as pd
import importlib
import spacy
import nltk
from nltk.corpus import stopwords
importlib.reload(dp)

nlp = spacy.load("en_core_web_md")
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


movies_df = pd.read_csv("data/movies.csv")
movie_details_df = pd.read_csv("data/movie_details.csv")
tags_df = pd.read_csv("data/tags.csv")


tags_df["tag"] = dp.check_data_type(tags_df["tag"], str)
tags_df = tags_df.dropna()
tags_df["tag"] = tags_df["tag"].apply(lambda value: value.lower())


movie_tags_dict = tags_df.groupby('movieId')["tag"].unique().apply(lambda group: group.tolist()).to_dict()
movies_df['tags'] = movies_df['movieId'].apply(lambda id: movie_tags_dict.get(id, []))


movies_df = pd.merge(movies_df, movie_details_df, on="movieId", how="inner",suffixes=('_left', '_right'))

movies_df = movies_df.drop(columns=['title_right', 'genres_left'])
movies_df = movies_df.rename(columns={'title_left': 'title', 'genres_right': 'genres'})


movies_df['overview'] = movies_df['overview'].apply(
   lambda x: dp.preprocess_text(x, nlp, stop_words)
)


def combine_features(row):
   features = [
      str(row['title']), 
      str(row['genres']), 
      str(row['production_companies']), 
      str(row['overview']), 
      " ".join(row['tags'])
   ]
   return " ".join(features)

movies_df['combined_features'] = movies_df.apply(combine_features, axis=1)
movies_df = movies_df.dropna(subset=['combined_features'])

print(movies_df[['movieId', 'title', 'combined_features']].head())

movies_df.to_csv("data/movies_with_combined_features.csv", index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   movieId                               title  \
0        2                      Jumanji (1995)   
1        3             Grumpier Old Men (1995)   
2        5  Father of the Bride Part II (1995)   
3        6                         Heat (1995)   
4        8                 Tom and Huck (1995)   

                                   combined_features  
0  Jumanji (1995) Comedy, Drama, Romance, Crime V...  
1  Grumpier Old Men (1995) Comedy, Drama, Romance...  
2  Father of the Bride Part II (1995) Comedy Mira...  
3  Heat (1995) Action, Crime, Thriller Largo Ente...  
4  Tom and Huck (1995) Documentary inLoops timo n...  


## Vectorization of the combined features

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=6666)

vectorized_data = tfidf.fit_transform(movies_df['combined_features'].values)
vectorized_dataframe = pd.DataFrame(vectorized_data.toarray(), index=movies_df['combined_features'].index.tolist())
vectorized_dataframe.shape

(50404, 6666)

## Reducing the dimension of the vectorized data

In [13]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2222)

reduced_data = svd.fit_transform(vectorized_dataframe)
reduced_data.shape

(50404, 2222)

# Cosine similarity recommendation

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(reduced_data)

def recommendation(movie_title):
   id_of_movie = movies_df[movies_df['title'].str.contains(movie_title,case=False)].index[0]
   distances = similarity[id_of_movie]
   movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
   
   for i in movie_list:
      print(movies_df.iloc[i[0]].title)

recommendation("Star wars")

2001: A Space Odyssey (1968)
Rogue One: A Star Wars Story (2016)
Star Wars: Episode VII - The Force Awakens (2015)
Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Two Towers, The (2002)
Gravity (2013)
Space Sweepers (2021)
Jupiter Ascending (2015)
Dark Star (1974)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

ratings_df = pd.read_csv("data/ratings.csv")
tags_df = pd.read_csv("data/tags.csv")
movies_df = pd.read_csv("data/movies.csv")

tags_df['tag'] = tags_df['tag'].fillna("").astype(str)

tags_combined = tags_df.groupby('movieId')['tag'].apply(lambda tags: " ".join(tags)).reset_index()

movies_with_tags = pd.merge(movies_df, tags_combined, on='movieId', how='left')
movies_with_tags['tag'] = movies_with_tags['tag'].fillna("")

In [43]:
def get_user_liked_movies(user_id, threshold=4.0):
    liked_movies = ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= threshold)]['movieId']
    return movies_with_tags[movies_with_tags['movieId'].isin(liked_movies)]


In [None]:
from collections import Counter

def build_user_profile(liked_movies, top_n_tags=5, max_repeats=5):
    tags_combined = " ".join(liked_movies['tag'])
    tag_counts = Counter(tags_combined.split())
    
    limited_tag_counts = {tag: min(count, max_repeats) for tag, count in tag_counts.items()}
    
    limited_tags = " ".join([tag for tag, count in limited_tag_counts.items() for _ in range(count)])
    
    most_common_tags = Counter(limited_tags.split()).most_common(top_n_tags)
    user_profile = " ".join([tag for tag, _ in most_common_tags])
    
    return user_profile


In [None]:
def build_user_profile_normalized(liked_movies, top_n_tags=5):
    tags_combined = " ".join(liked_movies['tag'])
    tag_counts = Counter(tags_combined.split())
    
    total_tags = sum(tag_counts.values())
    normalized_tags = {tag: count / total_tags for tag, count in tag_counts.items()}
    
    most_common_tags = sorted(normalized_tags.items(), key=lambda x: x[1], reverse=True)[:top_n_tags]
    user_profile = " ".join([tag for tag, _ in most_common_tags])
    
    return user_profile


In [None]:
def calculate_similarity(user_profile, movie_tags):
    unique_user_tags = set(user_profile.split())
    tag_weights = {tag: user_profile.split().count(tag) ** -0.5 for tag in unique_user_tags}
    
    overlap_score = sum(tag_weights.get(tag, 0) for tag in movie_tags.split())
    return overlap_score


In [None]:
def recommend_movies_by_user_tags(user_id, top_n=5, top_n_tags=50, verbose=False):
    liked_movies = get_user_liked_movies(user_id)
    if verbose:
        print(f"Movies liked by User {user_id}:\n{liked_movies[['title', 'tag']]}\n")
    
    user_profile = build_user_profile(liked_movies, top_n_tags=top_n_tags)
    if verbose:
        print(f"User {user_id} preference profile (tags combined):\n{user_profile}\n")
    
    tfidf = TfidfVectorizer(max_features=5000)
    tfidf_input = pd.concat([movies_with_tags['tag'], pd.Series(user_profile)], ignore_index=True)
    tfidf_matrix = tfidf.fit_transform(tfidf_input)
    
    user_profile_vector = tfidf_matrix[-1] 
    movie_vectors = tfidf_matrix[:-1]    
    similarity_scores = cosine_similarity(user_profile_vector, movie_vectors)[0]
    
    movies_with_tags['similarity'] = similarity_scores
    recommended_movies = movies_with_tags[~movies_with_tags['movieId'].isin(liked_movies['movieId'])]
    recommended_movies = recommended_movies.sort_values(by='similarity', ascending=False).head(top_n)
    
    return recommended_movies[['title', 'similarity']]


In [None]:
recommendations = recommend_movies_by_user_tags(user_id=22, top_n=5, top_n_tags=10, verbose=True)

print("Recommended Movies for User 22:")
print(recommendations)


Movies liked by User 22:
                                                   title  \
257            Star Wars: Episode IV - A New Hope (1977)   
314                     Shawshank Redemption, The (1994)   
466                          Hudsucker Proxy, The (1994)   
545               Nightmare Before Christmas, The (1993)   
729               Wallace & Gromit: A Close Shave (1995)   
1108              Monty Python and the Holy Grail (1975)   
1120         Wallace & Gromit: The Wrong Trousers (1993)   
1166   Star Wars: Episode V - The Empire Strikes Back...   
1167                          Princess Bride, The (1987)   
1168   Raiders of the Lost Ark (Indiana Jones and the...   
1173                                 12 Angry Men (1957)   
1358                              Raising Arizona (1987)   
1527                                      Contact (1997)   
1666                            Big Lebowski, The (1998)   
3924                   O Brother, Where Art Thou? (2000)   
4868   Amelie (