In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [6]:

import numpy as np
from sklearn.metrics import pairwise_distances

In [7]:
import pandas as pd
movies_df = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [8]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
all_genres = movies_df['genres'].str.split('|').explode()

# Get unique genres from the list
unique_genres = all_genres.unique()

# Count the number of unique genres
num_unique_genres = len(unique_genres)
num_unique_genres, unique_genres

(18,
 array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
        'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
        'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
        'Western'], dtype=object))

In [10]:
movies_df['genres'] = movies_df['genres'].str.replace('|', ' ')
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [11]:
movies_df['genres'].fillna("").astype('str')

0        Animation Children's Comedy
1       Adventure Children's Fantasy
2                     Comedy Romance
3                       Comedy Drama
4                             Comedy
                    ...             
3878                          Comedy
3879                           Drama
3880                           Drama
3881                           Drama
3882                  Drama Thriller
Name: genres, Length: 3883, dtype: object

In [12]:
ratings_df = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1')

In [13]:
ratings_df

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,timestamp,user_emb_id,movie_emb_id
0,0,1,1193,5,978300760,0,1192
1,1,1,661,3,978302109,0,660
2,2,1,914,3,978301968,0,913
3,3,1,3408,4,978300275,0,3407
4,4,1,2355,5,978824291,0,2354
...,...,...,...,...,...,...,...
1000204,1000204,6040,1091,1,956716541,6039,1090
1000205,1000205,6040,1094,5,956704887,6039,1093
1000206,1000206,6040,562,5,956704746,6039,561
1000207,1000207,6040,1096,4,956715648,6039,1095


# 1) Changing input data for TFIDF

## TFIDF TRAINED ON ONLY MOVIE GENRE

In [14]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 1),min_df=0.0)

In [15]:
tfidf_matrix_genres = tf.fit_transform(movies_df['genres'])


In [16]:
(tfidf_matrix_genres.shape)

(3883, 20)

In [17]:
cosine_sim = linear_kernel(tfidf_matrix_genres, tfidf_matrix_genres)

In [18]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [19]:
rec_only_genres=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [20]:
rec_only_genres

25                                       Othello (1995)
26                                  Now and Then (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                               Dangerous Minds (1995)
35                              Dead Man Walking (1995)
39                      Cry, the Beloved Country (1995)
42                                   Restoration (1995)
52                                      Lamerica (1994)
54                                       Georgia (1995)
56                         Home for the Holidays (1995)
61                            Mr. Holland's Opus (1995)
66                                      Two Bits (1995)
77                           Crossing Guard, The (1995)
79         White Balloon, The (Badkonake Sefid ) (1995)
81                      Antonia's Line (Antonia) (1995)
82      Once Upon a Time... When We Were Colored (1995)
89                   Journey of August King, The (1995)
92                               Beautiful Girls

## TFIDF WITH TITLES AND GENRES

In [21]:
movies_df['combined_content'] = movies_df['title'] + " " + movies_df['genres']

In [22]:
tf =TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

In [23]:
tfidf_matrix_combined = tf.fit_transform(movies_df['combined_content'])

In [24]:
(tfidf_matrix_combined.shape)

(3883, 12883)

In [25]:
cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)

In [26]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [27]:
rec_combined=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [28]:
rec_combined

1475              All Over Me (1997)
1493                  Nowhere (1997)
1726       As Good As It Gets (1997)
2208    Somewhere in the City (1997)
2626                Boys, The (1997)
3380         Good Mother, The (1988)
1670     Sweet Hereafter, The (1997)
1677             Postman, The (1997)
1750                     Eden (1997)
2493                  Bandits (1997)
2996                Ten Benny (1997)
3378          Good Earth, The (1937)
2199          Few Good Men, A (1992)
1103                   Drunks (1997)
1417              Prefontaine (1997)
1438                 Rosewood (1997)
1474                Traveller (1997)
1567                      187 (1997)
1585             Locusts, The (1997)
Name: title, dtype: object

## The interesting thing we found is that by traininng with the title names, if you do not remove the year from the title, the recommendations will be quite similar in those terms, but if that is good or bad really depends on the overall goal of the system
### Here we can see how such factors can influence a content based model. 
### We will now try it with removing the year from the title to see how it affects our model

## TFIDF WITH CLEANED TITLES

In [29]:
movies_df['clean_title'] = movies_df['title']


In [30]:
def remove_year(title):
    if '(' in title and ')' in title:
        title = title.split('(')[0]
    return title.strip()

movies_df['clean_title'] = movies_df['clean_title'].apply(remove_year)

In [31]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story (1995) Animation Children's Comedy,Toy Story
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji (1995) Adventure Children's Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men (1995) Comedy Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale (1995) Comedy Drama,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy,Father of the Bride Part II
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents (2000) Comedy,Meet the Parents
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream (2000) Drama,Requiem for a Dream
3880,3950,Tigerland (2000),Drama,Tigerland (2000) Drama,Tigerland
3881,3951,Two Family House (2000),Drama,Two Family House (2000) Drama,Two Family House


In [32]:
movies_df['combined_content'] = movies_df['clean_title'] + " " + movies_df['genres']

In [33]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House


In [34]:
tf =TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

In [35]:
tfidf_matrix_combined = tf.fit_transform(movies_df['combined_content'])

In [36]:
(tfidf_matrix_combined.shape)

(3883, 10471)

In [37]:
cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)

In [38]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [39]:
rec_combined_without_year=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [40]:
rec_combined_without_year 

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [41]:
rec_combined 

1475              All Over Me (1997)
1493                  Nowhere (1997)
1726       As Good As It Gets (1997)
2208    Somewhere in the City (1997)
2626                Boys, The (1997)
3380         Good Mother, The (1988)
1670     Sweet Hereafter, The (1997)
1677             Postman, The (1997)
1750                     Eden (1997)
2493                  Bandits (1997)
2996                Ten Benny (1997)
3378          Good Earth, The (1937)
2199          Few Good Men, A (1992)
1103                   Drunks (1997)
1417              Prefontaine (1997)
1438                 Rosewood (1997)
1474                Traveller (1997)
1567                      187 (1997)
1585             Locusts, The (1997)
Name: title, dtype: object

In [42]:
rec_only_genres

25                                       Othello (1995)
26                                  Now and Then (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                               Dangerous Minds (1995)
35                              Dead Man Walking (1995)
39                      Cry, the Beloved Country (1995)
42                                   Restoration (1995)
52                                      Lamerica (1994)
54                                       Georgia (1995)
56                         Home for the Holidays (1995)
61                            Mr. Holland's Opus (1995)
66                                      Two Bits (1995)
77                           Crossing Guard, The (1995)
79         White Balloon, The (Badkonake Sefid ) (1995)
81                      Antonia's Line (Antonia) (1995)
82      Once Upon a Time... When We Were Colored (1995)
89                   Journey of August King, The (1995)
92                               Beautiful Girls

# TFIDF BUT USING OTHER DISTANCE METRICS 
## Manhattan Distance, Pearsons Correrelation Coefficient

In [43]:
from sklearn.metrics import pairwise_distances

manhattan_distance = pairwise_distances(tfidf_matrix_combined, metric='manhattan')


In [44]:
manhattan_similarity = 1 / (1 + manhattan_distance)


In [45]:
manhattan_similarity

array([[1.        , 0.16903147, 0.15590055, ..., 0.1861037 , 0.16919777,
        0.1721361 ],
       [0.16903147, 1.        , 0.1581829 , ..., 0.19758144, 0.17863206,
        0.18191038],
       [0.15590055, 0.1581829 , 1.        , ..., 0.18647447, 0.16950418,
        0.17245326],
       ...,
       [0.1861037 , 0.19758144, 0.18647447, ..., 1.        , 0.22760129,
        0.23479629],
       [0.16919777, 0.17863206, 0.16950418, ..., 0.22760129, 1.        ,
        0.20705406],
       [0.1721361 , 0.18191038, 0.17245326, ..., 0.23479629, 0.20705406,
        1.        ]])

In [46]:


# Calculate the Pearson Correlation Coefficient
pearson_correlation = np.corrcoef(tfidf_matrix_combined.toarray())


In [47]:
print(pearson_correlation.shape)


(3883, 3883)


In [48]:
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(manhattan_similarity[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [49]:
rec_combined_without_year_manhattan=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [50]:
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(pearson_correlation[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [51]:
rec_combined_without_year_pearsons=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [52]:
rec_combined_without_year_manhattan

26                               Now and Then (1995)
382                                    S.F.W. (1994)
1126                              Get Over It (1996)
1231                                    8 1/2 (1963)
1475                              All Over Me (1997)
1493                                  Nowhere (1997)
1516                          To Have, or Not (1995)
1803                                   Go Now (1995)
1853                                 Whatever (1998)
2216                                   If.... (1968)
2864         Fire Within, The (Le Feu Follet) (1963)
2982                        Anywhere But Here (1999)
3220    Not One Less (Yi ge dou bu neng shao) (1999)
190                                 Show, The (1995)
1272                              Being There (1979)
1460                                  B*A*P*S (1997)
1572                                 In & Out (1997)
1865               You Can't Take It With You (1938)
2511                                       Go 

In [53]:
rec_combined_without_year_pearsons

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [54]:
rec_combined_without_year

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [58]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


# WORD2VEC

In [56]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert titles to lowercase and tokenize
movies_df['tokens'] = movies_df['title'].str.lower().apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/FYP/nipun001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [57]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title,tokens
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story,"[toy, story, (, 1995, )]"
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji,"[jumanji, (, 1995, )]"
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men,"[grumpier, old, men, (, 1995, )]"
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale,"[waiting, to, exhale, (, 1995, )]"
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II,"[father, of, the, bride, part, ii, (, 1995, )]"
...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents,"[meet, the, parents, (, 2000, )]"
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream,"[requiem, for, a, dream, (, 2000, )]"
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland,"[tigerland, (, 2000, )]"
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House,"[two, family, house, (, 2000, )]"


In [60]:
from gensim.models import Word2Vec

# Train the model
model = Word2Vec(sentences=movies_df['tokens'], vector_size=50, window=5, min_count=1)


In [62]:
import numpy as np

def get_title_embedding(title_tokens):
    embeddings = [model.wv[token] for token in title_tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

movies_df['title_embedding'] = movies_df['tokens'].apply(get_title_embedding)


In [64]:
movies_df['title_embedding']

0       [-0.19349521, 0.07656503, 0.14460446, 0.217834...
1       [-0.21410955, 0.09006852, 0.16089745, 0.247330...
2       [-0.16789703, 0.07021295, 0.12970188, 0.189012...
3       [-0.18849675, 0.082981825, 0.15280975, 0.21241...
4       [-0.19509387, 0.07866987, 0.15126096, 0.225234...
                              ...                        
3878    [-0.18108411, 0.074163504, 0.14572096, 0.21591...
3879    [-0.18508492, 0.07660105, 0.14508858, 0.223769...
3880    [-0.20049207, 0.0803373, 0.1603632, 0.23768824...
3881    [-0.15904482, 0.06748768, 0.12800017, 0.188982...
3882    [-0.20987873, 0.08468004, 0.16937529, 0.251700...
Name: title_embedding, Length: 3883, dtype: object

In [68]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres by "|" and encode
mlb = MultiLabelBinarizer()
movies_df['genres_list'] = movies_df['genres'].str.split('|')
genre_encoded = mlb.fit_transform(movies_df['genres_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)


In [69]:
from sklearn.preprocessing import MinMaxScaler

# Scale title embeddings
scaler = MinMaxScaler()
scaled_embeddings = scaler.fit_transform(movies_df['title_embedding'].tolist())


In [70]:
combined_features = np.hstack((scaled_embeddings, genre_df.values))


In [75]:

# Compute cosine similarities
similarities = cosine_similarity(combined_features)


In [79]:

def get_combined_recommendations(title, num_recommendations=10):
    idx = movies_df[movies_df['title'] == title].index[0]
    sim_scores = list(enumerate(similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]  # Exclude the input movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_df['title'].iloc[movie_indices]




In [82]:
print(get_combined_recommendations('Good Will Hunting (1997)',20))

911                                   Citizen Kane (1941)
306             Red Firecracker, Green Firecracker (1994)
1157      Cook the Thief His Wife & Her Lover, The (1989)
3508                              Two Moon Juction (1988)
2411               Dry Cleaning (Nettoyage Ã  sec) (1997)
1777                                  Nil By Mouth (1997)
3576            Cleo From 5 to 7 (ClÃ©o de 5 Ã  7) (1962)
2656                              Twin Falls Idaho (1999)
684       Under the Domin Tree (Etz Hadomim Tafus) (1994)
2661                                  Barry Lyndon (1975)
2643                                Eyes Wide Shut (1999)
3513                    Jails, Hospitals & Hip-Hop (2000)
2328                                   Mass Appeal (1984)
932                                   Lost Horizon (1937)
3116                        Snow Falling on Cedars (1999)
783     My Life and Times With Antonin Artaud (En comp...
2662       400 Blows, The (Les Quatre cents coups) (1959)
1735          