In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [4]:

import numpy as np
from sklearn.metrics import pairwise_distances

In [5]:
import pandas as pd
movies_df = pd.read_csv('movies.csv', sep='\t', encoding='latin-1', usecols=['movie_id', 'title', 'genres'])

In [6]:
movies_df.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
all_genres = movies_df['genres'].str.split('|').explode()

# Get unique genres from the list
unique_genres = all_genres.unique()

# Count the number of unique genres
num_unique_genres = len(unique_genres)
num_unique_genres, unique_genres

(18,
 array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
        'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
        'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
        'Western'], dtype=object))

In [8]:
movies_df['genres'] = movies_df['genres'].str.replace('|', ' ')
movies_df

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation Children's Comedy
1,2,Jumanji (1995),Adventure Children's Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [9]:
movies_df['genres'].fillna("").astype('str')

0        Animation Children's Comedy
1       Adventure Children's Fantasy
2                     Comedy Romance
3                       Comedy Drama
4                             Comedy
                    ...             
3878                          Comedy
3879                           Drama
3880                           Drama
3881                           Drama
3882                  Drama Thriller
Name: genres, Length: 3883, dtype: object

In [10]:
ratings_df = pd.read_csv('ratings.csv', sep='\t', encoding='latin-1')

In [11]:
ratings_df

Unnamed: 0.1,Unnamed: 0,user_id,movie_id,rating,timestamp,user_emb_id,movie_emb_id
0,0,1,1193,5,978300760,0,1192
1,1,1,661,3,978302109,0,660
2,2,1,914,3,978301968,0,913
3,3,1,3408,4,978300275,0,3407
4,4,1,2355,5,978824291,0,2354
...,...,...,...,...,...,...,...
1000204,1000204,6040,1091,1,956716541,6039,1090
1000205,1000205,6040,1094,5,956704887,6039,1093
1000206,1000206,6040,562,5,956704746,6039,561
1000207,1000207,6040,1096,4,956715648,6039,1095


# 1) Changing input data for TFIDF

## TFIDF TRAINED ON ONLY MOVIE GENRE

In [12]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 1),min_df=0.0)

In [13]:
tfidf_matrix_genres = tf.fit_transform(movies_df['genres'])


In [14]:
(tfidf_matrix_genres.shape)

(3883, 20)

In [15]:
cosine_sim = linear_kernel(tfidf_matrix_genres, tfidf_matrix_genres)

In [16]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [17]:
rec_only_genres=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [18]:
rec_only_genres

25                                       Othello (1995)
26                                  Now and Then (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                               Dangerous Minds (1995)
35                              Dead Man Walking (1995)
39                      Cry, the Beloved Country (1995)
42                                   Restoration (1995)
52                                      Lamerica (1994)
54                                       Georgia (1995)
56                         Home for the Holidays (1995)
61                            Mr. Holland's Opus (1995)
66                                      Two Bits (1995)
77                           Crossing Guard, The (1995)
79         White Balloon, The (Badkonake Sefid ) (1995)
81                      Antonia's Line (Antonia) (1995)
82      Once Upon a Time... When We Were Colored (1995)
89                   Journey of August King, The (1995)
92                               Beautiful Girls

## TFIDF WITH TITLES AND GENRES

In [19]:
movies_df['combined_content'] = movies_df['title'] + " " + movies_df['genres']

In [20]:
tf =TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

In [21]:
tfidf_matrix_combined = tf.fit_transform(movies_df['combined_content'])

In [22]:
(tfidf_matrix_combined.shape)

(3883, 12883)

In [23]:
cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)

In [24]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [25]:
rec_combined=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [26]:
rec_combined

1475              All Over Me (1997)
1493                  Nowhere (1997)
1726       As Good As It Gets (1997)
2208    Somewhere in the City (1997)
2626                Boys, The (1997)
3380         Good Mother, The (1988)
1670     Sweet Hereafter, The (1997)
1677             Postman, The (1997)
1750                     Eden (1997)
2493                  Bandits (1997)
2996                Ten Benny (1997)
3378          Good Earth, The (1937)
2199          Few Good Men, A (1992)
1103                   Drunks (1997)
1417              Prefontaine (1997)
1438                 Rosewood (1997)
1474                Traveller (1997)
1567                      187 (1997)
1585             Locusts, The (1997)
Name: title, dtype: object

## The interesting thing we found is that by traininng with the title names, if you do not remove the year from the title, the recommendations will be quite similar in those terms, but if that is good or bad really depends on the overall goal of the system
### Here we can see how such factors can influence a content based model. 
### We will now try it with removing the year from the title to see how it affects our model

## TFIDF WITH CLEANED TITLES

In [27]:
movies_df['clean_title'] = movies_df['title']


In [28]:
def remove_year(title):
    if '(' in title and ')' in title:
        title = title.split('(')[0]
    return title.strip()

movies_df['clean_title'] = movies_df['clean_title'].apply(remove_year)

In [29]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story (1995) Animation Children's Comedy,Toy Story
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji (1995) Adventure Children's Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men (1995) Comedy Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale (1995) Comedy Drama,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy,Father of the Bride Part II
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents (2000) Comedy,Meet the Parents
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream (2000) Drama,Requiem for a Dream
3880,3950,Tigerland (2000),Drama,Tigerland (2000) Drama,Tigerland
3881,3951,Two Family House (2000),Drama,Two Family House (2000) Drama,Two Family House


In [30]:
movies_df['combined_content'] = movies_df['clean_title'] + " " + movies_df['genres']

In [31]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II
...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House


In [32]:
tf =TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0.0, stop_words='english')

In [33]:
tfidf_matrix_combined = tf.fit_transform(movies_df['combined_content'])

In [34]:
(tfidf_matrix_combined.shape)

(3883, 10471)

In [35]:
cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)

In [36]:
# Construct a reverse mapping of indices and movie titles
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [37]:
rec_combined_without_year=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [38]:
rec_combined_without_year 

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [39]:
rec_combined 

1475              All Over Me (1997)
1493                  Nowhere (1997)
1726       As Good As It Gets (1997)
2208    Somewhere in the City (1997)
2626                Boys, The (1997)
3380         Good Mother, The (1988)
1670     Sweet Hereafter, The (1997)
1677             Postman, The (1997)
1750                     Eden (1997)
2493                  Bandits (1997)
2996                Ten Benny (1997)
3378          Good Earth, The (1937)
2199          Few Good Men, A (1992)
1103                   Drunks (1997)
1417              Prefontaine (1997)
1438                 Rosewood (1997)
1474                Traveller (1997)
1567                      187 (1997)
1585             Locusts, The (1997)
Name: title, dtype: object

In [40]:
rec_only_genres

25                                       Othello (1995)
26                                  Now and Then (1995)
29    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
30                               Dangerous Minds (1995)
35                              Dead Man Walking (1995)
39                      Cry, the Beloved Country (1995)
42                                   Restoration (1995)
52                                      Lamerica (1994)
54                                       Georgia (1995)
56                         Home for the Holidays (1995)
61                            Mr. Holland's Opus (1995)
66                                      Two Bits (1995)
77                           Crossing Guard, The (1995)
79         White Balloon, The (Badkonake Sefid ) (1995)
81                      Antonia's Line (Antonia) (1995)
82      Once Upon a Time... When We Were Colored (1995)
89                   Journey of August King, The (1995)
92                               Beautiful Girls

# TFIDF BUT USING OTHER DISTANCE METRICS 
## Manhattan Distance, Pearsons Correrelation Coefficient

In [41]:
from sklearn.metrics import pairwise_distances

manhattan_distance = pairwise_distances(tfidf_matrix_combined, metric='manhattan')


In [42]:
manhattan_similarity = 1 / (1 + manhattan_distance)


In [43]:
manhattan_similarity

array([[1.        , 0.16903147, 0.15590055, ..., 0.1861037 , 0.16919777,
        0.1721361 ],
       [0.16903147, 1.        , 0.1581829 , ..., 0.19758144, 0.17863206,
        0.18191038],
       [0.15590055, 0.1581829 , 1.        , ..., 0.18647447, 0.16950418,
        0.17245326],
       ...,
       [0.1861037 , 0.19758144, 0.18647447, ..., 1.        , 0.22760129,
        0.23479629],
       [0.16919777, 0.17863206, 0.16950418, ..., 0.22760129, 1.        ,
        0.20705406],
       [0.1721361 , 0.18191038, 0.17245326, ..., 0.23479629, 0.20705406,
        1.        ]])

In [44]:


# Calculate the Pearson Correlation Coefficient
pearson_correlation = np.corrcoef(tfidf_matrix_combined.toarray())


In [45]:
print(pearson_correlation.shape)


(3883, 3883)


In [46]:
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(manhattan_similarity[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [47]:
rec_combined_without_year_manhattan=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [48]:
indices = pd.Series(movies_df.index, index=movies_df['title']).to_dict()

def get_recommendations(title, k):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwise similarity scores with that movie
    sim_scores = list(enumerate(pearson_correlation[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:k]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
   
    # Return the top 10 most similar movies
    return movies_df['title'].iloc[movie_indices]

In [49]:
rec_combined_without_year_pearsons=get_recommendations('Good Will Hunting (1997)',20).head(20)

In [63]:
rec_combined_without_year_manhattan

26                               Now and Then (1995)
382                                    S.F.W. (1994)
1126                              Get Over It (1996)
1231                                    8 1/2 (1963)
1475                              All Over Me (1997)
1493                                  Nowhere (1997)
1516                          To Have, or Not (1995)
1803                                   Go Now (1995)
1853                                 Whatever (1998)
2216                                   If.... (1968)
2864         Fire Within, The (Le Feu Follet) (1963)
2982                        Anywhere But Here (1999)
3220    Not One Less (Yi ge dou bu neng shao) (1999)
190                                 Show, The (1995)
1272                              Being There (1979)
1460                                  B*A*P*S (1997)
1572                                 In & Out (1997)
1865               You Can't Take It With You (1938)
2511                                       Go 

In [51]:
rec_combined_without_year_pearsons

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [52]:
rec_combined_without_year

3378                            Good Earth, The (1937)
3380                           Good Mother, The (1988)
2199                            Few Good Men, A (1992)
1726                         As Good As It Gets (1997)
458                       Good Man in Africa, A (1994)
3379                      Good Morning, Vietnam (1987)
1183            Good, The Bad and The Ugly, The (1966)
1663    Midnight in the Garden of Good and Evil (1997)
26                                 Now and Then (1995)
382                                      S.F.W. (1994)
1126                                Get Over It (1996)
1231                                      8 1/2 (1963)
1475                                All Over Me (1997)
1493                                    Nowhere (1997)
1516                            To Have, or Not (1995)
1803                                     Go Now (1995)
1853                                   Whatever (1998)
2216                                     If.... (1968)
2864      

In [53]:
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


# WORD2VEC

In [54]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert titles to lowercase and tokenize
movies_df['tokens'] = movies_df['title'].str.lower().apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/FYP/nipun001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [55]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title,tokens
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story,"[toy, story, (, 1995, )]"
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji,"[jumanji, (, 1995, )]"
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men,"[grumpier, old, men, (, 1995, )]"
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale,"[waiting, to, exhale, (, 1995, )]"
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II,"[father, of, the, bride, part, ii, (, 1995, )]"
...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents,"[meet, the, parents, (, 2000, )]"
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream,"[requiem, for, a, dream, (, 2000, )]"
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland,"[tigerland, (, 2000, )]"
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House,"[two, family, house, (, 2000, )]"


In [98]:
from gensim.models import Word2Vec

# Train the model
model = Word2Vec(sentences=movies_df['tokens'], vector_size=50, window=5, min_count=1)


In [99]:
import numpy as np

def get_title_embedding(title_tokens):
    embeddings = [model.wv[token] for token in title_tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

movies_df['title_embedding'] = movies_df['tokens'].apply(get_title_embedding)


In [100]:
movies_df['title_embedding']

0       [-0.35110056, 0.20834465, -0.11387561, -0.0544...
1       [-0.39149696, 0.23860687, -0.1231528, -0.05967...
2       [-0.31545684, 0.18669872, -0.10016296, -0.0495...
3       [-0.35059738, 0.2144134, -0.107361175, -0.0606...
4       [-0.3623083, 0.21314543, -0.11584376, -0.06079...
                              ...                        
3878    [-0.3444371, 0.20149797, -0.103228986, -0.0489...
3879    [-0.34421477, 0.20708129, -0.10991713, -0.0513...
3880    [-0.38391754, 0.22588325, -0.121586286, -0.056...
3881    [-0.31579086, 0.19382073, -0.10466689, -0.0472...
3882    [-0.40558735, 0.23630808, -0.12762702, -0.0607...
Name: title_embedding, Length: 3883, dtype: object

In [101]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres by "|" and encode
mlb = MultiLabelBinarizer()
movies_df['genres_list'] = movies_df['genres'].str.split('|')
genre_encoded = mlb.fit_transform(movies_df['genres_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)


In [102]:
from sklearn.preprocessing import MinMaxScaler

# Scale title embeddings
scaler = MinMaxScaler()
scaled_embeddings = scaler.fit_transform(movies_df['title_embedding'].tolist())


In [103]:
combined_features = np.hstack((scaled_embeddings, genre_df.values))


In [104]:

# Compute cosine similarities
similarities = linear_kernel(combined_features)


In [105]:
len(similarities[0])

3883

In [65]:

def get_combined_recommendations(title, num_recommendations=10):
    idx = movies_df[movies_df['title'] == title].index[0]
    sim_scores = list(enumerate(similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]  # Exclude the input movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_df['title'].iloc[movie_indices]




In [66]:
print(get_combined_recommendations('Good Will Hunting (1997)',20))

2978     Experience Preferred... But Not Essential (1982)
2778                        Only Angels Have Wings (1939)
720     Institute Benjamenta, or This Dream People Cal...
1886                             Kramer Vs. Kramer (1979)
82        Once Upon a Time... When We Were Colored (1995)
2161                         Always Tell Your Wife (1923)
2170    Swept Away (Travolti da un insolito destino ne...
3220         Not One Less (Yi ge dou bu neng shao) (1999)
949                         Little Lord Fauntleroy (1936)
2636    Late August, Early September (Fin aoÃ»t, dÃ©bu...
648                      Und keiner weint mir nach (1996)
2028               Something Wicked This Way Comes (1983)
3063                               Daddy Long Legs (1919)
1836                          Marie Baie Des Anges (1997)
2095                           Surf Nazis Must Die (1987)
1148          Two or Three Things I Know About Her (1966)
961                                  Meet John Doe (1941)
1866          

In [67]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

# Convert titles to lowercase and tokenize
movies_df['tokens'] = movies_df['title'].str.lower().apply(word_tokenize)


[nltk_data] Downloading package punkt to
[nltk_data]     /home/FYP/nipun001/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [68]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title,tokens,title_embedding,genres_list
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story,"[toy, story, (, 1995, )]","[-0.35262355, 0.21018216, -0.11309519, -0.0541...",[Animation Children's Comedy]
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji,"[jumanji, (, 1995, )]","[-0.39333522, 0.2407487, -0.12231919, -0.05939...",[Adventure Children's Fantasy]
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men,"[grumpier, old, men, (, 1995, )]","[-0.3168321, 0.18835324, -0.09946834, -0.04933...",[Comedy Romance]
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale,"[waiting, to, exhale, (, 1995, )]","[-0.35216215, 0.21629919, -0.10654348, -0.0603...",[Comedy Drama]
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II,"[father, of, the, bride, part, ii, (, 1995, )]","[-0.3637832, 0.21499942, -0.11500202, -0.06050...",[Comedy]
...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents,"[meet, the, parents, (, 2000, )]","[-0.34609175, 0.20340843, -0.102523856, -0.048...",[Comedy]
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream,"[requiem, for, a, dream, (, 2000, )]","[-0.34573635, 0.20892248, -0.10909706, -0.0510...",[Drama]
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland,"[tigerland, (, 2000, )]","[-0.38561624, 0.22790691, -0.12071073, -0.0557...",[Drama]
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House,"[two, family, house, (, 2000, )]","[-0.31723642, 0.19551755, -0.10393145, -0.0470...",[Drama]


In [69]:
from gensim.models import Word2Vec

# Train the model
model = Word2Vec(sentences=movies_df['tokens'], vector_size=50, window=5, min_count=1)


In [70]:
import numpy as np

def get_title_embedding(title_tokens):
    embeddings = [model.wv[token] for token in title_tokens if token in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

movies_df['title_embedding'] = movies_df['tokens'].apply(get_title_embedding)


In [71]:
movies_df['title_embedding']

0       [-0.3586381, 0.21743126, -0.11372642, -0.05476...
1       [-0.3998636, 0.24879414, -0.123040065, -0.0599...
2       [-0.3217158, 0.19448669, -0.09997342, -0.04975...
3       [-0.35751662, 0.22319867, -0.10698814, -0.0607...
4       [-0.3698722, 0.22245795, -0.11583826, -0.06089...
                              ...                        
3878    [-0.3515142, 0.21021126, -0.103111684, -0.0491...
3879    [-0.35147074, 0.21595035, -0.109595545, -0.051...
3880    [-0.39140192, 0.23522708, -0.12105024, -0.0562...
3881    [-0.3218303, 0.20148814, -0.10415309, -0.04745...
3882    [-0.41413784, 0.24668558, -0.12736605, -0.0609...
Name: title_embedding, Length: 3883, dtype: object

In [72]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split genres by "|" and encode
mlb = MultiLabelBinarizer()
movies_df['genres_list'] = movies_df['genres'].str.split('|')
genre_encoded = mlb.fit_transform(movies_df['genres_list'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)


In [73]:
from sklearn.preprocessing import MinMaxScaler

# Scale title embeddings
scaler = MinMaxScaler()
scaled_embeddings = scaler.fit_transform(movies_df['title_embedding'].tolist())


In [74]:
combined_features = np.hstack((scaled_embeddings, genre_df.values))


In [77]:

# Compute cosine similarities
similarities = linear_kernel(combined_features)


In [78]:

def get_combined_recommendations(title, num_recommendations=10):
    idx = movies_df[movies_df['title'] == title].index[0]
    sim_scores = list(enumerate(similarities[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:num_recommendations+1]  # Exclude the input movie itself
    movie_indices = [i[0] for i in sim_scores]
    return movies_df['title'].iloc[movie_indices]




In [79]:
print(get_combined_recommendations('Good Will Hunting (1997)',20))

2978     Experience Preferred... But Not Essential (1982)
2778                        Only Angels Have Wings (1939)
720     Institute Benjamenta, or This Dream People Cal...
1886                             Kramer Vs. Kramer (1979)
82        Once Upon a Time... When We Were Colored (1995)
2161                         Always Tell Your Wife (1923)
3220         Not One Less (Yi ge dou bu neng shao) (1999)
2170    Swept Away (Travolti da un insolito destino ne...
949                         Little Lord Fauntleroy (1936)
648                      Und keiner weint mir nach (1996)
2636    Late August, Early September (Fin aoÃ»t, dÃ©bu...
2028               Something Wicked This Way Comes (1983)
1836                          Marie Baie Des Anges (1997)
3063                               Daddy Long Legs (1919)
2095                           Surf Nazis Must Die (1987)
1148          Two or Three Things I Know About Her (1966)
1866                       How Green Was My Valley (1941)
961           

# Evaluation

In [82]:
# Load the user movie data

movie_data = pd.read_csv('movie_f.csv')
movie_data.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,2355,5
2,1,1287,5
3,1,2804,5
4,1,595,5


In [83]:
# Compute the cosine similarity matrix
cosine_sim_matrix = cosine_similarity(list(tfidf_vectors.values()))


NameError: name 'cosine_similarity' is not defined

In [86]:
# # Adapted get_recommendations function
# def get_user_recommendations(user_profile, k=10):
#     # Compute cosine similarity between the user profile and all movie vectors
#     sim_scores = cosine_similarity([user_profile], list(tfidf_vectors.values())).flatten()
    
#     # Get the indices of the top k movies
#     movie_indices = sim_scores.argsort()[-k:][::-1]
    
#     # Convert indices to movie IDs (for the sake of this example, we're using a hypothetical mapping)
#     recommended_movie_ids = [f"movie_id_{idx+1}" for idx in movie_indices]
    
#     return recommended_movie_ids

# # Example usage:
# # For a hypothetical user who liked movie_id_1 and movie_id_2
# user_liked_movies = ['movie_id_1', 'movie_id_2']
# user_profile = np.mean([tfidf_vectors[movie_id] for movie_id in user_liked_movies], axis=0)

# recommendations = get_user_recommendations(user_profile, k=5)
# recommendations

In [90]:
print(tfidf_matrix_combined.shape)


(3883, 10471)


In [91]:
movie_id_to_row_index = {movie_id: index for index, movie_id in enumerate(movie_data['movie_id'].unique())}


In [92]:
# Convert liked_movie_ids to liked_movie_indices using the mapping
liked_movie_indices = [movie_id_to_row_index[movie_id] for movie_id in liked_movie_ids]


NameError: name 'liked_movie_ids' is not defined

In [93]:
def get_user_recommendations(user_id, k=10):
    # Create a mapping from movie_id to row index in the matrix
    movie_id_to_row_index = {movie_id: index for index, movie_id in enumerate(movie_data['movie_id'].unique())}
    
    # Fetch movies liked or rated highly by the user
    liked_movies = movie_data[movie_data['user_id'] == user_id]
    
    # For simplicity, let's consider movies rated 5 as liked movies
    liked_movie_ids = liked_movies[liked_movies['rating'] == 5]['movie_id'].tolist()

    # Convert liked_movie_ids to liked_movie_indices using the mapping, 
    # and filter out any indices that are out of bounds
    liked_movie_indices = [movie_id_to_row_index[movie_id] for movie_id in liked_movie_ids if movie_id_to_row_index.get(movie_id, None) is not None and movie_id_to_row_index[movie_id] < tfidf_matrix_combined.shape[0]]
    
    # If no liked movies found in the bounds of the matrix, return an empty list or handle this scenario accordingly
    if not liked_movie_indices:
        return []

    # Create a user profile by averaging the TF-IDF vectors of movies they liked
    # Note: We reshape the user profile here.
    user_profile = np.mean(np.vstack([tfidf_matrix_combined[i].toarray() for i in liked_movie_indices]), axis=0)

    # Compute cosine similarity between the user profile and all movie vectors
    sim_scores = linear_kernel([user_profile], tfidf_matrix_combined).flatten()
    
    # Get the indices of the top k movies
    movie_indices = sim_scores.argsort()[-k:][::-1]
    
    return movie_indices


In [94]:
from sklearn.metrics.pairwise import linear_kernel

# Assuming tfidf_matrix_combined contains TF-IDF vectors for each movie
# indexed by the movie's position in the dataset.

# Assuming you've already computed:
# cosine_sim = linear_kernel(tfidf_matrix_combined, tfidf_matrix_combined)



def precision_at_k(user_id, k=10):
    # Get the top k recommendations for the user
    recommended_movie_indices = get_user_recommendations(user_id, k)

    # Fetch movies liked or rated highly by the user
    liked_movies = movie_data[movie_data['user_id'] == user_id]
    liked_movie_indices = liked_movies[liked_movies['rating'] == 5]['movie_id'].tolist()

    # Calculate how many of the top k recommended movies were actually liked by the user
    relevant_recommendations = len(set(recommended_movie_indices) & set(liked_movie_indices))

    return relevant_recommendations / k

# Compute average P@k for all users
all_users = movie_data['user_id'].unique()
average_P_at_k = np.mean([precision_at_k(user_id) for user_id in all_users])




Average Precision at k: 0.00566225165562914


In [126]:
print(f"Average Precision at k=10: 0.00566225165562914")

Average Precision at k=10: 0.00566225165562914


In [112]:

word2vec_matrix=similarities

In [110]:
def get_movie_title_from_id(movie_id):
    return movies_df[movies_df['movie_id'] == movie_id]['title'].iloc[0]



In [121]:
movie_data

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,2355,5
2,1,1287,5
3,1,2804,5
4,1,595,5
...,...,...,...
226565,6040,1077,5
226566,6040,2022,5
226567,6040,2028,5
226568,6040,1094,5


In [122]:
# Combine 'movie_id' and 'title' columns in movies_df
movies_df['combined_id_title'] = movies_df['movie_id'].astype(str) + ': ' + movies_df['title']


In [123]:
movies_df

Unnamed: 0,movie_id,title,genres,combined_content,clean_title,tokens,title_embedding,genres_list,combined_id_title
0,1,Toy Story (1995),Animation Children's Comedy,Toy Story Animation Children's Comedy,Toy Story,"[toy, story, (, 1995, )]","[-0.35110056, 0.20834465, -0.11387561, -0.0544...",[Animation Children's Comedy],1: Toy Story (1995)
1,2,Jumanji (1995),Adventure Children's Fantasy,Jumanji Adventure Children's Fantasy,Jumanji,"[jumanji, (, 1995, )]","[-0.39149696, 0.23860687, -0.1231528, -0.05967...",[Adventure Children's Fantasy],2: Jumanji (1995)
2,3,Grumpier Old Men (1995),Comedy Romance,Grumpier Old Men Comedy Romance,Grumpier Old Men,"[grumpier, old, men, (, 1995, )]","[-0.31545684, 0.18669872, -0.10016296, -0.0495...",[Comedy Romance],3: Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995),Comedy Drama,Waiting to Exhale Comedy Drama,Waiting to Exhale,"[waiting, to, exhale, (, 1995, )]","[-0.35059738, 0.2144134, -0.107361175, -0.0606...",[Comedy Drama],4: Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II Comedy,Father of the Bride Part II,"[father, of, the, bride, part, ii, (, 1995, )]","[-0.3623083, 0.21314543, -0.11584376, -0.06079...",[Comedy],5: Father of the Bride Part II (1995)
...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,Meet the Parents Comedy,Meet the Parents,"[meet, the, parents, (, 2000, )]","[-0.3444371, 0.20149797, -0.103228986, -0.0489...",[Comedy],3948: Meet the Parents (2000)
3879,3949,Requiem for a Dream (2000),Drama,Requiem for a Dream Drama,Requiem for a Dream,"[requiem, for, a, dream, (, 2000, )]","[-0.34421477, 0.20708129, -0.10991713, -0.0513...",[Drama],3949: Requiem for a Dream (2000)
3880,3950,Tigerland (2000),Drama,Tigerland Drama,Tigerland,"[tigerland, (, 2000, )]","[-0.38391754, 0.22588325, -0.121586286, -0.056...",[Drama],3950: Tigerland (2000)
3881,3951,Two Family House (2000),Drama,Two Family House Drama,Two Family House,"[two, family, house, (, 2000, )]","[-0.31579086, 0.19382073, -0.10466689, -0.0472...",[Drama],3951: Two Family House (2000)


In [None]:
def get_movie_title_from_id(movie_id):
    movie_slice = movies_df[movies_df['movie_id'] == movie_id]['title']
    
    if not movie_slice.empty:  # Check if the slice is not empty
        return movie_slice.iloc[0]
    else:
        return None  # or return some default string like 'Unknown'


In [None]:
def get_movie_id_from_title(title):
    return movies_df[movies_df['title'] == title]['movie_id'].iloc[0]


In [181]:
def get_user_recommendations_for_word2vec(user_id, num_recommendations=10):
    liked_movies = movie_data[movie_data['user_id'] == user_id]
    liked_movie_titles = [get_movie_title_from_id(mid) for mid in liked_movies[liked_movies['rating'] == 5]['movie_id'].tolist()]

    # Filter out titles that are not in the word2vec_matrix
    valid_titles = [title for title in liked_movie_titles if get_movie_id_from_title(title) < word2vec_matrix.shape[0]]

    if not valid_titles:
        return []

    # Create a user profile by averaging the Word2Vec vectors of movies they liked
    user_profile = np.mean([word2vec_matrix[get_movie_id_from_title(title)] for title in valid_titles], axis=0)

    sim_scores = linear_kernel([user_profile], word2vec_matrix)[0] 

    # Get the indices of the top movies based on similarity scores
    recommended_movie_ids = np.argsort(sim_scores)[-num_recommendations:][::-1]  # sorts and fetches top indices
    
    movie_titles = [get_movie_title_from_id(mid) for mid in recommended_movie_ids]

    return movie_titles


In [None]:
from tqdm import tqdm
def precision_at_k(user_id, k=10):
    recommended_movie_titles = get_user_recommendations_for_word2vec(user_id, k)
    liked_movies = movie_data[movie_data['user_id'] == user_id]
    liked_movie_titles = [get_movie_title_from_id(mid) for mid in liked_movies[liked_movies['rating'] == 5]['movie_id'].tolist()]
    # print(liked_movie_titles)
    # print(recommended_movie_titles)
    relevant_recommendations = len(set(recommended_movie_titles) & set(liked_movie_titles))

    
    return relevant_recommendations / k

all_users = movie_data['user_id'].unique()
average_P_at_k = np.mean([precision_at_k(user_id,10) for user_id in all_users])


In [187]:
print(f"Average Precision at k = 10: {average_P_at_k}")

Average Precision at k = 10: 0.015927152317880795


# END

In [186]:
get_user_recommendations_for_word2vec(1, 100)

['S.F.W. (1994)',
 'I Got the Hook Up (1998)',
 'Cook the Thief His Wife & Her Lover, The (1989)',
 None,
 'Out of the Past (1947)',
 'Exorcist II: The Heretic (1977)',
 'Homegrown (1998)',
 'Shaggy D.A., The (1976)',
 'Being John Malkovich (1999)',
 'Still Breathing (1997)',
 'Heartburn (1986)',
 'Birds, The (1963)',
 'Affair of Love, An (Une Liaison Pornographique) (1999)',
 'Symphonie pastorale, La (1946)',
 'Illtown (1996)',
 'Cobb (1994)',
 'Matilda (1996)',
 'Poison Ivy (1992)',
 'Great White Hype, The (1996)',
 'Die Hard 2 (1990)',
 'Touch of Evil (1958)',
 'Dumbo (1941)',
 'If Lucy Fell (1996)',
 'On Golden Pond (1981)',
 'Inventing the Abbotts (1997)',
 'Life Less Ordinary, A (1997)',
 'Moonstruck (1987)',
 'To Have, or Not (1995)',
 'Scream of Stone (Schrei aus Stein) (1991)',
 "Jumpin' Jack Flash (1986)",
 'Good Man in Africa, A (1994)',
 None,
 'Amos & Andrew (1993)',
 'People vs. Larry Flynt, The (1996)',
 "Someone Else's America (1995)",
 'Night Flier (1997)',
 'Easy Mone

In [134]:
liked_movies = movie_data[movie_data['user_id'] == 1]
liked_movie_titles = [get_movie_title_from_id(mid) for mid in liked_movies[liked_movies['rating'] == 5]['movie_id'].tolist()]


In [136]:
liked_movie_titles

["One Flew Over the Cuckoo's Nest (1975)",
 "Bug's Life, A (1998)",
 'Ben-Hur (1959)',
 'Christmas Story, A (1983)',
 'Beauty and the Beast (1991)',
 'Sound of Music, The (1965)',
 'Awakenings (1990)',
 'Back to the Future (1985)',
 "Schindler's List (1993)",
 'Pocahontas (1995)',
 'Last Days of Disco, The (1998)',
 'Cinderella (1950)',
 'Apollo 13 (1995)',
 'Toy Story (1995)',
 'Rain Man (1988)',
 'Mary Poppins (1964)',
 'Dumbo (1941)',
 'Saving Private Ryan (1998)']

In [138]:
valid_titles = [title for title in liked_movie_titles if get_movie_id_from_title(title) < word2vec_matrix.shape[0]]
valid_titles

["One Flew Over the Cuckoo's Nest (1975)",
 "Bug's Life, A (1998)",
 'Ben-Hur (1959)',
 'Christmas Story, A (1983)',
 'Beauty and the Beast (1991)',
 'Sound of Music, The (1965)',
 'Awakenings (1990)',
 'Back to the Future (1985)',
 "Schindler's List (1993)",
 'Pocahontas (1995)',
 'Last Days of Disco, The (1998)',
 'Cinderella (1950)',
 'Apollo 13 (1995)',
 'Toy Story (1995)',
 'Rain Man (1988)',
 'Mary Poppins (1964)',
 'Dumbo (1941)',
 'Saving Private Ryan (1998)']

In [140]:
# Create a user profile by averaging the Word2Vec vectors of movies they liked
user_profile = np.mean([word2vec_matrix[get_movie_id_from_title(title)] for title in valid_titles], axis=0)
user_profile

array([12.08581777, 12.41044991, 13.16569507, ..., 12.13722309,
       13.11593839, 12.0620283 ])

In [158]:
# Compute similarities between user profile and all movie vectors
sim_scores = list(enumerate(linear_kernel([user_profile], similarities)))

# Sort by similarity scores
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

# Get the top movie titles
movie_titles = [get_movie_title_from_id(i[0]) for i in sim_scores[:10]]


In [173]:
for i in sim_scores[:10]:
    print(i[1])
    print(get_movie_title_from_id(i[1]))

[581705.75959142 609740.68698731 618315.22157904 ... 599839.62936544
 626579.41029563 599967.27470421]
None


In [174]:
sim_scores

[(0,
  array([581705.75959142, 609740.68698731, 618315.22157904, ...,
         599839.62936544, 626579.41029563, 599967.27470421]))]

In [179]:
sim_scores = linear_kernel([user_profile], word2vec_matrix)[0] 

# Get the indices of the top movies based on similarity scores
recommended_movie_ids = np.argsort(sim_scores)[-10:][::-1]  # sorts and fetches top indices

movie_titles = [get_movie_title_from_id(mid) for mid in recommended_movie_ids]


In [180]:
movie_titles

['S.F.W. (1994)',
 'I Got the Hook Up (1998)',
 'Cook the Thief His Wife & Her Lover, The (1989)',
 None,
 'Out of the Past (1947)',
 'Exorcist II: The Heretic (1977)',
 'Homegrown (1998)',
 'Shaggy D.A., The (1976)',
 'Being John Malkovich (1999)',
 'Still Breathing (1997)']