In [13]:
# ...........Content Based Movie recommendation system............(Md Toufikzaman)

In [70]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [71]:
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')

In [72]:
print(movies)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9737   193581  Black Butler: Book of the Atlantic (2017)   
9738   193583               No Game No Life: Zero (2017)   
9739   193585                               Flint (2017)   
9740   193587        Bungo Stray Dogs: Dead Apple (2018)   
9741   193609        Andrew Dice Clay: Dice Rules (1991)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  

In [73]:
print(movies.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [74]:
print(tags)

      userId  movieId               tag   timestamp
0          2    60756             funny  1445714994
1          2    60756   Highly quotable  1445714996
2          2    60756      will ferrell  1445714992
3          2    89774      Boxing story  1445715207
4          2    89774               MMA  1445715200
...      ...      ...               ...         ...
3678     606     7382         for katie  1171234019
3679     606     7936           austere  1173392334
3680     610     3265            gun fu  1493843984
3681     610     3265  heroic bloodshed  1493843978
3682     610   168248  Heroic Bloodshed  1493844270

[3683 rows x 4 columns]


In [75]:
print(tags.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB
None


In [76]:
movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [77]:
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683.0,3683.0,3683.0
mean,431.149335,27252.013576,1320032000.0
std,158.472553,43490.558803,172102500.0
min,2.0,1.0,1137179000.0
25%,424.0,1262.5,1137521000.0
50%,474.0,4454.0,1269833000.0
75%,477.0,39263.0,1498457000.0
max,610.0,193565.0,1537099000.0


In [78]:

# Merge datasets on movieId
movies = movies.merge(tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index(), on='movieId', how='left')

# Fill NaN values in 'tag' column with an empty string
movies['tag'] = movies['tag'].fillna('')

# Combine 'genres' and 'tag' columns into a single 'content' column
movies['content'] = movies['genres'] + ' ' + movies['tag']

# Create a TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the data to create the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movies['content'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a Series to map movie titles to indices
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

def get_content_based_recommendations(title, cosine_sim=cosine_sim):
    if title not in indices:
        return "Movie not found in the dataset."
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    movie_indices = [i[0] for i in sim_scores[1:11]]
    return movies['title'].iloc[movie_indices]

# Test the function with an example movie title

print(get_content_based_recommendations('Die Hard (1988)') )#example 1


print(get_content_based_recommendations('Toy Story (1995)'))  #example 2





22                       Assassins (1995)
138     Die Hard: With a Vengeance (1995)
417                 Judgment Night (1993)
793                       Die Hard (1988)
1306                     Hard Rain (1998)
1315      Replacement Killers, The (1998)
1325                 U.S. Marshals (1998)
1693                         Ronin (1998)
2062                      No Mercy (1986)
2225      Someone to Watch Over Me (1987)
Name: title, dtype: object
1757                              Bug's Life, A (1998)
2355                                Toy Story 2 (1999)
8695                  Guardians of the Galaxy 2 (2017)
1706                                       Antz (1998)
2809    Adventures of Rocky and Bullwinkle, The (2000)
3000                  Emperor's New Groove, The (2000)
3568                             Monsters, Inc. (2001)
6194                                  Wild, The (2006)
6486                            Shrek the Third (2007)
6948                    Tale of Despereaux, The (2008)
Nam