In [1]:
import pandas as pd

# Load the movie metadata; the relative path is resolved against the working directory.
DATA_PATH = 'movie_data.csv'
df = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Original Language,Original Title,Overview,Popularity,Poster Path,Release Date,Title,Video,Vote Average,Vote Count
0,en,Fast X,Over many missions and against impossible odds...,4654.279,/fiVW06jE7z9YnO4trhaMEdclSiC.jpg,2023-05-17,Fast X,False,7.3,2093
1,en,Extraction 2,Tasked with extracting a family who is at the ...,2494.01,/7gKI9hpEMcZUQpNgKrkDzJpbnNS.jpg,2023-06-09,Extraction 2,False,7.7,910
2,en,John Wick: Chapter 4,"With the price on his head ever increasing, Jo...",1920.127,/vZloFAK7NmvMGKE7VkF5UHaz0I.jpg,2023-03-22,John Wick: Chapter 4,False,7.9,3344
3,en,Spider-Man: Across the Spider-Verse,"After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,/8Vt6mWEReuy4Of61Lnj5Xj704m8.jpg,2023-05-31,Spider-Man: Across the Spider-Verse,False,8.6,1796
4,en,The Super Mario Bros. Movie,"While working underground to fix a water main,...",1539.037,/qNBAXBIQlnOThrVvA6mA2B5ggV6.jpg,2023-04-05,The Super Mario Bros. Movie,False,7.8,5165


In [3]:
# List the column labels so we can decide which ones to drop.
df.columns

Index(['Original Language', 'Original Title', 'Overview', 'Popularity',
       'Poster Path', 'Release Date', 'Title', 'Video', 'Vote Average',
       'Vote Count'],
      dtype='object')

In [4]:
# The poster image path is not used anywhere in this analysis.
df = df.drop(columns=['Poster Path'])

In [5]:
# Keep only the title, overview and popularity/vote columns.
df = df.drop(columns=['Original Language', 'Video', 'Title', 'Release Date'])

In [6]:
# Confirm the remaining columns after the drops.
df.head(n=5)


Unnamed: 0,Original Title,Overview,Popularity,Vote Average,Vote Count
0,Fast X,Over many missions and against impossible odds...,4654.279,7.3,2093
1,Extraction 2,Tasked with extracting a family who is at the ...,2494.01,7.7,910
2,John Wick: Chapter 4,"With the price on his head ever increasing, Jo...",1920.127,7.9,3344
3,Spider-Man: Across the Spider-Verse,"After reuniting with Gwen Stacy, Brooklyn’s fu...",2013.795,8.6,1796
4,The Super Mario Bros. Movie,"While working underground to fix a water main,...",1539.037,7.8,5165


In [7]:
# Only the last expression of a cell is displayed automatically, so the
# per-column null counts must be printed explicitly or they are lost.
print(df.isnull().sum())
# Last expression: (rows, columns) of the frame before null removal.
df.shape

(10000, 5)

In [8]:
# Remove every row that contains at least one null value.
# Surviving rows keep their original index labels, so the index is no
# longer a contiguous 0..n-1 range after this step.
df = df.dropna()

In [9]:
# Summarise dtypes and non-null counts to confirm the null rows are gone.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9937 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Original Title  9937 non-null   object 
 1   Overview        9937 non-null   object 
 2   Popularity      9937 non-null   float64
 3   Vote Average    9937 non-null   float64
 4   Vote Count      9937 non-null   int64  
dtypes: float64(2), int64(1), object(2)
memory usage: 465.8+ KB


In [10]:
# Pearson correlation between the number of votes and the average vote.
correlation = df['Vote Count'].corr(df['Vote Average'], method='pearson')
print(correlation)


0.25463492024811


In [11]:
# C: mean of 'Vote Average' over all rows; used as the C term of the
# weighted rating formula further down.
C = df['Vote Average'].mean()
C

6.377498238905103

In [12]:
# m: the 90th percentile of 'Vote Count'; used as the vote threshold in
# the weighted rating formula.
m = df['Vote Count'].quantile(q=0.9)
m

4089.3999999999996

In [13]:
# Independent copy of the movies whose vote count reaches the threshold m.
has_enough_votes = df['Vote Count'] >= m
q_movies = df.loc[has_enough_votes].copy()
q_movies.shape

(994, 5)

In [14]:
def weighted_rating(x, m=m, C=C):
    """Compute the IMDB-style weighted rating for a movie record.

    The formula is ``v / (v + m) * R + m / (v + m) * C`` where ``v`` is the
    vote count and ``R`` the vote average read from ``x``.

    Parameters
    ----------
    x : mapping-like
        Object indexable by ``'Vote Count'`` and ``'Vote Average'``.
    m : number
        Vote-count threshold; the default is the notebook-level ``m`` as it
        was when this function was defined.
    C : number
        Mean vote; the default is the notebook-level ``C`` as it was when
        this function was defined.

    Returns
    -------
    The weighted rating computed from the values above.
    """
    votes = x['Vote Count']
    rating = x['Vote Average']
    total = votes + m
    # Calculation based on the IMDB formula
    return votes / total * rating + m / total * C



In [16]:
# Add the rating column using the IMDB formula.
# weighted_rating only indexes by column name and uses plain arithmetic, so
# it can be applied to the whole frame at once instead of row by row with
# df.apply(axis=1); the resulting column is the same.
df['Weighted Rating'] = weighted_rating(df)

In [17]:
# Show each title next to its weighted rating.
print(df.loc[:, ['Original Title', 'Weighted Rating']])

                           Original Title  Weighted Rating
0                                  Fast X         6.689804
1                            Extraction 2         6.618222
2                    John Wick: Chapter 4         7.062413
3     Spider-Man: Across the Spider-Verse         7.055721
4             The Super Mario Bros. Movie         7.171415
...                                   ...              ...
9995                           Jessabelle         6.231852
9996                                 Elio         6.377498
9997                 Legado en los huesos         6.395868
9998                Behind the Candelabra         6.424691
9999                            Like Mike         6.350375

[9937 rows x 2 columns]


In [18]:
# Now that the weighted rating has been added, we can build the content-based recommender
from sklearn.feature_extraction.text import TfidfVectorizer

# Create an instance of TfidfVectorizer. The stop_words='english' argument
# specifies that common English words like "the," "and," and "is" should be
# removed from the text data because they typically don't carry much meaning.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and encode each one as a TF-IDF row.
overviews = df['Overview']
tfidf_matrix = tfidf_vectorizer.fit_transform(overviews)


In [19]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between every pair of overview vectors; row/column i
# corresponds to the i-th row (by position) of tfidf_matrix.
cosine_sim = linear_kernel(X=tfidf_matrix, Y=tfidf_matrix)

In [20]:
def get_recommendations(movie_title, cosine_sim=cosine_sim, num_recommendations=10):
    """Return the movies whose overviews are most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Value to look up in ``df['Original Title']``; the first match is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix addressed by row *position*. It is expected
        to be aligned with the row order of ``df``.
    num_recommendations : int
        Maximum number of similar movies to return.

    Returns
    -------
    pandas.DataFrame
        The ``'Original Title'`` and ``'Weighted Rating'`` columns of the most
        similar movies, ordered from most to least similar.

    Raises
    ------
    IndexError
        If ``movie_title`` is not present in ``df['Original Title']``.
    """
    # cosine_sim is indexed by row position, but df's index labels are not
    # guaranteed to equal positions (rows removed by dropna leave gaps).
    # Resolve the title to a position instead of an index label.
    matches = (df['Original Title'] == movie_title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie title not found: {movie_title!r}")
    idx = int(matches[0])

    # Exclude the queried movie by position rather than assuming it sorts
    # first: a tie or a zero self-similarity would otherwise drop the wrong row.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Top N similar movies, as row positions for .iloc
    movie_indices = [pos for pos, _ in sim_scores[:num_recommendations]]
    return df[['Original Title', 'Weighted Rating']].iloc[movie_indices]

In [23]:
# Let's try it out on a sample title

input_movie = "Harry Potter and the Philosopher's Stone"
recommendations = get_recommendations(movie_title=input_movie)
recommendations

Unnamed: 0,Original Title,Weighted Rating
916,The Big 4,6.401078
3292,Sexy Movie,6.374973
3177,The Three Musketeers,6.161857
3709,IO,6.383482
1470,Die unendliche Geschichte,6.763823
2898,Indecent Proposal,6.359612
1560,ปฏิบัติการกู้หวย,6.378257
800,Fantastic Beasts and Where to Find Them,7.126771
8786,Friendzone,6.356325
1941,The Lucky One,6.70359
