In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
md = pd.read_csv('metadata.csv')

In [3]:
md.head(15)

Unnamed: 0,id,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count
0,851644,ko,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,170.54,2022-10-06,20th Century Girl,8.7,290
1,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988
2,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748
3,240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,44.957,1974-12-20,The Godfather Part II,8.6,10293
4,667257,es,Cosas imposibles,"Matilde is a woman who, after the death of her...",32.859,2021-06-17,Impossible Things,8.6,299
5,19404,hi,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",23.31,1995-10-19,Dilwale Dulhania Le Jayenge,8.6,3961
6,424,en,Schindler's List,The true story of how businessman Oskar Schind...,52.121,1993-12-15,Schindler's List,8.6,13486
7,620249,zh,罗小黑战记,"In the bustling human world, spirits live peac...",18.207,2019-08-27,The Legend of Hei,8.6,215
8,372754,ja,同級生,"Rihito Sajo, an honor student with a perfect s...",11.288,2016-02-20,Dou kyu sei – Classmates,8.5,263
9,129,ja,千と千尋の神隠し,"A young girl, Chihiro, becomes trapped in a st...",64.948,2001-07-20,Spirited Away,8.5,13595


In [4]:
md.iloc[0:3].transpose()

Unnamed: 0,0,1,2
id,851644,238,278
original_language,ko,en,en
original_title,20세기 소녀,The Godfather,The Shawshank Redemption
overview,Yeon-du asks her best friend Bora to collect a...,"Spanning the years 1945 to 1955, a chronicle o...",Framed in the 1940s for the double murder of h...
popularity,170.54,86.518,84.681
release_date,2022-10-06,1972-03-14,1994-09-23
title,20th Century Girl,The Godfather,The Shawshank Redemption
vote_average,8.7,8.7,8.7
vote_count,290,16988,22748


In [5]:
md.shape

(10000, 9)

In [6]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   original_language  10000 non-null  object 
 2   original_title     10000 non-null  object 
 3   overview           9994 non-null   object 
 4   popularity         10000 non-null  float64
 5   release_date       10000 non-null  object 
 6   title              10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.2+ KB


In [7]:
null_rows_specific_column = md[md['overview'].isnull()]

In [8]:
print(null_rows_specific_column)

          id original_language            original_title overview  popularity  \
3957    9762                en                   Step Up      NaN      21.672   
7393   31359                fr  La Vérité si je mens ! 2      NaN       6.522   
8341    2029                fr                    Tanguy      NaN       7.111   
8833   57114                en   Amore, bugie e calcetto      NaN       6.172   
9609   17413                fr                 Incognito      NaN       7.490   
9993  154512                it          Colpi di fulmine      NaN       5.767   

     release_date                    title  vote_average  vote_count  
3957   2006-08-11                  Step Up           6.9        3342  
7393   2001-02-07    Would I Lie to You? 2           6.2         336  
8341   2001-11-21                   Tanguy           5.9         398  
8833   2008-04-04  Amore, bugie e calcetto           5.8         203  
9609   2009-04-28                Incognito           5.5         216  
9993  

In [9]:
#metadata_dr
md_dr = md.dropna(subset=['overview'])

In [10]:
md_dr.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9994 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 9994 non-null   int64  
 1   original_language  9994 non-null   object 
 2   original_title     9994 non-null   object 
 3   overview           9994 non-null   object 
 4   popularity         9994 non-null   float64
 5   release_date       9994 non-null   object 
 6   title              9994 non-null   object 
 7   vote_average       9994 non-null   float64
 8   vote_count         9994 non-null   int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 780.8+ KB


In [11]:
md_dr.to_csv('metadata.csv', index=False)


$\large Weighted\; Rating (WR) = (\frac{v}{v + m} . R) + (\frac{m}{v + m} . C)$
```

    v filme verilen oy sayısı
    m listede yer alması için alması gereken minimum oy
    R filmin ortalama puanıdır
    C listedeki tüm oyların ortalaması
    
    
```


In [12]:

#v
vote_counts = md_dr['vote_count'].astype('int')
#r
vote_averages = md_dr['vote_average'].astype('int')
#c
vote_averages_mean = md_dr['vote_average'].mean()

In [13]:
m = vote_counts.quantile(0.95)
print(m)

6497.050000000001


In [14]:
qualified1 = md_dr[(md_dr['vote_count'] >= m)][['id','original_language','original_title','overview','popularity','release_date','title','vote_average','vote_count']]

In [15]:
qualified1 = qualified1.reset_index(drop=True)

In [16]:
qualified1.head(250)

Unnamed: 0,id,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count
0,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988
1,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748
2,240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,44.957,1974-12-20,The Godfather Part II,8.6,10293
3,424,en,Schindler's List,The true story of how businessman Oskar Schind...,52.121,1993-12-15,Schindler's List,8.6,13486
4,129,ja,千と千尋の神隠し,"A young girl, Chihiro, becomes trapped in a st...",64.948,2001-07-20,Spirited Away,8.5,13595
...,...,...,...,...,...,...,...,...,...
245,17654,en,District 9,"Thirty years ago, aliens arrive on Earth. Not ...",27.638,2009-08-05,District 9,7.4,8214
246,2503,en,The Bourne Ultimatum,Bourne is brought out of hiding once again by ...,28.662,2007-08-03,The Bourne Ultimatum,7.4,6705
247,353081,en,Mission: Impossible - Fallout,"When an IMF mission ends badly, the world is f...",41.682,2018-07-13,Mission: Impossible - Fallout,7.4,6871
248,447332,en,A Quiet Place,A family is forced to live in silence while hi...,44.654,2018-04-03,A Quiet Place,7.4,12187


In [17]:
qualified1.shape

(500, 9)

simple rec systems (popularity based)

In [18]:
def weighted_rating(x):
    vote_counts=x['vote_count']
    vote_averages=x['vote_average']
    return (vote_counts/(vote_counts+m) * vote_averages) + (m/(m+vote_counts) * vote_averages_mean)

In [19]:
qualified1['wr'] = qualified1.apply(weighted_rating, axis=1)


### Results For Weighted Ratings

In [20]:
qualified1.sort_values('wr', ascending=False).head()

Unnamed: 0,id,original_language,original_title,overview,popularity,release_date,title,vote_average,vote_count,wr
1,278,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,84.681,1994-09-23,The Shawshank Redemption,8.7,22748,8.251696
9,155,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,58.811,2008-07-14,The Dark Knight,8.5,28656,8.164005
0,238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",86.518,1972-03-14,The Godfather,8.7,16988,8.141744
28,27205,en,Inception,"Cobb, a skilled thief who commits corporate es...",58.274,2010-07-15,Inception,8.4,32679,8.115093
10,680,en,Pulp Fiction,"A burger-loving hit man, his philosophical par...",60.289,1994-09-10,Pulp Fiction,8.5,24122,8.114252


### TF-IDF

In [21]:
md_dr.head().transpose()

Unnamed: 0,0,1,2,3,4
id,851644,238,278,240,667257
original_language,ko,en,en,en,es
original_title,20세기 소녀,The Godfather,The Shawshank Redemption,The Godfather Part II,Cosas imposibles
overview,Yeon-du asks her best friend Bora to collect a...,"Spanning the years 1945 to 1955, a chronicle o...",Framed in the 1940s for the double murder of h...,In the continuing saga of the Corleone crime f...,"Matilde is a woman who, after the death of her..."
popularity,170.54,86.518,84.681,44.957,32.859
release_date,2022-10-06,1972-03-14,1994-09-23,1974-12-20,2021-06-17
title,20th Century Girl,The Godfather,The Shawshank Redemption,The Godfather Part II,Impossible Things
vote_average,8.7,8.7,8.7,8.6,8.6
vote_count,290,16988,22748,10293,299


In [23]:
md_dr.loc[:, 'fortfidf'] = md_dr['title'] + ' ' + md_dr['overview']

In [24]:
md_dr['fortfidf']

0       20th Century Girl Yeon-du asks her best friend...
1       The Godfather Spanning the years 1945 to 1955,...
2       The Shawshank Redemption Framed in the 1940s f...
3       The Godfather Part II In the continuing saga o...
4       Impossible Things Matilde is a woman who, afte...
                              ...                        
9995    Hollywood Homicide Joe Gavilan and his new par...
9996    Do Not Disturb Michel, who's crazy about jazz,...
9997    Apollo 18 Officially, Apollo 17 was the last m...
9998    Apartment 143 A team of parapsychologists sets...
9999    Nekromantik A street sweeper who cleans up aft...
Name: fortfidf, Length: 9994, dtype: object

In [25]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].fillna('')

In [26]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in ENGLISH_STOP_WORDS]))

In [27]:
md_dr['fortfidf']

0       20th Century Girl Yeon-du asks best friend Bor...
1       Godfather Spanning years 1945 1955, chronicle ...
2       Shawshank Redemption Framed 1940s double murde...
3       Godfather II continuing saga Corleone crime fa...
4       Impossible Things Matilde woman who, death hus...
                              ...                        
9995    Hollywood Homicide Joe Gavilan new partner K. ...
9996    Disturb Michel, who's crazy jazz, just rare al...
9997    Apollo 18 Officially, Apollo 17 manned mission...
9998    Apartment 143 team parapsychologists sets inve...
9999    Nekromantik street sweeper cleans grisly accid...
Name: fortfidf, Length: 9994, dtype: object

In [28]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].str.lower()

In [29]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].apply(lambda x: word_tokenize(x))

In [30]:
md_dr['fortfidf']

0       [20th, century, girl, yeon-du, asks, best, fri...
1       [godfather, spanning, years, 1945, 1955, ,, ch...
2       [shawshank, redemption, framed, 1940s, double,...
3       [godfather, ii, continuing, saga, corleone, cr...
4       [impossible, things, matilde, woman, who, ,, d...
                              ...                        
9995    [hollywood, homicide, joe, gavilan, new, partn...
9996    [disturb, michel, ,, who, 's, crazy, jazz, ,, ...
9997    [apollo, 18, officially, ,, apollo, 17, manned...
9998    [apartment, 143, team, parapsychologists, sets...
9999    [nekromantik, street, sweeper, cleans, grisly,...
Name: fortfidf, Length: 9994, dtype: object

In [31]:
stemmer = SnowballStemmer('english')

def stemming(x):
  if isinstance(x, list):
    return [stemmer.stem(word) for word in x]
  elif isinstance(x,str):
    return [stemmer.stem(word) for word in x.split()]
  else:
    return x

In [32]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].apply(stemming)

In [33]:
md_dr['fortfidf']

0       [20th, centuri, girl, yeon-du, ask, best, frie...
1       [godfath, span, year, 1945, 1955, ,, chronicl,...
2       [shawshank, redempt, frame, 1940s, doubl, murd...
3       [godfath, ii, continu, saga, corleon, crime, f...
4       [imposs, thing, matild, woman, who, ,, death, ...
                              ...                        
9995    [hollywood, homicid, joe, gavilan, new, partne...
9996    [disturb, michel, ,, who, 's, crazi, jazz, ,, ...
9997    [apollo, 18, offici, ,, apollo, 17, man, missi...
9998    [apart, 143, team, parapsychologist, set, inve...
9999    [nekromantik, street, sweeper, clean, grisli, ...
Name: fortfidf, Length: 9994, dtype: object

In [34]:
md_dr.loc[:, 'fortfidf'] = md_dr['fortfidf'].apply(lambda x: str(x) if not isinstance(x, str) else x)

In [35]:
md_dr['fortfidf']

0       ['20th', 'centuri', 'girl', 'yeon-du', 'ask', ...
1       ['godfath', 'span', 'year', '1945', '1955', ',...
2       ['shawshank', 'redempt', 'frame', '1940s', 'do...
3       ['godfath', 'ii', 'continu', 'saga', 'corleon'...
4       ['imposs', 'thing', 'matild', 'woman', 'who', ...
                              ...                        
9995    ['hollywood', 'homicid', 'joe', 'gavilan', 'ne...
9996    ['disturb', 'michel', ',', 'who', "'s", 'crazi...
9997    ['apollo', '18', 'offici', ',', 'apollo', '17'...
9998    ['apart', '143', 'team', 'parapsychologist', '...
9999    ['nekromantik', 'street', 'sweeper', 'clean', ...
Name: fortfidf, Length: 9994, dtype: object

In [36]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words=('english'))
tfidf_matrix = tf.fit_transform(md_dr['fortfidf'])

In [37]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [38]:
cosine_sim

array([[1.        , 0.00182178, 0.0015701 , ..., 0.00166062, 0.        ,
        0.        ],
       [0.00182178, 1.        , 0.00177424, ..., 0.00613234, 0.00995291,
        0.        ],
       [0.0015701 , 0.00177424, 1.        , ..., 0.        , 0.        ,
        0.00978911],
       ...,
       [0.00166062, 0.00613234, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.00995291, 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.00978911, ..., 0.        , 0.        ,
        1.        ]])

In [42]:
indices = pd.Series(md_dr.index, index=md_dr['title']).drop_duplicates()

In [43]:
indices.head(5)

title
20th Century Girl           0
The Godfather               1
The Shawshank Redemption    2
The Godfather Part II       3
Impossible Things           4
dtype: int64

In [44]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return md_dr['title'].iloc[movie_indices]

In [45]:
get_recommendations('Se7en')

645     The Seven Deadly Sins: Cursed by Light
9240                                   Hangman
3725                           Serial Killer 1
8348                              Split Second
5846                                   Copycat
1730                      Memoir of a Murderer
5617                                Kalifornia
5035                    The Poughkeepsie Tapes
7562                         Murder by Numbers
7629                                 Tightrope
Name: title, dtype: object

In [46]:
get_recommendations('The Sixth Sense')

4790                            Border
9898                           Shut In
3268                          Eternals
3792                      We Die Young
9877                         Incarnate
409     The Boy Who Harnessed the Wind
2547                        Little Boy
7527            Friday the 13th Part 2
891                 Germany, Year Zero
1666                          El Angel
Name: title, dtype: object

In [47]:
get_recommendations('The Godfather')

3                  The Godfather Part II
9759                          Proud Mary
1660              The Godfather Part III
7699                          Blood Ties
9887                               Gotti
791     The Gangster, the Cop, the Devil
3469                             8 Women
6433                      Prizzi's Honor
6255                       Run All Night
1976                       Loose Cannons
Name: title, dtype: object