In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import nltk

### Data Acquisition

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
# Load the movie catalogue; the path is relative to the notebook's working directory.
csv_path = "dataset/movie.csv"
dataframe = pd.read_csv(csv_path)

In [6]:
# Preview the first five rows to check which columns were loaded.
dataframe.head()

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...


In [7]:
# Show the whole frame; the notebook abbreviates it to the leading and trailing rows.
dataframe

Unnamed: 0,title,genres,cast,keywords,director,metadata
0,Toy Story,animation comedy family,tom_hanks tim_allen don_rickles,jealousy toy boy,john_lasseter,animation comedy family tom_hanks tim_allen do...
1,Jumanji,adventure fantasy family,robin_williams jonathan_hyde kirsten_dunst,board_game disappearance based_on_children's_book,joe_johnston,adventure fantasy family robin_williams jonath...
2,Grumpier Old Men,romance comedy,walter_matthau jack_lemmon ann-margret,fishing best_friend duringcreditsstinger,howard_deutch,romance comedy walter_matthau jack_lemmon ann-...
3,Waiting to Exhale,comedy drama romance,whitney_houston angela_bassett loretta_devine,based_on_novel interracial_relationship single...,forest_whitaker,comedy drama romance whitney_houston angela_ba...
4,Father of the Bride Part II,comedy,steve_martin diane_keaton martin_short,baby midlife_crisis confidence,charles_shyer,comedy steve_martin diane_keaton martin_short ...
...,...,...,...,...,...,...
42272,Caged Heat 3000,science_fiction,jimmy_bennett peter_cullen jim_cummings,,saul_blinkoff,science_fiction jimmy_bennett peter_cullen jim...
42273,Subdue,drama family,jeanne_d'alcy georges_méliès,,georges_méliès,drama family jeanne_d'alcy georges_méliès geo...
42274,Century of Birthing,drama,,,robert_gardner,drama robert_gardner
42275,Satan Triumphant,,markie_adams roberto_aguire tina_arning,woman_director,shanra_j._kehl,markie_adams roberto_aguire tina_arning woman...


In [8]:
# Prefix every metadata string with the movie title so that the title words are
# part of the text that gets vectorized later. str.cat already returns a Series,
# so it can be assigned to the column directly (no DataFrame round-trip needed).
dataframe["metadata"] = dataframe["title"].str.cat(dataframe["metadata"], sep=", ")

# Keep only the two columns used downstream and the first 500 rows.
# .copy() makes the result an independent frame, so the column assignments in
# later cells never write into a slice of the full dataset.
# NOTE(review): the 500-row cap is presumably there to keep the similarity
# matrix small — confirm before running on the full catalogue.
dataframe = dataframe.loc[:, ["title", "metadata"]].iloc[:500].copy()

In [9]:
# Check row count, dtypes and non-null counts after narrowing the frame to title + metadata.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     500 non-null    object
 1   metadata  500 non-null    object
dtypes: object(2)
memory usage: 7.9+ KB


### Data Preprocessing

In [10]:
import re

def text_preprocessing(text):
    """Normalize a raw text value into lower-case, punctuation-free words.

    The cleaning steps run in this order: lower-case the text, remove URLs,
    remove (optionally signed) runs of digits, remove every character that is
    neither a word character nor whitespace, then collapse whitespace runs into
    single spaces and trim both ends. Underscores count as word characters, so
    tokens such as ``tom_hanks`` stay intact.

    Args:
        text: The value to clean. Non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or an empty string for a missing value (instead
        of the literal words "none" / "nan" that ``str`` would produce).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[-+]?[0-9]+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    # Removing digits/punctuation can leave doubled spaces in the middle of the
    # string ("toy story 2 returns" -> "toy story  returns"); collapse them so
    # that cleaned titles can be matched reliably.
    return " ".join(text.split())

In [11]:
# Run the same cleaning routine over both text columns (metadata first, then title).
for column in ("metadata", "title"):
    dataframe[column] = dataframe[column].apply(text_preprocessing)

In [26]:
# Inspect the cleaned text: titles and metadata are now lower-case with punctuation removed.
dataframe

Unnamed: 0,title,metadata
0,toy story,toy story animation comedy family tom_hanks ti...
1,jumanji,jumanji adventure fantasy family robin_william...
2,grumpier old men,grumpier old men romance comedy walter_matthau...
3,waiting to exhale,waiting to exhale comedy drama romance whitney...
4,father of the bride part ii,father of the bride part ii comedy steve_marti...
...,...,...
495,mrs doubtfire,mrs doubtfire comedy drama family robin_willia...
496,naked,naked comedy drama david_thewlis lesley_sharp ...
497,the next karate kid,the next karate kid adventure pat_morita hilar...
498,the new age,the new age comedy drama peter_weller judy_dav...


In [13]:
# Drop rows that still contain missing values, then renumber the rows 0..n-1.
# The renumbering matters: recommended() looks a movie up by its index label and
# uses that label as a row position in the similarity matrix, so the labels must
# stay contiguous after rows are removed.
# NOTE(review): text_preprocessing always returns a string, so values that were
# missing before the cleaning step are no longer NaN here — consider dropping
# missing rows before cleaning instead.
dataframe = dataframe.dropna().reset_index(drop=True)

In [14]:
# Count the missing values left in each column (all zeros expected after dropna).
dataframe.isna().sum()

title       0
metadata    0
dtype: int64

In [15]:
# Spot-check the cleaned title stored under index label 0.
dataframe.loc[0, "title"]

'toy story'

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

### Modelling: Content-Based Filtering

In [42]:
# Bag-of-words vectorizer: ignores English stop words and keeps only the 10 most
# frequent terms of the corpus as features.
# NOTE(review): with max_features=10 the learned vocabulary (listed further down)
# consists of genre words only, so cast, keywords, director and title words never
# influence the similarity scores — confirm this is intended.
cv = CountVectorizer(max_features=10, stop_words='english')

In [43]:
# Learn the vocabulary from the cleaned metadata and count term occurrences per
# movie, then turn the sparse result into a dense array (one row per movie).
term_counts = cv.fit_transform(dataframe["metadata"])
vektors = term_counts.toarray()

In [44]:
# Peek at the dense document-term matrix (NumPy abbreviates the display).
vektors

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 1, 0]], dtype=int64)

In [45]:
# Term counts for the first movie, one entry per vocabulary term.
vektors[0]

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0], dtype=int64)

In [46]:
# Vocabulary learned by the vectorizer, in the column order of `vektors`.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. .tolist() keeps the plain-list display.
cv.get_feature_names_out().tolist()



['action',
 'adventure',
 'comedy',
 'crime',
 'drama',
 'family',
 'fantasy',
 'romance',
 'science_fiction',
 'thriller']

In [47]:
from sklearn.metrics.pairwise import cosine_similarity

In [48]:
# Pairwise cosine similarity between all movie count vectors:
# entry [i, j] compares the movie in row i with the movie in row j.
similiaryty = cosine_similarity(vektors)

In [61]:
def recommended(movie, top_n=5):
    """Print the titles of the movies most similar to ``movie``.

    Reads the module-level ``dataframe`` (for titles) and ``similiaryty`` (the
    pairwise similarity matrix whose rows follow the row order of ``dataframe``).

    Args:
        movie: Title to look up. It must equal a value in ``dataframe["title"]``
            exactly; when several rows share the title, the first one is used.
        top_n: Maximum number of recommendations to print (default 5).

    Raises:
        IndexError: If ``movie`` is not present in ``dataframe["title"]``.
    """
    # Work with row *positions*: the similarity matrix is positional, whereas
    # index labels may have gaps (e.g. after rows were dropped).
    matches = (dataframe["title"] == movie).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"movie {movie!r} not found in dataframe['title']")
    movie_pos = int(matches[0])

    scores = similiaryty[movie_pos]

    # Exclude the queried movie explicitly instead of assuming it sorts first:
    # other movies can tie with it at the top score, and then skipping "the
    # first result" could drop a real neighbour and recommend the movie itself.
    # sorted() is stable, so tied candidates keep their original row order.
    candidates = [pos for pos in range(len(scores)) if pos != movie_pos]
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    for pos in ranked[:top_n]:
        print(dataframe["title"].iloc[pos])

In [62]:
# Example query; the title has to be given in its cleaned (lower-case) form.
recommended("toy story")

big bully
houseguest
rentakid
the air up there
life with mikey
