In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import pairwise_distances

In [2]:
# Location of the movie table read into `df`.
MOVIE_CSV_PATH = 'movie_content.csv'
df = pd.read_csv(MOVIE_CSV_PATH)

In [3]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Id,Plot,Title,Rating
0,0,2,Taisto Kasurinen is a Finnish coal miner whose...,Ariel (1988),3.6
1,1,3,"An episode in the life of Nikander, a garbage ...",Shadows in Paradise (1986),3.65
2,2,5,It's Ted the Bellhop's first night on the job....,Four Rooms (1995),3.3
3,3,6,"While racing to a boxing match, Frank, Mike, J...",Judgment Night (1993),3.25
4,4,8,Timo Novotny labels his new project an experim...,Life in Loops (A Megacities RMX) (2006),3.2


In [4]:
# Show the column labels of the loaded table.
df.columns

Index(['Unnamed: 0', 'Id', 'Plot', 'Title', 'Rating'], dtype='object')

In [5]:
# Remove the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps the cell re-runnable: once the column is gone a second
# execution is a no-op instead of raising KeyError.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [6]:
# Preview the first five rows after dropping the 'Unnamed: 0' column.
df.head(n=5)

Unnamed: 0,Id,Plot,Title,Rating
0,2,Taisto Kasurinen is a Finnish coal miner whose...,Ariel (1988),3.6
1,3,"An episode in the life of Nikander, a garbage ...",Shadows in Paradise (1986),3.65
2,5,It's Ted the Bellhop's first night on the job....,Four Rooms (1995),3.3
3,6,"While racing to a boxing match, Frank, Mike, J...",Judgment Night (1993),3.25
4,8,Timo Novotny labels his new project an experim...,Life in Loops (A Megacities RMX) (2006),3.2


In [7]:
# Count missing values per column.
df.isna().sum()

Id          0
Plot      194
Title       0
Rating      0
dtype: int64

In [8]:
# Discard every row that has at least one missing value (dropna's defaults:
# operate on rows, drop when any field is missing). The surviving rows keep
# their original index labels.
df.dropna(inplace=True)

In [9]:
# Re-check the per-column missing-value counts after dropping incomplete rows.
df.isna().sum()

Id        0
Plot      0
Title     0
Rating    0
dtype: int64

In [10]:
# Plot text of the row whose index label is 0 (label-based, not positional).
Ariel = df.loc[0, 'Plot']

In [11]:
# Plot text of the row whose index label is 2 (label-based, not positional).
Four_Rooms = df.loc[2, 'Plot']

In [12]:
# The plot summaries are the documents fed to the vectorizer.
corpus = df.loc[:, 'Plot']

In [13]:
# TF-IDF weighting over the plot summaries, ignoring common English stop words.
# TfidfVectorizer is imported with the other dependencies in the first cell.
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [14]:
# Dense TF-IDF table: one row per movie title, one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2, so use its replacement
# whenever the installed version provides it and fall back otherwise.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
tf_df = pd.DataFrame(tvec.transform(corpus).toarray(),
                     columns=feature_names,
                     index=df['Title'])

In [15]:
# Pairwise cosine similarity between the rows of tf_df (one row per title).
similarity_matrix = cosine_similarity(tf_df)

In [16]:
# Sanity check: the similarity matrix should be square (titles x titles).
similarity_matrix.shape

(8795, 8795)

In [17]:
# (number of titles, number of vocabulary terms)
tf_df.shape

(8795, 29554)

In [18]:
# Peek at the TF-IDF weights of the first three titles.
tf_df.head(n=3)

Unnamed: 0_level_0,00,000,007,04,05pm,07am,0ne,10,100,1000,...,übrig,łucja,łódź,świat,şinasi,że,życie,تم,ہم,हम
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ariel (1988),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Shadows in Paradise (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Four Rooms (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Label the similarity matrix with movie titles on both axes.
# tf_df.index already holds the titles; reading it directly avoids transposing
# the whole dense TF-IDF table just to obtain its labels.
sim = pd.DataFrame(similarity_matrix, columns=tf_df.index, index=tf_df.index)

In [20]:
# Preview the first five rows of the title-by-title similarity table.
sim.head(n=5)

Title,Ariel (1988),Shadows in Paradise (1986),Four Rooms (1995),Judgment Night (1993),Life in Loops (A Megacities RMX) (2006),Star Wars (1977),Finding Nemo (2003),Forrest Gump (1994),American Beauty (1999),Citizen Kane (1941),...,Welcome to Dongmakgol (2005),The Colt (2005),Noah's Arc: Jumping the Broom (2008),Rachel Getting Married (2008),Red Sands (2009),Moog (2004),Thick as Thieves (2009),Welcome To Macintosh (2008),House of Fools (2008),Carver (2008)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ariel (1988),1.0,0.015763,0.0,0.0,0.016255,0.0,0.010445,0.045945,0.039606,0.009167,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.021606,0.0,0.0
Shadows in Paradise (1986),0.015763,1.0,0.0,0.0,0.006325,0.0,0.0,0.042894,0.037155,0.015477,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Four Rooms (1995),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.025602,0.0,0.0,0.0
Judgment Night (1993),0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.019793,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Life in Loops (A Megacities RMX) (2006),0.016255,0.006325,0.0,0.0,1.0,0.0,0.007831,0.003052,0.008831,0.024134,...,0.004754,0.0,0.0,0.0,0.0,0.036449,0.0,0.02438,0.0,0.0


In [21]:
# Cosine distance between the rows of `sim`, i.e. between each movie's vector
# of similarities to every other movie. This is not the same as 1 - similarity.
# NOTE(review): confirm this second-order comparison is intended rather than a
# distance computed directly from the TF-IDF vectors.
distances = pairwise_distances(sim, metric='cosine')

In [22]:
# Label the distance matrix with movie titles on both axes.
# tf_df.index already holds the titles; reading it directly avoids transposing
# the whole dense TF-IDF table just to obtain its labels.
df_distances = pd.DataFrame(distances, columns=tf_df.index, index=tf_df.index)

In [23]:
# All movie titles, in the column order of the distance matrix.
titles = list(df_distances.columns)

In [24]:
# Distance from every title to 'Jurassic Park (1993)', as a plain list.
distance = list(df_distances['Jurassic Park (1993)'])

In [25]:
# Distance from every title to 'Gladiator (2000)', as a plain list.
distance2 = list(df_distances['Gladiator (2000)'])

In [26]:
# Working table: start with one column of candidate titles, named after the
# first reference movie.
df_work = pd.DataFrame({'Jurassic Park (1993)': titles})

In [27]:
# Preview the first five rows of the working table.
df_work.head(n=5)

Unnamed: 0,Jurassic Park (1993)
0,Ariel (1988)
1,Shadows in Paradise (1986)
2,Four Rooms (1995)
3,Judgment Night (1993)
4,Life in Loops (A Megacities RMX) (2006)


In [28]:
# Second copy of the candidate titles, in a column named after the second
# reference movie.
df_work['Gladiator (2000)'] = titles

In [29]:
# Preview the working table with both title columns in place.
df_work.head(n=5)

Unnamed: 0,Jurassic Park (1993),Gladiator (2000)
0,Ariel (1988),Ariel (1988)
1,Shadows in Paradise (1986),Shadows in Paradise (1986)
2,Four Rooms (1995),Four Rooms (1995)
3,Judgment Night (1993),Judgment Night (1993)
4,Life in Loops (A Megacities RMX) (2006),Life in Loops (A Megacities RMX) (2006)


In [30]:
# Attach each candidate's distance to the two reference movies.
# 'JP Distance' holds the list `distance`; 'Int. Distance' holds `distance2`.
df_work['JP Distance'], df_work['Int. Distance'] = distance, distance2
df_work.head(n=5)

Unnamed: 0,Jurassic Park (1993),Gladiator (2000),JP Distance,Int. Distance
0,Ariel (1988),Ariel (1988),0.873052,0.845264
1,Shadows in Paradise (1986),Shadows in Paradise (1986),0.912159,0.792626
2,Four Rooms (1995),Four Rooms (1995),0.937784,0.952365
3,Judgment Night (1993),Judgment Night (1993),0.924707,0.91905
4,Life in Loops (A Megacities RMX) (2006),Life in Loops (A Megacities RMX) (2006),0.890247,0.892975


In [31]:
# Mean of the two distance columns for every candidate title.
df_work['Average Distance'] = df_work['JP Distance'].add(df_work['Int. Distance']).div(2)

In [32]:
# Preview the first five rows of the title-by-title distance table.
df_distances.head(n=5)

Title,Ariel (1988),Shadows in Paradise (1986),Four Rooms (1995),Judgment Night (1993),Life in Loops (A Megacities RMX) (2006),Star Wars (1977),Finding Nemo (2003),Forrest Gump (1994),American Beauty (1999),Citizen Kane (1941),...,Welcome to Dongmakgol (2005),The Colt (2005),Noah's Arc: Jumping the Broom (2008),Rachel Getting Married (2008),Red Sands (2009),Moog (2004),Thick as Thieves (2009),Welcome To Macintosh (2008),House of Fools (2008),Carver (2008)
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ariel (1988),0.0,0.771847,0.891335,0.872431,0.771155,0.916904,0.827342,0.707977,0.672673,0.792405,...,0.847622,0.899828,0.882474,0.790084,0.904038,0.926903,0.89207,0.806805,0.972946,0.912891
Shadows in Paradise (1986),0.771847,0.0,0.930969,0.905655,0.830306,0.93614,0.888772,0.640358,0.691692,0.813177,...,0.867705,0.912135,0.911706,0.810269,0.91193,0.925046,0.925505,0.89674,0.987882,0.909699
Four Rooms (1995),0.891335,0.930969,0.0,0.940108,0.91208,0.954395,0.936072,0.897598,0.91086,0.921313,...,0.930926,0.962113,0.94075,0.895142,0.94005,0.960425,0.876609,0.930095,0.988695,0.943828
Judgment Night (1993),0.872431,0.905655,0.940108,0.0,0.891609,0.928039,0.919865,0.88696,0.859064,0.908352,...,0.895587,0.932819,0.933069,0.902483,0.934391,0.944237,0.934676,0.915044,0.9894,0.939435
Life in Loops (A Megacities RMX) (2006),0.771155,0.830306,0.91208,0.891609,0.0,0.920283,0.868961,0.795277,0.80256,0.808141,...,0.791499,0.881657,0.900467,0.846647,0.89146,0.774311,0.912689,0.753796,0.980683,0.882332


In [33]:
def movie_generator(movie1, movie2, distances=None, n=5):
    """Recommend titles that sit close to both ``movie1`` and ``movie2``.

    Every title in the distance matrix is scored from its distance to the two
    input movies:

    * ``average`` = mean of the two distances
    * ``weight``  = 1 - |distance to movie1 - distance to movie2|
    * ``score``   = weight * average

    Titles are ranked by ascending score and the first ``n`` that are not one
    of the two inputs are returned.

    Parameters
    ----------
    movie1, movie2 : str
        Column labels of the distance matrix.
    distances : pandas.DataFrame, optional
        Square title-by-title distance matrix whose columns are the titles.
        Defaults to the notebook-level ``df_distances``.
    n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``n`` rows and two columns, named ``movie1`` and ``movie2``,
        both holding the recommended title. The index is the title's position
        among the columns of the distance matrix.

    Raises
    ------
    KeyError
        If either title is not a column of the distance matrix.
    """
    if distances is None:
        distances = df_distances

    for movie in (movie1, movie2):
        if movie not in distances.columns:
            raise KeyError('{!r} is not a title in the distance matrix'.format(movie))

    titles = list(distances.columns)
    dist1 = np.asarray(distances[movie1], dtype=float)
    dist2 = np.asarray(distances[movie2], dtype=float)

    average_distance = (dist1 + dist2) / 2
    weight = 1 - np.abs(dist1 - dist2)
    # NOTE(review): rows are ranked by ascending score, so a smaller weight
    # (more lopsided distances) improves a title's rank. Confirm this is the
    # intended effect of the weighting.
    score = pd.Series(weight * average_distance)

    # Remove the two input movies by name. Relying on them occupying the first
    # two ranks is unsafe: a candidate close to both inputs can score lower
    # than the inputs themselves.
    is_input = pd.Series(titles).isin([movie1, movie2])
    # A stable sort keeps tied scores in matrix order, so results are repeatable.
    best = score[~is_input].sort_values(ascending=True, kind='mergesort').index[:n]

    rows = [[titles[position], titles[position]] for position in best]
    return pd.DataFrame(rows, index=best, columns=[movie1, movie2])

In [38]:
# List every title containing the search text. regex=False makes the match a
# literal substring test, so characters such as "(" in a title are not
# interpreted as regular-expression syntax.
search = 'Titanic'
matches = df.loc[df['Title'].str.contains(search, regex=False), 'Title']
for title in matches.values:
    print(title)

Titanic (1997)
Titanic 2000 (1999)
Titanic (1943)


In [39]:
# Recommendations that sit between the two reference movies.
movie_generator(movie1='Titanic (1997)', movie2='Braveheart (1995)')

Unnamed: 0,Titanic (1997),Braveheart (1995)
5364,The Legend of 1900 (1998),The Legend of 1900 (1998)
3164,That Obscure Object of Desire (1977),That Obscure Object of Desire (1977)
5954,The Notebook (2004),The Notebook (2004)
288,Harold and Maude (1971),Harold and Maude (1971)
6306,Gypo (2005),Gypo (2005)
