## Importing Libraries:

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

## Importing Dataset:

In [2]:
df = pd.read_csv('movie_metadata.csv')

In [3]:
df.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


## Exploratory Data Analysis:

In [4]:
df.dropna(inplace = True)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3756 entries, 0 to 5042
Data columns (total 28 columns):
color                        3756 non-null object
director_name                3756 non-null object
num_critic_for_reviews       3756 non-null float64
duration                     3756 non-null float64
director_facebook_likes      3756 non-null float64
actor_3_facebook_likes       3756 non-null float64
actor_2_name                 3756 non-null object
actor_1_facebook_likes       3756 non-null float64
gross                        3756 non-null float64
genres                       3756 non-null object
actor_1_name                 3756 non-null object
movie_title                  3756 non-null object
num_voted_users              3756 non-null int64
cast_total_facebook_likes    3756 non-null int64
actor_3_name                 3756 non-null object
facenumber_in_poster         3756 non-null float64
plot_keywords                3756 non-null object
movie_imdb_link              3756 non-

## Task # 01: Creating A Network

In [6]:
new_df = df[['actor_1_name', 'actor_2_name', 'actor_3_name', 'movie_title']]

In [7]:
new_df.head()

Unnamed: 0,actor_1_name,actor_2_name,actor_3_name,movie_title
0,CCH Pounder,Joel David Moore,Wes Studi,Avatar
1,Johnny Depp,Orlando Bloom,Jack Davenport,Pirates of the Caribbean: At World's End
2,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Spectre
3,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,The Dark Knight Rises
5,Daryl Sabara,Samantha Morton,Polly Walker,John Carter


In [8]:
new_df = new_df.set_index('movie_title')

In [9]:
new_df.head()

Unnamed: 0_level_0,actor_1_name,actor_2_name,actor_3_name
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,CCH Pounder,Joel David Moore,Wes Studi
Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport
Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman
The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
John Carter,Daryl Sabara,Samantha Morton,Polly Walker


In [10]:
comm_actors = []
for actor in list(new_df['actor_2_name']):
    if actor in list(new_df['actor_3_name']):
        comm_actors.append(actor)
    
print(comm_actors[:5])

print(len(comm_actors))

new_df.head(100)

['Joel David Moore', 'Orlando Bloom', 'Rory Kinnear', 'Samantha Morton', 'James Franco']
1991


Unnamed: 0_level_0,actor_1_name,actor_2_name,actor_3_name
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,CCH Pounder,Joel David Moore,Wes Studi
Pirates of the Caribbean: At World's End,Johnny Depp,Orlando Bloom,Jack Davenport
Spectre,Christoph Waltz,Rory Kinnear,Stephanie Sigman
The Dark Knight Rises,Tom Hardy,Christian Bale,Joseph Gordon-Levitt
John Carter,Daryl Sabara,Samantha Morton,Polly Walker
...,...,...,...
The Fast and the Furious,Paul Walker,Vin Diesel,Jordana Brewster
The Curious Case of Benjamin Button,Brad Pitt,Jason Flemyng,Julia Ormond
X-Men: First Class,Jennifer Lawrence,Michael Fassbender,Oliver Platt
The Hunger Games: Mockingjay - Part 2,Jennifer Lawrence,Philip Seymour Hoffman,Josh Hutcherson


In [11]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(new_df.values.flatten())

df_encoded = new_df.apply(le.fit_transform)

In [12]:
comm_actors = []
for actor in list(new_df['actor_1_name']):
    if actor in list(new_df['actor_3_name']):
        comm_actors.append(actor)
    
len(comm_actors)

1782

In [13]:
def dfToFloat():
    return df_encoded.apply(pd.to_numeric)

df_float = dfToFloat()
df_float.head()

Unnamed: 0_level_0,actor_1_name,actor_2_name,actor_3_name
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Avatar,189,1002,2529
Pirates of the Caribbean: At World's End,687,1592,1001
Spectre,258,1795,2296
The Dark Knight Rises,1340,381,1267
John Carter,323,1837,1967


In [14]:
movie_matrix = df_float.T
movie_matrix

movie_title,Avatar,Pirates of the Caribbean: At World's End,Spectre,The Dark Knight Rises,John Carter,Spider-Man 3,Tangled,Avengers: Age of Ultron,Harry Potter and the Half-Blood Prince,Batman v Superman: Dawn of Justice,...,Clerks,In the Company of Men,Sabotage,Slacker,Pink Flamingos,Clean,The Circle,Primer,El Mariachi,My Date with Drew
actor_1_name,189,687,258,1340,323,555,158,244,21,523,...,592,1262,982,1346,361,868,450,1233,198,665
actor_2_name,1002,1592,1795,381,1837,880,578,1758,469,1222,...,268,1400,1371,1730,1493,300,1522,518,1655,265
actor_3_name,2529,1001,2296,1267,1967,1414,1543,2211,2156,32,...,1120,1086,1650,1107,705,656,1771,368,490,1241


## Task # 02: Finding Sub-Networks

In [15]:
new_df = new_df.reset_index()

In [20]:
new_df.head()

mylist = list(new_df['movie_title'])
repeated = []

for x in mylist:
    if(mylist.count(x) > 1):
        repeated.append(x)
unmatched = 40
# print(len(repeated))
repeated_count = repeated[:40]

subnetwork = new_df.sample(40)

subnetwork1 = subnetwork[0:20]
subnetwork2 = subnetwork[21:40]

subnetwork1.drop(['actor_2_name', 'actor_3_name'], axis = 1, inplace = True)
subnetwork2.drop(['actor_2_name', 'actor_3_name'], axis = 1, inplace = True)

In [21]:
subnetwork1

Unnamed: 0,movie_title,actor_1_name
1172,The Lake House,Keanu Reeves
714,Jingle All the Way,Jim Belushi
1180,Medicine Man,Lorraine Bracco
1247,Disturbia,Sarah Roemer
979,Street Fighter: The Legend of Chun-Li,Chris Klein
2698,City of Life and Death,Ye Liu
205,Total Recall,Ronny Cox
3449,The Toxic Avenger Part II,Phoebe Legere
422,The Hunger Games,Jennifer Lawrence
1670,Brick Mansions,Paul Walker


In [22]:
subnetwork2

Unnamed: 0,movie_title,actor_1_name
114,Ratatouille,Janeane Garofalo
332,Rise of the Planet of the Apes,James Franco
1404,Knock Off,Paul Sorvino
617,Need for Speed,Rami Malek
2838,Romeo Is Bleeding,Gary Oldman
2938,Stand by Me,Marshall Bell
3712,Like Crazy,Jennifer Lawrence
1060,All the Pretty Horses,Matt Damon
1784,What a Girl Wants,Colin Firth
673,Blades of Glory,Will Ferrell


In [28]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(subnetwork1.values.flatten())

subnetwork1_encoded = subnetwork1.apply(le.fit_transform)

subnetwork1_encoded

Unnamed: 0,movie_title,actor_1_name
1172,11,10
714,6,9
1180,7,11
1247,2,18
979,8,2
2698,1,19
205,18,17
3449,15,16
422,10,8
1670,0,15


In [29]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(subnetwork1.values.flatten())

subnetwork2_encoded = subnetwork2.apply(le.fit_transform)

subnetwork2_encoded

Unnamed: 0,movie_title,actor_1_name
114,13,4
332,14,3
1404,10,12
617,12,13
2838,15,2
2938,16,9
3712,11,5
1060,1,10
1784,18,1
673,2,17


## Task # 03: Determining the Similarity:

There could be various __methods__ and __techniques__ to find the similarity between two __Subnetworks__. Some of them are listed below!

### Pearson Correlations: 

One way is to take pearons or pair-wise correlations betweeen the values of both the __Subnets__. Since the values need to be transformed or __label encoded__ which we have just done, and now it is jus the matter of built-in pandas method call to get the scores based on how much both the subnets are co-related!

### Euclidean Distance:

Another effective way is to take the __Euclidean__ distance between both the values of the subnets given by the formula: _Distance = [(y2 - y2) ** 2 + (x2 - x1) ** 2] ** 0.5_
This gives us the distance between both the networks which again shows how far they are, meaning, how similar or dissimilar they are!

### Cosine Similarity:

Another most powerful method to check the similarity betweek two networks or dataframs is __Cosine similarity method__. Unlike the above two methods, this doesn't give us a scalar value, instead it reports back the angle between two vectors. Lesser the angle more similarity between networks or vice-versa. This appraoch is mainly used when dealing with __vectors__ and not __scaler__ values.



## Task # 04: Interactive Similarity:

In [51]:
def dfToFloat():
    return subnetwork1_encoded.apply(pd.to_numeric)

sub1_float = dfToFloat()

In [52]:
def dfToFloat():
    return subnetwork2_encoded.apply(pd.to_numeric)

sub2_float = dfToFloat()

In [61]:
sub1_float['DataFrame Column'] = sub1_float['DataFrame Column'].astype(float)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20 entries, 1172 to 3004
Data columns (total 2 columns):
movie_title     20 non-null int64
actor_1_name    20 non-null int64
dtypes: int64(2)
memory usage: 1.1 KB
