In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [2]:
# Load the raw actor credits, the cleaned OMDb table and the movie table.
# (',' is already the default separator of read_csv.)
actors = pd.read_csv('../../data/raw/actors.csv')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')

# Lookup table from the movie table's `id` (exposed as `movieID`) to `imdbID`.
id_columns = ['id', 'imdbID']
mapping = movies[id_columns].rename(columns={'id': 'movieID'})

In [3]:
# Number of distinct actor ids in the credits table (missing ids are ignored).
actors['actorID'].nunique()

95321

In [4]:
# Count how many rows of `actors` each actor id has, most frequent first.
#
# `value_counts()` labels its result differently across pandas versions: from
# pandas 2.0 on the Series is named 'count' and the index takes the original
# column name. The name and the index label are pinned explicitly so that the
# counts always live in a column called 'actorID' and the index (the actor
# ids) is unnamed, which is what the cells below rely on.
actor_counts = (
    actors['actorID']
    .value_counts()
    .rename('actorID')
    .rename_axis(None)
    .to_frame()
)

In [5]:
# Preview the raw credits table (columns: movieID, actorID, actorName, ranking).
actors

Unnamed: 0,movieID,actorID,actorName,ranking
0,1,annie_potts,Annie Potts,10
1,1,bill_farmer,Bill Farmer,20
2,1,don_rickles,Don Rickles,3
3,1,erik_von_detten,Erik von Detten,13
4,1,greg-berg,Greg Berg,17
...,...,...,...,...
231737,65133,rik_mayall,Rik Mayall,6
231738,65133,rowan_atkinson,Rowan Atkinson,7
231739,65133,stephen_fry,Stephen Fry,8
231740,65133,tim_mcinnerny,Tim McInnerny,9


In [6]:
# Preview the per-actor row counts (sorted from most to least frequent).
actor_counts

Unnamed: 0,actorID
samuel_l_jackson,72
robert_de_niro,70
steve_buscemi,66
christopher_walken,64
robert_duvall,62
...,...
joe_cocker_and_the_grease_band,1
eddy_habbema,1
citizens_of_odessa,1
kemaya_kidwai,1


In [7]:
# Keep only the actors that occur in more than 14 rows of `actors`.
# The 'actorID' column of `actor_counts` holds the counts; its index holds the ids.
is_frequent = actor_counts['actorID'].gt(14)
actors_selected = actor_counts[is_frequent]

In [8]:
# Preview the counts of the actors that passed the frequency filter.
actors_selected

Unnamed: 0,actorID
samuel_l_jackson,72
robert_de_niro,70
steve_buscemi,66
christopher_walken,64
robert_duvall,62
...,...
penelope_wilton,15
don_brodie,15
mekhi_phifer,15
robert_blake,15


In [9]:
# Swap the filtered count table for the matching credit rows of `actors`.
# Looking the ids up with `.loc` keeps the order of `actors_selected.index`.
# NOTE: this rebinds `actors_selected` (counts -> credit rows), so the cell
# that builds the count filter has to be re-run before re-running this one.
frequent_ids = actors_selected.index
credits_by_actor = actors.set_index('actorID')
actors_selected = credits_by_actor.loc[frequent_ids].reset_index()

In [10]:
# Preview the credit rows of the retained actors.
actors_selected

Unnamed: 0,actorID,movieID,actorName,ranking
0,samuel_l_jackson,165,Samuel L. Jackson,3
1,samuel_l_jackson,241,Samuel L. Jackson,5
2,samuel_l_jackson,259,Samuel L. Jackson,2
3,samuel_l_jackson,271,Samuel L. Jackson,6
4,samuel_l_jackson,296,Samuel L. Jackson,2
...,...,...,...,...
40365,nadim-sawalha,7205,Nadim Sawalha,8
40366,nadim-sawalha,32617,Nadim Sawalha,12
40367,nadim-sawalha,40583,Nadim Sawalha,17
40368,nadim-sawalha,44494,Nadim Sawalha,19


In [11]:
# Preview the movieID -> imdbID lookup table.
mapping

Unnamed: 0,movieID,imdbID
0,1,tt0114709
1,2,tt0113497
2,3,tt0107050
3,4,tt0114885
4,5,tt0113041
...,...,...
10192,65088,tt0960731
10193,65091,tt0025464
10194,65126,tt1024715
10195,65130,tt0959337


In [12]:
# Attach the imdbID to every retained credit row (inner merge on movieID),
# then collapse the rows into one entry per imdbID that holds the list of
# retained actor ids for that movie.
credits_with_imdb = actors_selected.merge(mapping, on='movieID')
actor_lists = credits_with_imdb.groupby('imdbID')['actorID'].apply(list)
actors_grouped = actor_lists.reset_index(name='actors')

In [13]:
# Preview the per-movie actor lists (columns: imdbID, actors).
actors_grouped

Unnamed: 0,imdbID,actors
0,tt0000439,[tom_london]
1,tt0004972,"[donald_crisp, monte_blue, mae_marsh]"
2,tt0006864,"[donald_crisp, monte_blue, mae_marsh]"
3,tt0009968,[donald_crisp]
4,tt0011237,[fritz_feld]
...,...,...
7351,tt1185834,"[samuel_l_jackson, 1008946-christopher_lee, ja..."
7352,tt1190617,"[dennis_hopper, james_woods, jon_voight, david..."
7353,tt1205489,"[clint_eastwood, john_carroll_lynch]"
7354,tt1213644,"[tony_cox, carmen_electra]"


In [14]:
# Encoder used below to turn each movie's list of actor ids into a 0/1 row.
mlb = MultiLabelBinarizer()

In [15]:
# Binarize the per-movie actor lists: one row per movie, one 0/1 column per
# actor label seen during fitting.
actor_indicators = mlb.fit_transform(actors_grouped['actors'])

# Reuse the index of `actors_grouped` so that the index-aligned join performed
# later matches every indicator row to its movie even if `actors_grouped` does
# not carry a default 0..n-1 index. Column labels stay positional integers;
# presumably column i corresponds to mlb.classes_[i] -- verify before relying on it.
actors_enc = pd.DataFrame(actor_indicators, index=actors_grouped.index)

In [16]:
# Preview the 0/1 indicator matrix produced by the binarizer.
actors_enc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1757,1758,1759,1760,1761,1762,1763,1764,1765,1766
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7351,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7352,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7353,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7354,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Append the indicator columns to the per-movie table (aligned on the index).
# Only the two base columns are taken from `actors_grouped`, so re-running
# this cell rebuilds the same frame instead of raising ValueError because the
# indicator columns were already joined by a previous run.
base_columns = ['imdbID', 'actors']
actors_grouped = actors_grouped[base_columns].join(actors_enc)

In [18]:
# Preview the combined frame: imdbID, actor list and the indicator columns.
actors_grouped

Unnamed: 0,imdbID,actors,0,1,2,3,4,5,6,7,...,1757,1758,1759,1760,1761,1762,1763,1764,1765,1766
0,tt0000439,[tom_london],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0004972,"[donald_crisp, monte_blue, mae_marsh]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0006864,"[donald_crisp, monte_blue, mae_marsh]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,tt0009968,[donald_crisp],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,tt0011237,[fritz_feld],0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7351,tt1185834,"[samuel_l_jackson, 1008946-christopher_lee, ja...",0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7352,tt1190617,"[dennis_hopper, james_woods, jon_voight, david...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7353,tt1205489,"[clint_eastwood, john_carroll_lynch]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7354,tt1213644,"[tony_cox, carmen_electra]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
