# Example for feature one-hot encoding :)

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.decomposition import PCA

In [2]:
import sklearn

In [36]:
ratings = pd.read_csv('../../data/preprocessed/ratings_clean_std_0.csv', sep=',').rename(columns={'Unnamed: 0': 'rating_id'})
movies = pd.read_csv('../../data/preprocessed/movies_id_updated.csv')
actors = pd.read_csv('../../data/raw/actors.csv', sep=',')
countries = pd.read_csv('../../data/raw/countries.csv', sep=',')
directors = pd.read_csv('../../data/raw/directors.csv', sep=',')
genres = pd.read_csv('../../data/raw/genres.csv', sep=',')
locations = pd.read_csv('../../data/raw/locations.csv', sep=',')
movie_tags = pd.read_csv('../../data/raw/movie_tags.csv', sep=',')
tags = pd.read_csv('../../data/raw/tags.csv', sep=',')
omdb = pd.read_csv('../../data/preprocessed/omdb_cleaned.csv')

Define mapping

In [8]:
mapping = movies[['id', 'imdbID']].rename(columns={'id':'movieID'})

In [7]:
movies

Unnamed: 0,id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtPictureURL
0,1,Toy story,tt0114709,Toy story (juguetes),http://ia.media-imdb.com/images/M/MV5BMTMwNDU0...,1995.0,toy_story,http://content7.flixster.com/movie/10/93/63/10...
1,2,Jumanji,tt0113497,Jumanji,http://ia.media-imdb.com/images/M/MV5BMzM5NjE1...,1995.0,1068044-jumanji,http://content8.flixster.com/movie/56/79/73/56...
2,3,Grumpy Old Men,tt0107050,Dos viejos gruñones,http://ia.media-imdb.com/images/M/MV5BMTI5MTgy...,1993.0,grumpy_old_men,http://content6.flixster.com/movie/25/60/25602...
3,4,Waiting to Exhale,tt0114885,Esperando un respiro,http://ia.media-imdb.com/images/M/MV5BMTczMTMy...,1995.0,waiting_to_exhale,http://content9.flixster.com/movie/10/94/17/10...
4,5,Father of the Bride Part II,tt0113041,Vuelve el padre de la novia (Ahora también abu...,http://ia.media-imdb.com/images/M/MV5BMTg1NDc2...,1995.0,father_of_the_bride_part_ii,http://content8.flixster.com/movie/25/54/25542...
...,...,...,...,...,...,...,...,...
10192,65088,Bedtime Stories,tt0960731,Más allá de los sueños,http://ia.media-imdb.com/images/M/MV5BMjA5Njk5...,2008.0,bedtime_stories,http://content6.flixster.com/movie/10/94/33/10...
10193,65091,Manhattan Melodrama,tt0025464,El enemigo público número 1,http://ia.media-imdb.com/images/M/MV5BMTUyODE3...,1934.0,manhattan_melodrama,http://content9.flixster.com/movie/66/44/64/66...
10194,65126,Choke,tt1024715,Choke,http://ia.media-imdb.com/images/M/MV5BMTMxMDI4...,2008.0,choke,http://content6.flixster.com/movie/10/85/09/10...
10195,65130,Revolutionary Road,tt0959337,Revolutionary Road,http://ia.media-imdb.com/images/M/MV5BMTI2MzY2...,2008.0,revolutionary_road,http://content8.flixster.com/movie/10/88/40/10...


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 787541 entries, 0 to 787540
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  787541 non-null  int64  
 1   user_id     787541 non-null  int64  
 2   imdbID      787541 non-null  object 
 3   rating      787541 non-null  float64
dtypes: float64(1), int64(2), object(1)
memory usage: 24.0+ MB


Genres

In [12]:
len(genres['genre'].value_counts())

20

In [13]:
genres['genre'].unique()

array(['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Mystery', 'Sci-Fi', 'IMAX', 'Documentary', 'War', 'Musical',
       'Film-Noir', 'Western', 'Short'], dtype=object)

In [16]:
genres['movieID'].value_counts()

51709    8
2987     7
56152    7
46948    7
673      6
        ..
4988     1
7751     1
3653     1
3637     1
4286     1
Name: movieID, Length: 10197, dtype: int64

In [37]:
genres_grouped = genres.merge(mapping, on='movieID').groupby('imdbID')['genre'].apply(list).reset_index(name='genres')

In [38]:
genres_grouped = ratings.merge(genres_grouped, on='imdbID')

In [39]:
ratings

Unnamed: 0,rating_id,user_id,imdbID,rating
0,0,1264,tt0047034,3.5
1,1,213,tt0304141,2.5
2,2,593,tt0369436,3.0
3,3,609,tt1077258,4.0
4,4,1590,tt0052182,4.0
...,...,...,...,...
787536,812812,1032,tt0083530,3.0
787537,812813,99,tt0107798,3.0
787538,812814,333,tt0093857,3.0
787539,812815,49,tt0144168,3.0


In [40]:
mlb = MultiLabelBinarizer()

In [41]:
genres_encoded = mlb.fit_transform(genres_grouped['genres'])

In [42]:
genres_encoded.shape

(787541, 20)

In [43]:
genres_grouped = genres_grouped.join(pd.DataFrame(genres_encoded))

In [47]:
genres_grouped = genres_grouped.sort_values('rating_id').drop(columns={'genres'})

In [48]:
genres_grouped

Unnamed: 0,rating_id,user_id,imdbID,rating,0,1,2,3,4,5,...,10,11,12,13,14,15,16,17,18,19
0,0,1264,tt0047034,3.5,1,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
71,1,213,tt0304141,2.5,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
816,2,593,tt0369436,3.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
824,3,609,tt1077258,4.0,1,1,0,0,1,1,...,1,0,0,1,0,0,0,1,0,0
970,4,1590,tt0052182,4.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
514716,812812,1032,tt0083530,3.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
351470,812813,99,tt0107798,3.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
553466,812814,333,tt0093857,3.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
238518,812815,49,tt0144168,3.0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


Actors

In [105]:
# select the x most prominent actors per movie
actors_selected = actors[actors['ranking']<3]

In [106]:
actors

Unnamed: 0,movieID,actorID,actorName,ranking
0,1,annie_potts,Annie Potts,10
1,1,bill_farmer,Bill Farmer,20
2,1,don_rickles,Don Rickles,3
3,1,erik_von_detten,Erik von Detten,13
4,1,greg-berg,Greg Berg,17
...,...,...,...,...
231737,65133,rik_mayall,Rik Mayall,6
231738,65133,rowan_atkinson,Rowan Atkinson,7
231739,65133,stephen_fry,Stephen Fry,8
231740,65133,tim_mcinnerny,Tim McInnerny,9


In [107]:
actors_selected[actors_selected['movieID']==2]

Unnamed: 0,movieID,actorID,actorName,ranking
27,2,bonnie_hunt,Bonnie Hunt,2
41,2,robin_williams,Robin Williams,1


In [108]:
# merge with imdbID, groupby imdbID and write the x most prominent actors as one entry per movie
actors_grouped = actors_selected.merge(mapping, on='movieID').groupby('imdbID')['actorID'].apply(list).reset_index(name='actors')

In [109]:
actors_grouped

Unnamed: 0,imdbID,actors
0,tt0000439,"[1125873-george_barnes, gilbert_m_anderson]"
1,tt0004972,"[henry_b_walthall, miriam_cooper]"
2,tt0006864,"[lilliangish, mae_marsh]"
3,tt0008133,"[charlie_chaplin, eric_campbell]"
4,tt0008395,"[ivan_mozzhukhin, olga_kondorova]"
...,...,...
9408,tt1229827,"[1195748-kevin_jonas, joe_jonas]"
9409,tt1233247,"[brigitte_lin, leslie_cheung]"
9410,tt1233381,"[hatice_aslan, yavuz-bingol]"
9411,tt1275532,"[bianca_beauchamp, richard_cardinal]"


In [110]:
# merge with ratings to get right order of entries
actors_grouped = ratings.merge(actors_grouped, on='imdbID').sort_values('rating_id')

In [113]:
actors_grouped = actors_grouped.to_numpy()

In [116]:
mlb = MultiLabelBinarizer()

In [115]:
actors_grouped[:,-1]

array([list(['akihiko_hirata', 'momoko_kochi']),
       list(['chicken_shack', 'stan_webb']),
       list(['reese_witherspoon', 'vince_vaughn']), ...,
       list(['curtis_armstrong', 'robert_carradine']),
       list(['1033789-michelle_williams', 'kirsten_dunst']),
       list(['catherine_keener', 'steve_carell'])], dtype=object)

In [119]:
actors_grouped[:,-1]

array([list(['akihiko_hirata', 'momoko_kochi']),
       list(['chicken_shack', 'stan_webb']),
       list(['reese_witherspoon', 'vince_vaughn']), ...,
       list(['curtis_armstrong', 'robert_carradine']),
       list(['1033789-michelle_williams', 'kirsten_dunst']),
       list(['catherine_keener', 'steve_carell'])], dtype=object)

In [120]:
mlb.fit_transform(actors_grouped[:,-1])

MemoryError: Unable to allocate 31.7 GiB for an array with shape (787233, 10814) and data type int32

In [None]:
le = preprocessing.LabelEncoder()
for i in range(0,13):
    data_as_array[:,i] = le.fit_transform(data_as_array[:,i])

In [64]:
actors_encoded = mlb.fit_transform(actors_grouped['actors'])

In [65]:
actors_encoded.shape

(9413, 11059)

In [66]:
actors_grouped = actors_grouped.join(pd.DataFrame(actors_encoded))

In [88]:
actors_grouped.sum()

imdbID    tt0000439tt0004972tt0006864tt0008133tt0008395t...
actors    [1125873-george_barnes, gilbert_m_anderson, he...
0                                                         1
1                                                         1
2                                                         1
                                ...                        
11054                                                     3
11055                                                     1
11056                                                     6
11057                                                     1
11058                                                     1
Length: 11061, dtype: object

In [None]:
dummy = actors_grouped.loc[:,(actors_grouped.loc[:,2:].sum()>1)]

In [96]:
actors_grouped.iloc[:,2:].sum()[actors_grouped.iloc[:,2:].sum()>1].index

Index([    7,    10,    13,    17,    18,    20,    22,    25,    27,    28,
       ...
       11009, 11010, 11015, 11016, 11018, 11027, 11040, 11046, 11054, 11056],
      dtype='object', length=2849)

In [None]:
actors_grouped

In [72]:
actors_grouped[actors_grouped['imdbID']=='tt1349938']['actors']

9412    [_0444681, _1203457, _1564072, _2636108, _2855...
Name: actors, dtype: object

Languages

In [33]:
omdb.columns

Index(['Title', 'Year', 'Rated', 'Runtime', 'Writer', 'Plot', 'Language',
       'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_season', 'Released_month', 'Released_day',
       'PG_Rating', 'Available_languages', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated'],
      dtype='object')

In [36]:
omdb_relevant = omdb[['Runtime', 'Language', 'imdbRating', 'imdbVotes', 'imdbID', 'Rotten Tomatoes', 'Metacritic',
       'Series', 'Released_month', 'PG_Rating', 'Available_languages', 'Oscars_won', 'Oscars_nominated',
       'Golden_globe_won', 'Golden_globe_nominated']]

In [40]:
omdb_relevant.isna().sum()

Runtime                      9
Language                     0
imdbRating                   4
imdbVotes                    5
imdbID                       0
Rotten Tomatoes           1237
Metacritic                4019
Series                       0
Released_month              56
PG_Rating                 2314
Available_languages          0
Oscars_won                   0
Oscars_nominated             0
Golden_globe_won             0
Golden_globe_nominated       0
dtype: int64

In [46]:
# fill nan with median values
omdb_relevant = omdb_relevant.fillna(omdb_relevant.median())

In [50]:
mlb = MultiLabelBinarizer()
languages = pd.DataFrame(mlb.fit_transform(omdb_relevant['Language']))

In [51]:
languages

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,64
0,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9413,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9414,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9415,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9416,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [57]:
omdb_relevant = omdb_relevant.join(languages)

In [58]:
omdb_relevant = omdb_relevant.drop(columns={'Language'})

In [59]:
omdb_relevant

Unnamed: 0,Runtime,imdbRating,imdbVotes,imdbID,Rotten Tomatoes,Metacritic,Series,Released_month,PG_Rating,Available_languages,...,55,56,57,58,59,60,61,62,63,64
0,81.0,8.3,852896.0,tt0114709,10.0,9.5,0,11.0,0.0,1,...,0,1,0,0,0,0,0,0,0,0
1,104.0,7.0,294340.0,tt0113497,5.4,3.9,0,12.0,1.0,2,...,1,1,0,0,0,0,0,0,0,0
2,103.0,7.0,41401.0,tt0107050,6.3,5.3,0,12.0,2.0,1,...,0,1,0,0,0,0,0,0,0,0
3,124.0,5.9,9222.0,tt0114885,5.6,5.9,0,12.0,3.0,1,...,0,1,0,0,0,0,0,0,0,0
4,106.0,6.0,33005.0,tt0113041,4.8,4.9,0,12.0,1.0,1,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9413,99.0,6.0,86128.0,tt0960731,2.6,3.3,0,12.0,1.0,1,...,0,1,0,0,0,0,0,0,0,0
9414,93.0,7.2,3084.0,tt0025464,8.0,5.9,0,5.0,3.0,1,...,0,1,0,0,0,0,0,0,0,0
9415,92.0,6.4,31058.0,tt1024715,5.5,4.7,0,9.0,3.0,1,...,0,1,0,0,0,0,0,0,0,0
9416,33.0,7.7,13510.0,tt0212579,6.8,5.9,0,12.0,3.0,1,...,0,1,0,0,0,0,0,0,0,0
