# Data Imputation and Feature Engineering

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from collections import Counter

In [3]:
import os 
import sys

In [4]:
from tqdm import tqdm

In [5]:
tqdm.pandas()

  from pandas import Panel


In [39]:
from pandarallel import pandarallel

In [40]:
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Utils

In [10]:
def split_names(x):
    
    """
    A df function that will split a string of names for further processing
    """
    
    return [n for n in x.split(',') if (not pd.isna(x) and not x == '')]

## Data Imputation

The first step is to impute missing data from our datasets.
Feature to be imputed are:
- Genres
- runtime
- num votes


### Genres

In [11]:
title_rating = pd.read_csv('processed/title_rating.csv')

In [12]:
title_rating.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1970-01-01 00:00:00.000001906,,70.0,"Action,Adventure,Biography",6.0,754
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1970-01-01 00:00:00.000001907,,90.0,Drama,4.6,17
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1970-01-01 00:00:00.000001907,,,Drama,4.5,23
4,tt0000630,movie,Hamlet,Amleto,0,1970-01-01 00:00:00.000001908,,,Drama,3.8,24


In [13]:
title_rating.genres.fillna('', inplace = True)

In [14]:
title_rating.genres.isna().sum()

0

We have 11,666 values that need to be imputed

**Imputation strategy**

One way to impute genre values is to consider the people working on that movie. Generally people tend to work on similar movies and we can use this hypothesis to impute genre values 

In [15]:
crew = pd.read_csv('processed/title_rating_crew.csv')
principal = pd.read_csv('processed/title_rating_principal.csv')

In [16]:
crew['directors'].replace({'\\N': ''}, inplace = True)
crew['writers'].replace({'\\N': ''}, inplace = True)
principal['nconst'].replace({'\\N': ''}, inplace = True)

In [17]:
names = pd.read_csv('processed/name_basics.csv')

In [18]:
crew

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,directors,writers
0,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,nm0063413,"nm0063413,nm0657268,nm0675388"
1,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1970-01-01 00:00:00.000001906,,70.0,"Action,Adventure,Biography",6.0,754,nm0846879,nm0846879
2,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1970-01-01 00:00:00.000001907,,90.0,Drama,4.6,17,nm0141150,nm0141150
3,tt0000615,movie,Robbery Under Arms,Robbery Under Arms,0,1970-01-01 00:00:00.000001907,,,Drama,4.5,23,nm0533958,"nm0092809,nm0533958"
4,tt0000630,movie,Hamlet,Amleto,0,1970-01-01 00:00:00.000001908,,,Drama,3.8,24,nm0143333,nm0000636
...,...,...,...,...,...,...,...,...,...,...,...,...,...
323829,tt9916362,movie,Coven,Akelarre,0,2020-01-01,,92.0,"Drama,History",6.4,4447,nm1893148,"nm1893148,nm3471432"
323830,tt9916428,movie,The Secret of China,Hong xing zhao yao Zhong guo,0,2019-01-01,,,"Adventure,History,War",3.8,14,nm0910951,
323831,tt9916460,tvMovie,Pink Taxi,Pink Taxi,0,2019-01-01,,,Comedy,9.3,17,nm7048843,"nm7048843,nm8691452"
323832,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019-01-01,,123.0,Drama,8.3,6,nm4457074,"nm4843252,nm4900525,nm2679404"


In [19]:
principal

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,ordering,nconst,category,job,characters
0,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,1,nm0215752,actor,\N,\N
1,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,2,nm0252720,actor,\N,\N
2,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,3,nm0063413,director,\N,\N
3,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,4,nm0657268,writer,\N,\N
4,tt0000502,movie,Bohemios,Bohemios,0,1970-01-01 00:00:00.000001905,,100.0,,4.5,14,5,nm0675388,writer,\N,\N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2885442,tt9916730,movie,6 Gunn,6 Gunn,0,2017-01-01,,116.0,,8.4,5,5,nm10538612,director,\N,\N
2885443,tt9916730,movie,6 Gunn,6 Gunn,0,2017-01-01,,116.0,,8.4,5,6,nm10538614,producer,producer,\N
2885444,tt9916730,movie,6 Gunn,6 Gunn,0,2017-01-01,,116.0,,8.4,5,7,nm10538613,producer,associate producer,\N
2885445,tt9916730,movie,6 Gunn,6 Gunn,0,2017-01-01,,116.0,,8.4,5,8,nm1957275,cinematographer,\N,\N


In [20]:
#creating some useful data stuctures for further processing
title_genre = title_rating[['tconst', 'genres']]

In [21]:
title_genre['genres'] = title_genre['genres'].apply(lambda x: split_names(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [22]:
title_genre_dict = title_genre.set_index('tconst').to_dict()['genres']

In [23]:
names['knownForTitles'].fillna('', inplace = True)

In [24]:
names = names[['nconst', 'knownForTitles']]

In [25]:
names['knownForTitles'] = names['knownForTitles'].apply(lambda x: split_names(x))


In [28]:
names

Unnamed: 0,nconst,knownForTitles
0,nm0000001,"[tt0050419, tt0031983, tt0072308, tt0053137]"
1,nm0000002,"[tt0037382, tt0038355, tt0071877, tt0117057]"
2,nm0000003,"[tt0057345, tt0056404, tt0054452, tt0049189]"
3,nm0000004,"[tt0080455, tt0078723, tt0072562, tt0077975]"
4,nm0000005,"[tt0050986, tt0083922, tt0060827, tt0069467]"
...,...,...
964582,nm9993616,[tt4844148]
964583,nm9993650,[tt8739208]
964584,nm9993690,[tt7888884]
964585,nm9993691,[tt7888884]


In [27]:
names['nconst'].dropna(inplace = True) 

In [29]:
names_title_dict = names.set_index('nconst').to_dict()['knownForTitles']

In [39]:
t = 'tt0000502'

In [51]:
people = set()

In [52]:
people.update(principal[principal['tconst'] == t]['nconst'].values)

In [54]:
people.update(split_names(crew[crew['tconst'] == t]['directors'].values[0]))
people.update(split_names(crew[crew['tconst'] == t]['writers'].values[0]))

In [55]:
people

{'nm0063413', 'nm0215752', 'nm0252720', 'nm0657268', 'nm0675388'}

In [67]:
related = set()
for p in people:
    print(names_title_dict[p])
    related.update(names_title_dict[p])

['tt0004406', 'tt0000941', 'tt0002886', 'tt0024495']
['tt0000502']
['tt0031112', 'tt0208070', 'tt0088954', 'tt0064102']
['tt0088954', 'tt0064102', 'tt0208070', 'tt0031112']
['tt0000502']


In [75]:
genres = []
for r in related:
    try:
        genres = genres+title_genre_dict[r]
    except:
        pass

In [77]:
genres

['Comedy', 'Musical', 'Drama', 'Comedy', 'Musical']

In [78]:
c = Counter(genres)

In [83]:
c.most_common(2)

[('Comedy', 2), ('Musical', 2)]

In [30]:
def impute_genre(title, names_title_dict, 
                 title_genre_dict, 
                 principal,
                 crew):
    people = set()
    
    people.update(principal[principal['tconst'] == title]['nconst'].values)
    people.update(split_names(crew[crew['tconst'] == title]['directors'].values[0]))
    people.update(split_names(crew[crew['tconst'] == title]['writers'].values[0]))
    
    related = set()
    print(people)
    for p in people:
#         print(names_title_dict[p])
        related.update(names_title_dict[p])
    
    genres = []
    for r in related:
        try:
            genres = genres+title_genre_dict[r]
        except:
            pass
        
    c = Counter(genres)
    comm = c.most_common(2)
#     print(comm)
    impute = []
    for item in comm:
        impute.append(item[0])
        
    return impute
    

In [35]:
impute_genre('tt0076430', names_title_dict, 
                 title_genre_dict, 
                 principal,
                 crew)

{'nm0718213', 'nm0029787', 'nm0005777', 'nm0541004', 'nm0402895', 'nm0802445', 'nm0352278', 'nm0116753', 'nm0081411', 'nm0036120'}


['Drama', 'Comedy']

In [None]:
title_rating['genres'] == title_rating.parallel_apply(lambda x: impute_genre(x['tconst'], names_title_dict, 
                                                                             title_genre_dict, 
                                                                             principal,
                                                                             crew) if x['genres'] == '' else x['genres'])