### Clean the data from the Project

In [None]:
import duckdb

# some DuckDB setup
con = duckdb.connect(database=':memory:')
# enable automatic query parallelization
con.execute("PRAGMA threads=2")
# enable caching of parquet metadata
con.execute("PRAGMA enable_object_cache")

In [54]:
import numpy as np

# Load the provided project data into the duck database
all_training_data = con.execute('''
    SELECT 
        tconst,
        
        -- Clean up the movie title text. Remove excess whitespace, convert to lowercase, convert non-ascii to ascii equivalent, 
        -- remove everything that is non-alpanumeric or a space.
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(primaryTitle)), 'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS pTitle,
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(originalTitle)),'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS oTitle,
        
        -- Flag for indicating changed_title
        CASE
            WHEN pTitle == oTitle OR oTitle is NULL THEN 0
            ELSE 1
        END AS changed_title,
        
        -- Count number of words in title
        LENGTH(pTitle) - LENGTH(REPLACE(pTitle, ' ', '')) + 1 AS n_words,
        
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
        runtimeMinutes,
        numVotes,
        
        -- Convert True/False to 1/0
        CAST(label AS INT) as label
    FROM 'data/train-[1-8].csv'
''').df()

# Replace empty (\N) with nans
all_training_data = all_training_data.replace('\\N', np.nan)

all_training_data

Unnamed: 0,tconst,pTitle,oTitle,changed_title,n_words,Year,runtimeMinutes,numVotes,label
0,tt0010600,the doll,die puppe,1,2,1919,66,1898.0,1
1,tt0011841,way down east,way down east,0,3,1920,145,5376.0,1
2,tt0012494,destiny,der mude tod,1,1,1921,97,5842.0,1
3,tt0015163,the navigator,the navigator,0,2,1924,59,9652.0,1
4,tt0016220,the phantom of the opera,the phantom of the opera,0,5,1925,93,17887.0,1
...,...,...,...,...,...,...,...,...,...
7954,tt9625664,trauma center,,0,2,2019,87,12951.0,0
7955,tt9741310,slaxx,slaxx,0,1,2020,77,2464.0,0
7956,tt9742392,kindred,kindred,0,1,2020,101,1719.0,0
7957,tt9850386,the bee gees how can you mend a broken heart,,0,10,2020,111,4144.0,1


In [12]:
import os
import json
import pyarrow as pa
import pyarrow.csv
import pyarrow.json
import pyarrow.parquet as pq

project_dir = os.getcwd()
data_dir = project_dir + "/data"

In [13]:
directing_filepath = project_dir + "/data/directing.json"
with open(directing_filepath, 'r') as json_file:
    json_data = json.load(json_file)
movies = json_data['movie'].values()
directors = json_data['director'].values()
movies = pa.array(movies)
directors = pa.array(directors)
names = ['movie', 'director']
table = pa.Table.from_arrays([movies, directors], names=names)

In [14]:
pq.write_table(table, project_dir+"/data_parquet/directing.parquet")

In [15]:
directing_filepath = project_dir + "/data/writing.json"
with open(directing_filepath, 'r') as json_file:
    json_data = json.load(json_file)
movies = [entry['movie'] for entry in json_data]
writers = [entry['writer'] for entry in json_data]
movies = pa.array(movies)
writers = pa.array(writers)
names = ['movie', 'writer']
table = pa.Table.from_arrays([movies, writers], names=names)
pq.write_table(table, project_dir + "/data_parquet/writing.parquet")

In [16]:
directing = "'data_parquet/directing.parquet'"
writing = "'data_parquet/writing.parquet'"

In [17]:
query = f"""
        select 
            movie
            , director
        from {directing}
"""

In [18]:
table_directing = con.execute(query).df()
table_directing

Unnamed: 0,movie,director
0,tt0003740,nm0665163
1,tt0008663,nm0803705
2,tt0009369,nm0428059
3,tt0009369,nm0949648
4,tt0010307,nm0304098
...,...,...
11157,tt9850344,nm0284774
11158,tt9850386,nm0550881
11159,tt9900782,nm7992231
11160,tt9904802,nm0052054


In [19]:
query = f"""
        select 
            movie
            , writer
        from {writing}
"""

In [20]:
table_writing = con.execute(query).df()
table_writing

Unnamed: 0,movie,writer
0,tt0003740,nm0195339
1,tt0003740,nm0515385
2,tt0003740,nm0665163
3,tt0003740,nm0758215
4,tt0008663,nm0406585
...,...,...
22423,tt9904802,nm0942647
22424,tt9904802,nm3853396
22425,tt9911196,nm2063122
22426,tt9911196,nm0277932


### Identify the key and check if there is any duplicated key of each dataset

In [38]:
import pandas as pd

print("Original size of the training dataset ", all_training_data.shape)
all_training_data = all_training_data.drop_duplicates(subset=['tconst','Year'])
print("New size of the dataset after dropping duplicated key ",all_training_data.shape)

Original size of the dataset  (7959, 9)
New size of the dataset after dropping duplicated key  (7959, 9)


In [40]:
print("Original size of the writer dataset ", table_writing.shape)
table_writing = table_writing.drop_duplicates(table_writing.columns)
print("New size of the writer dataset after dropping duplicated key ",table_writing.shape)

Original size of the writer dataset  (22428, 3)
New size of the writer dataset after dropping duplicated key  (22428, 3)


In [42]:
print("Original size of the writer dataset ", table_directing.shape)
table_directing = table_directing.drop_duplicates(table_directing.columns)
print("New size of the writer dataset after dropping duplicated key ",table_directing.shape)

Original size of the writer dataset  (11162, 3)
New size of the writer dataset after dropping duplicated key  (11162, 3)


In [23]:
#!pip install scipy



### Outlier detection - quantitative - numeric features

In [86]:
from scipy.stats import trim_mean

numeric_columns = ['Year', 'runtimeMinutes', 'numVotes']
for column in numeric_columns:
    all_training_data[column] = pd.to_numeric(all_training_data[column])
    temp_cleaned = all_training_data.dropna(subset=[column])
    
    trimmed_mean_year_start = trim_mean(temp_cleaned[column], proportiontocut=0)
    trimmed_mean_year_end = trim_mean(temp_cleaned[column], proportiontocut=0.01)
    
    if ((trimmed_mean_year_start / trimmed_mean_year_end - 1) > 0.1):
        print("Potential outlier in ", column)
        std_dev = np.std(temp_cleaned[column])
        outliers_mask = (temp_cleaned[column] > (trimmed_mean_year_end + 3 * std_dev))
        
        if outliers_mask.any():
            temp_cleaned.loc[outliers_mask, column] = np.nan
            print("Overwrite the suspected outliers with NaN!")

Potential outlier in  numVotes
Overwrite the suspected outliers with NaN!


In [87]:
outliers_mask[outliers_mask == True]

143     True
164     True
252     True
367     True
394     True
        ... 
7571    True
7575    True
7602    True
7667    True
7759    True
Name: numVotes, Length: 123, dtype: bool

The mean and trimmed-mean (trim 1% of the data) of Year and runtimeMinutes columns are very close, so no outlier for these two columns

### Missing handling imputation within data integration phase using supervised ML

Merge the training dataset with writer, directer and wikipedia datasets

In [90]:
# Calculate the number of films worked on previously by the writers and directors
# Does not include the current film

# Load json files and drop empty rows
directors = pd.read_json('data/directing.json').replace('\\N', np.nan).dropna()
writers = pd.read_json('data/writing.json').replace('\\N', np.nan).dropna()

crew_experience = con.execute('''
    SELECT directors.movie, writer, director,
    GREATEST(0, COUNT(DISTINCT writers.movie) OVER(PARTITION BY writer ORDER BY writers.movie)-1) AS writer_experience,
    GREATEST(0, COUNT(DISTINCT directors.movie) OVER(PARTITION BY director ORDER BY directors.movie)-1) AS director_experience,
    FROM directors
    FULL OUTER JOIN writers ON directors.movie == writers.movie
    ORDER BY writers.movie
''').df()

In [89]:
wiki_data_cleaned_ohe = pd.read_csv('wiki_data_clean.csv', index_col = False)
wiki_data_cleaned_ohe.drop(columns = ['Unnamed: 0'], inplace=True)
wiki_data_cleaned_ohe.columns

Index(['tconst', 'pTitle', 'Year', 'runtimeMinutes', 'numVotes', 'wiki_key',
       'Runtime_minutes', 'action', 'adventure', 'animation', 'biographical',
       'comedy', 'crime', 'drama', 'documentary', 'fantasy', 'historical',
       'horror', 'musical', 'mystery', 'romance', 'science_fiction', 'sport',
       'thriller', 'war', 'western', 'academy_award', 'bafta', 'United_States',
       'India', 'United_Kingdom', 'Canada', 'English', 'French', 'Japanese',
       'Italian', 'Spanish', 'Budget', 'Budget_cleaned', 'Box_office',
       'Box_office_cleaned', 'profit_pc', 'missing_finances'],
      dtype='object')

In [94]:
common_columns = all_training_data.columns.intersection(wiki_data_cleaned_ohe.columns).tolist()
common_columns

['tconst', 'pTitle', 'Year', 'runtimeMinutes', 'numVotes']

In [183]:
combined_all_data = pd.merge(all_training_data, wiki_data_cleaned_ohe, how = 'left', on = 'tconst')

In [184]:
for column in common_columns:
    if (column in all_training_data) and (column != 'tconst'):
        combined_all_data[column+'_x'] = combined_all_data[column+'_x'].fillna(combined_all_data[column+'_y'])

In [185]:
# Clean the title after the merge
columns_to_drop = [col for col in combined_all_data.columns if col.endswith('_y')]
combined_all_data = combined_all_data.drop(columns = columns_to_drop)

In [186]:
combined_all_data = combined_all_data.rename(columns = {col: col.rstrip('_x') for col in combined_all_data.columns if col.endswith('_x')})

In [187]:
combined_all_data

Unnamed: 0,tconst,pTitle,oTitle,changed_title,n_words,Year,runtimeMinutes,numVotes,label,wiki_key,...,French,Japanese,Italian,Spanish,Budget,Budget_cleaned,Box_office,Box_office_cleaned,profit_pc,missing_finances
0,tt0010600,the doll,die puppe,1,2,1919,66.0,1898.0,1,The_Doll_(1919_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.0,0.000000,1.0
1,tt0011841,way down east,way down east,0,3,1920,145.0,5376.0,1,Way_Down_East,...,0.0,0.0,0.0,0.0,"""$800,000[1] or $635,000[2]""",8.000006e+11,"""$7,500,000[2]""",7500000.0,-99.999063,1.0
2,tt0012494,destiny,der mude tod,1,1,1921,97.0,5842.0,1,Destiny_(1921_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.0,0.000000,1.0
3,tt0015163,the navigator,the navigator,0,2,1924,59.0,9652.0,1,The_Navigator_(1924_film),...,0.0,0.0,0.0,0.0,"""$385,000""",3.850000e+05,"""$680,406""",680406.0,76.728831,0.0
4,tt0016220,the phantom of the opera,the phantom of the opera,0,5,1925,93.0,17887.0,1,The_Phantom_of_the_Opera_(1925_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$2 million\n$1 million (1929 Sound Re-release)""",21000000.0,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7954,tt9625664,trauma center,,0,2,2019,87.0,12951.0,0,Trauma_Center_(film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$92,968[1]""",92968.0,0.000000,1.0
7955,tt9741310,slaxx,slaxx,0,1,2020,77.0,2464.0,0,Slaxx,...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.0,0.000000,1.0
7956,tt9742392,kindred,kindred,0,1,2020,101.0,1719.0,0,Kindred_(film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$8,921[2][3]""",8921.0,0.000000,1.0
7957,tt9850386,the bee gees how can you mend a broken heart,,0,10,2020,111.0,4144.0,1,,...,,,,,,,,,,


The missing data is MAR (missing at random - missingness depends on the other observed features). we use RandomForest to fill in the missing values.

In [188]:
combined_all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 47 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tconst              7959 non-null   object 
 1   pTitle              7959 non-null   object 
 2   oTitle              3971 non-null   object 
 3   changed_title       7959 non-null   int32  
 4   n_words             7959 non-null   int64  
 5   Year                7959 non-null   int64  
 6   runtimeMinutes      7950 non-null   object 
 7   numVotes            7169 non-null   float64
 8   label               7959 non-null   int32  
 9   wiki_key            6723 non-null   object 
 10  Runtime_minutes     6498 non-null   float64
 11  action              6723 non-null   float64
 12  adventure           6723 non-null   float64
 13  animation           6723 non-null   float64
 14  biographical        6723 non-null   float64
 15  comedy              6723 non-null   float64
 16  crime 

In [189]:
missing_data_columns = combined_all_data.columns[combined_all_data.isnull().any()].tolist()
missing_data_columns

['oTitle',
 'runtimeMinutes',
 'numVotes',
 'wiki_key',
 'Runtime_minutes',
 'action',
 'adventure',
 'animation',
 'biographical',
 'comedy',
 'crime',
 'drama',
 'documentary',
 'fantasy',
 'historical',
 'horror',
 'musical',
 'mystery',
 'romance',
 'science_fiction',
 'sport',
 'thriller',
 'war',
 'western',
 'academy_award',
 'bafta',
 'United_States',
 'India',
 'United_Kingdom',
 'Canada',
 'English',
 'French',
 'Japanese',
 'Italian',
 'Spanish',
 'Budget',
 'Budget_cleaned',
 'Box_office',
 'Box_office_cleaned',
 'profit_pc',
 'missing_finances']

In [197]:
allowed_missing_columns = ['pTitle', 'oTitle','runtimeMinutes', 'wiki_key', 'Runtime_minutes', 'Budget', 'Box_office']

In [198]:
# List of columns to fill in missing values:
output_columns = [column for column in missing_data_columns if column not in allowed_missing_columns]

In [199]:
output_columns

['numVotes',
 'action',
 'adventure',
 'animation',
 'biographical',
 'comedy',
 'crime',
 'drama',
 'documentary',
 'fantasy',
 'historical',
 'horror',
 'musical',
 'mystery',
 'romance',
 'science_fiction',
 'sport',
 'thriller',
 'war',
 'western',
 'academy_award',
 'bafta',
 'United_States',
 'India',
 'United_Kingdom',
 'Canada',
 'English',
 'French',
 'Japanese',
 'Italian',
 'Spanish',
 'Budget_cleaned',
 'Box_office_cleaned',
 'profit_pc',
 'missing_finances']

In [193]:
# List of columns used to predict the missing value:
combined_all_data.columns

Index(['tconst', 'pTitle', 'oTitle', 'changed_title', 'n_words', 'Year',
       'runtimeMinutes', 'numVotes', 'label', 'wiki_key', 'Runtime_minutes',
       'action', 'adventure', 'animation', 'biographical', 'comedy', 'crime',
       'drama', 'documentary', 'fantasy', 'historical', 'horror', 'musical',
       'mystery', 'romance', 'science_fiction', 'sport', 'thriller', 'war',
       'western', 'academy_award', 'bafta', 'United_States', 'India',
       'United_Kingdom', 'Canada', 'English', 'French', 'Japanese', 'Italian',
       'Spanish', 'Budget', 'Budget_cleaned', 'Box_office',
       'Box_office_cleaned', 'profit_pc', 'missing_finances'],
      dtype='object')

In [201]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

for column in output_columns:
    excluded_features = ['tconst', 'pTitle', 'oTitle', 'runtimeMinutes', 'wiki_key', 'Runtime_minutes', 'Budget', 'Box_office', 'label', column]
    input_columns = [feature for feature in combined_all_data.columns if feature not in excluded_features]
    print("Imputing missing values for column ", column)
    df_with_feature = combined_all_data.dropna(subset=[column])
    df_with_feature = df_with_feature.select_dtypes(include=['float32', 'float64', 'int32', 'int64'])
    df_missing_feature = combined_all_data[combined_all_data[column].isnull()].drop(column, axis=1)
    df_missing_feature = df_missing_feature.select_dtypes(include=['float32', 'float64', 'int32', 'int64'])

    X = df_with_feature[input_columns].dropna()  # Features
    y = df_with_feature.loc[X.index, column]  # Target variable

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest Regressor
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    # Predict 'numVotes' for the missing values
    X_missing = df_missing_feature[input_columns].fillna(0)  # Ensure no missing features
    predicted_values = rf_regressor.predict(X_missing)
    df_missing_feature.loc[X_missing.index, column] = predicted_values

    # Combine the datasets back together
    combined_all_data[column] = combined_all_data[column].fillna(df_missing_feature[column])


Imputing missing values for column  numVotes
Imputing missing values for column  action
Imputing missing values for column  adventure
Imputing missing values for column  animation
Imputing missing values for column  biographical
Imputing missing values for column  comedy
Imputing missing values for column  crime
Imputing missing values for column  drama
Imputing missing values for column  documentary
Imputing missing values for column  fantasy
Imputing missing values for column  historical
Imputing missing values for column  horror
Imputing missing values for column  musical
Imputing missing values for column  mystery
Imputing missing values for column  romance
Imputing missing values for column  science_fiction
Imputing missing values for column  sport
Imputing missing values for column  thriller
Imputing missing values for column  war
Imputing missing values for column  western
Imputing missing values for column  academy_award
Imputing missing values for column  bafta
Imputing missin

In [202]:
combined_all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 47 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tconst              7959 non-null   object 
 1   pTitle              7959 non-null   object 
 2   oTitle              3971 non-null   object 
 3   changed_title       7959 non-null   int32  
 4   n_words             7959 non-null   int64  
 5   Year                7959 non-null   int64  
 6   runtimeMinutes      7950 non-null   object 
 7   numVotes            7959 non-null   float64
 8   label               7959 non-null   int32  
 9   wiki_key            6723 non-null   object 
 10  Runtime_minutes     6498 non-null   float64
 11  action              7959 non-null   float64
 12  adventure           7959 non-null   float64
 13  animation           7959 non-null   float64
 14  biographical        7959 non-null   float64
 15  comedy              7959 non-null   float64
 16  crime 

In [203]:
combined_all_data

Unnamed: 0,tconst,pTitle,oTitle,changed_title,n_words,Year,runtimeMinutes,numVotes,label,wiki_key,...,French,Japanese,Italian,Spanish,Budget,Budget_cleaned,Box_office,Box_office_cleaned,profit_pc,missing_finances
0,tt0010600,the doll,die puppe,1,2,1919,66.0,1898.0,1,The_Doll_(1919_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.000000e+00,0.000000,1.0
1,tt0011841,way down east,way down east,0,3,1920,145.0,5376.0,1,Way_Down_East,...,0.0,0.0,0.0,0.0,"""$800,000[1] or $635,000[2]""",8.000006e+11,"""$7,500,000[2]""",7.500000e+06,-99.999063,1.0
2,tt0012494,destiny,der mude tod,1,1,1921,97.0,5842.0,1,Destiny_(1921_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.000000e+00,0.000000,1.0
3,tt0015163,the navigator,the navigator,0,2,1924,59.0,9652.0,1,The_Navigator_(1924_film),...,0.0,0.0,0.0,0.0,"""$385,000""",3.850000e+05,"""$680,406""",6.804060e+05,76.728831,0.0
4,tt0016220,the phantom of the opera,the phantom of the opera,0,5,1925,93.0,17887.0,1,The_Phantom_of_the_Opera_(1925_film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$2 million\n$1 million (1929 Sound Re-release)""",2.100000e+07,0.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7954,tt9625664,trauma center,,0,2,2019,87.0,12951.0,0,Trauma_Center_(film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$92,968[1]""",9.296800e+04,0.000000,1.0
7955,tt9741310,slaxx,slaxx,0,1,2020,77.0,2464.0,0,Slaxx,...,0.0,0.0,0.0,0.0,,0.000000e+00,,0.000000e+00,0.000000,1.0
7956,tt9742392,kindred,kindred,0,1,2020,101.0,1719.0,0,Kindred_(film),...,0.0,0.0,0.0,0.0,,0.000000e+00,"""$8,921[2][3]""",8.921000e+03,0.000000,1.0
7957,tt9850386,the bee gees how can you mend a broken heart,,0,10,2020,111.0,4144.0,1,,...,0.0,0.1,0.0,0.1,,2.903410e+06,,6.160590e+07,1676.811717,0.0
