In [17]:
import duckdb

# some DuckDB setup
con = duckdb.connect(database=':memory:')
# enable automatic query parallelization
con.execute("PRAGMA threads=2")
# enable caching of parquet metadata
con.execute("PRAGMA enable_object_cache")

<duckdb.duckdb.DuckDBPyConnection at 0x7f016006fa70>

In [27]:
import numpy as np

# Load the provided project data into the duck database
all_training_data = con.execute('''
    SELECT 
        tconst,
        
        -- Clean up the movie title text. Remove excess whitespace, convert to lowercase, convert non-ascii to ascii equivalent, 
        -- remove everything that is non-alpanumeric or a space.
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(primaryTitle)), 'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS pTitle,
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(originalTitle)),'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS oTitle,
        
        -- Flag for indicating changed_title
        CASE
            WHEN pTitle == oTitle OR oTitle is NULL THEN 0
            ELSE 1
        END AS changed_title,
        
        -- Count number of words in title
        LENGTH(pTitle) - LENGTH(REPLACE(pTitle, ' ', '')) + 1 AS n_words,
        
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
        runtimeMinutes,
        numVotes,
        
        -- Convert True/False to 1/0
        CAST(label AS INT) as label
    FROM 'data/train-[1-8].csv'
''').df()

# Replace empty (\N) with nans
all_training_data = all_training_data.replace('\\N', np.nan)

all_training_data

Unnamed: 0,tconst,pTitle,oTitle,changed_title,n_words,Year,runtimeMinutes,numVotes,label
0,tt0010600,the doll,die puppe,1,2,1919,66,1898.0,1
1,tt0011841,way down east,way down east,0,3,1920,145,5376.0,1
2,tt0012494,destiny,der mude tod,1,1,1921,97,5842.0,1
3,tt0015163,the navigator,the navigator,0,2,1924,59,9652.0,1
4,tt0016220,the phantom of the opera,the phantom of the opera,0,5,1925,93,17887.0,1
...,...,...,...,...,...,...,...,...,...
7954,tt9625664,trauma center,,0,2,2019,87,12951.0,0
7955,tt9741310,slaxx,slaxx,0,1,2020,77,2464.0,0
7956,tt9742392,kindred,kindred,0,1,2020,101,1719.0,0
7957,tt9850386,the bee gees how can you mend a broken heart,,0,10,2020,111,4144.0,1


In [10]:
import os
import json
import pyarrow as pa
import pyarrow.csv
import pyarrow.json
import pyarrow.parquet as pq

project_dir = os.getcwd()
data_dir = project_dir + "/data"

In [11]:
directing_filepath = project_dir + "/data/directing.json"
with open(directing_filepath, 'r') as json_file:
    json_data = json.load(json_file)
movies = json_data['movie'].values()
directors = json_data['director'].values()
movies = pa.array(movies)
directors = pa.array(directors)
names = ['movie', 'director']
table = pa.Table.from_arrays([movies, directors], names=names)

In [12]:
pq.write_table(table, project_dir+"/data_parquet/directing.parquet")

In [13]:
directing_filepath = project_dir + "/data/writing.json"
with open(directing_filepath, 'r') as json_file:
    json_data = json.load(json_file)
movies = [entry['movie'] for entry in json_data]
writers = [entry['writer'] for entry in json_data]
movies = pa.array(movies)
writers = pa.array(writers)
names = ['movie', 'writer']
table = pa.Table.from_arrays([movies, writers], names=names)
pq.write_table(table, project_dir + "/data_parquet/writing.parquet")

In [14]:
directing = "'data_parquet/directing.parquet'"
writing = "'data_parquet/writing.parquet'"

In [19]:
query = f"""
        select 
            movie
            , director
        from {directing}
"""

In [44]:
table_directing = con.execute(query).df()
table_directing

Unnamed: 0,movie,writer
0,tt0003740,nm0195339
1,tt0003740,nm0515385
2,tt0003740,nm0665163
3,tt0003740,nm0758215
4,tt0008663,nm0406585
...,...,...
22423,tt9904802,nm0942647
22424,tt9904802,nm3853396
22425,tt9911196,nm2063122
22426,tt9911196,nm0277932


In [23]:
query = f"""
        select 
            movie
            , writer
        from {writing}
"""

In [46]:
table_writing = con.execute(query).df()
table_writing

Unnamed: 0,movie,writer
0,tt0003740,nm0195339
1,tt0003740,nm0515385
2,tt0003740,nm0665163
3,tt0003740,nm0758215
4,tt0008663,nm0406585
...,...,...
22423,tt9904802,nm0942647
22424,tt9904802,nm3853396
22425,tt9911196,nm2063122
22426,tt9911196,nm0277932


### Identify the key and check if there is any duplicated key (tconst) of all_training_data

In [32]:
import pandas as pd

all_training_data['is_tconst_duplicate'] = all_training_data.duplicated('tconst', keep=False)
print(all_training_data[all_training_data['is_tconst_duplicate']==True])

Empty DataFrame
Columns: [tconst, pTitle, oTitle, changed_title, n_words, Year, runtimeMinutes, numVotes, label, is_tconst_duplicate]
Index: []


What to do if there is a duplicated tconst?

In [23]:
#!pip install scipy



### Outlier detection - quantitative - numeric features

In [38]:
from scipy.stats import trim_mean

numeric_columns = ['Year', 'runtimeMinutes', 'numVotes']
for column in numeric_columns:
    all_training_data[column] = pd.to_numeric(all_training_data[column])
    temp_cleaned = all_training_data.dropna(subset=[column])
    
    trimmed_mean_year_start = trim_mean(temp_cleaned[column], proportiontocut=0)
    trimmed_mean_year_end = trim_mean(temp_cleaned[column], proportiontocut=0.01)
    median_year = temp_cleaned[column].median()
    if ((trimmed_mean_year_start / trimmed_mean_year_end - 1) > 0.01):
        print("Potential outlier in ", column)
        # Inspection using human-in-the-loop
        print(temp_cleaned[column].sort_values().head(int(len(temp_cleaned[column])*0.005)))
        print(temp_cleaned[column].sort_values().tail(int(len(temp_cleaned[column])*0.005)))
        

Potential outlier in  numVotes
7772    1001.0
5589    1001.0
642     1001.0
4864    1001.0
5598    1002.0
693     1002.0
619     1002.0
70      1002.0
5551    1002.0
131     1003.0
1412    1003.0
2183    1003.0
7367    1003.0
5402    1003.0
607     1004.0
6323    1004.0
5060    1005.0
5459    1005.0
4873    1006.0
6769    1006.0
7465    1006.0
2540    1006.0
3901    1006.0
3327    1006.0
918     1007.0
4551    1007.0
59      1007.0
6077    1007.0
7739    1007.0
3795    1007.0
2272    1008.0
7327    1008.0
7093    1009.0
6180    1009.0
1852    1009.0
Name: numVotes, dtype: float64
6268     773595.0
4692     776642.0
4334     781136.0
2463     787353.0
4146     788117.0
6441     794019.0
530      822486.0
2144     826119.0
3085     833476.0
4223     919997.0
2239     937386.0
1607     940920.0
804      970988.0
394      993387.0
3238     996577.0
4274    1002936.0
5888    1106539.0
3280    1107772.0
677     1107891.0
7169    1127305.0
7463    1167269.0
367     1180579.0
3543    1215368.0

The mean and trimmed-mean (trim 1% of the data) of Year and runtimeMinutes columns are very close, so no outlier for these two columns

Since the mean and trimmed-mean of numVotes column is large, we look at the top 0.5% and bottom 0.5% of this column to see if there is any outlier

It doesn't seem there is any outlier in this numVotes column

### Outlier detection - qualitative - text features

### Missing handling imputation using supervised ML

The missing data is MAR (missing at random - missingness depends on the other observed features). we use RandomForest to fill in the missing values.

In [56]:
all_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tconst               7959 non-null   object 
 1   pTitle               7959 non-null   object 
 2   oTitle               3971 non-null   object 
 3   changed_title        7959 non-null   int32  
 4   n_words              7959 non-null   int64  
 5   Year                 7959 non-null   int64  
 6   runtimeMinutes       7946 non-null   float64
 7   numVotes             7169 non-null   float64
 8   label                7959 non-null   int32  
 9   is_tconst_duplicate  7959 non-null   bool   
dtypes: bool(1), float64(2), int32(2), int64(2), object(3)
memory usage: 505.3+ KB


runtimeMinutes and numVotes columns need to be imputed with missing data

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

input_columns = ['n_words', 'changed_title', 'Year']
output_columns = ['runtimeMinutes','numVotes']

for column in output_columns:
    df_with_feature = all_training_data.dropna(subset=[column])
    df_missing_feature = all_training_data[all_training_data[column].isnull()].drop(column, axis=1)

    X = df_with_feature[input_columns].dropna()  # Features
    y = df_with_feature.loc[X.index, column]  # Target variable

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the Random Forest Regressor
    rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_regressor.fit(X_train, y_train)
    # Predict 'numVotes' for the missing values
    X_missing = df_missing_feature[input_columns].dropna()  # Ensure no missing features
    predicted_values = rf_regressor.predict(X_missing)
    df_missing_feature.loc[X_missing.index, column] = predicted_values

    # Combine the datasets back together
    all_training_data = pd.concat([df_with_feature, df_missing_feature], sort=False)


In [85]:
all_training_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7959 entries, 0 to 2581
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tconst               7959 non-null   object 
 1   pTitle               7959 non-null   object 
 2   oTitle               3971 non-null   object 
 3   changed_title        7959 non-null   int32  
 4   n_words              7959 non-null   int64  
 5   Year                 7959 non-null   int64  
 6   runtimeMinutes       7959 non-null   float64
 7   numVotes             7959 non-null   float64
 8   label                7959 non-null   int32  
 9   is_tconst_duplicate  7959 non-null   bool   
dtypes: bool(1), float64(2), int32(2), int64(2), object(3)
memory usage: 567.4+ KB


In [86]:
all_training_data

Unnamed: 0,tconst,pTitle,oTitle,changed_title,n_words,Year,runtimeMinutes,numVotes,label,is_tconst_duplicate
0,tt0010600,the doll,die puppe,1,2,1919,66.00000,1898.000000,1,False
1,tt0011841,way down east,way down east,0,3,1920,145.00000,5376.000000,1,False
2,tt0012494,destiny,der mude tod,1,1,1921,97.00000,5842.000000,1,False
3,tt0015163,the navigator,the navigator,0,2,1924,59.00000,9652.000000,1,False
4,tt0016220,the phantom of the opera,the phantom of the opera,0,5,1925,93.00000,17887.000000,1,False
...,...,...,...,...,...,...,...,...,...,...
7915,tt8017136,tony my mentor the serial killer,tony,1,6,2018,124.00000,5880.247968,1,False
7935,tt8671462,invoking 5,invoking 5,0,2,2018,90.00000,28610.448397,0,False
7936,tt8694228,mikhael,,0,1,2019,150.00000,38691.361520,0,False
7943,tt9110170,wrong turn,wrong turn,0,2,2021,109.00000,6588.851825,0,False
