In [4]:
import duckdb

# some DuckDB setup
con = duckdb.connect(database=':memory:')
# enable automatic query parallelization
con.execute("PRAGMA threads=2")
# enable caching of parquet metadata
con.execute("PRAGMA enable_object_cache")

<duckdb.duckdb.DuckDBPyConnection at 0x10491c8f0>

In [7]:
train_data = "'../data/train-[1-8].parquet'"

# 1. Create train and test split from the beginning for comparability

In [35]:
film_ids = con.execute(f"""
    select distinct tconst,
        label
    from {train_data}
    """).df()

In [36]:
from sklearn.model_selection import train_test_split

# Split training data in train and test split

ids = film_ids.drop('label', axis=1).values
labels = film_ids.loc[:, 'label'].values

# Small test size because cross-validation (and want to train on more data)
X_train_ids, X_test_ids, y_train, y_test = train_test_split(ids, labels, test_size=0.10, random_state=42, stratify=labels)


In [37]:
import pandas as pd
X_train_ids  = pd.DataFrame(X_train_ids, columns=['tconst'])
X_test_ids = pd.DataFrame(X_test_ids, columns=['label'])

X_train_ids.to_parquet("X_train_ids.parquet")
X_test_ids.to_parquet("X_test_ids.parquet")


In [56]:
ids_train = "'X_train_ids.parquet'"
ids_test = "'X_test_ids.parquet'"

# 2. Establish Baseline

In [94]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

### Dataset as it is

Dropping NaN in train set

In [87]:
# Load all the data into the duck database
baseline_data_train = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
 select tconst, 
        try_cast(startYear as integer) as year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_train})

""").df()

In [88]:
# Load all the data into the duck database
baseline_data_test = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
    select tconst, 
        try_cast(startYear as integer) as year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_test})

""").df()

In [95]:
baseline_data_train_noNAN = baseline_data_train.dropna()

In [96]:
X_train = baseline_data_train_noNAN.drop(['tconst',  'label'], axis=1).values
y_train = baseline_data_train.loc[:, 'label'].values

In [97]:
X_test = baseline_data_test.drop(['tconst',  'label'], axis=1).values
y_test = baseline_data_test.loc[:, 'label'].values

In [98]:
# Preprocessing pipeline
preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Base model pipelines
models = {
    'RandomForest': Pipeline(steps=[('preprocessing', preprocessing), ('classifier', RandomForestClassifier())])
}

hyperparameters = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [1, 3, 5, 7, 10],
    }
}

best_models = {}
for model_name, model_pipeline in models.items():
    # Create a GridSearchCV object
    grid_search = GridSearchCV(model_pipeline, hyperparameters[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print the best parameters and the best score
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")

    # Use the best estimator to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Test if adjusting the threshold helps improve the accuracy (knowing dataset is balanced)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:,1]
    threshold = np.median(y_proba)
    y_pred_th = (y_proba >= threshold).astype(int)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% {accuracy_score(y_test, y_pred_th) * 100:.2f}%")

    # Print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

    # See how balanced the predictions are
    print (y_train.mean(), y_test.mean(), y_pred.mean(), threshold)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 150}
RandomForest best score: 0.7367
Accuracy: 79.09% 78.16%
Confusion Matrix:
[[3322  647]
 [1017 2973]]
Classification Report:
              precision    recall  f1-score   support

       False       0.77      0.84      0.80      3969
        True       0.82      0.75      0.78      3990

    accuracy                           0.79      7959
   macro avg       0.79      0.79      0.79      7959
weighted avg       0.79      0.79      0.79      7959

0.5060220233998624 0.5013192612137203 0.4548310089207187 0.43257028107990225


Keeping NaN and replacing and imputing them

In [100]:
X_train = baseline_data_train.drop(['tconst',  'label'], axis=1).values
y_train = baseline_data_train.loc[:, 'label'].values

In [101]:
X_test = baseline_data_test.drop(['tconst',  'label'], axis=1).values
y_test = baseline_data_test.loc[:, 'label'].values

In [102]:
# Preprocessing pipeline
preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Base model pipelines
models = {
    'RandomForest': Pipeline(steps=[('preprocessing', preprocessing), ('classifier', RandomForestClassifier())])
}

hyperparameters = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [1, 3, 5, 7, 10],
    }
}

best_models = {}
for model_name, model_pipeline in models.items():
    # Create a GridSearchCV object
    grid_search = GridSearchCV(model_pipeline, hyperparameters[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print the best parameters and the best score
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")

    # Use the best estimator to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Test if adjusting the threshold helps improve the accuracy (knowing dataset is balanced)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:,1]
    threshold = np.median(y_proba)
    y_pred_th = (y_proba >= threshold).astype(int)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% {accuracy_score(y_test, y_pred_th) * 100:.2f}%")

    # Print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

    # See how balanced the predictions are
    print (y_train.mean(), y_test.mean(), y_pred.mean(), threshold)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
RandomForest best score: 0.7244
Accuracy: 79.51% 78.49%
Confusion Matrix:
[[3424  545]
 [1086 2904]]
Classification Report:
              precision    recall  f1-score   support

       False       0.76      0.86      0.81      3969
        True       0.84      0.73      0.78      3990

    accuracy                           0.80      7959
   macro avg       0.80      0.80      0.79      7959
weighted avg       0.80      0.80      0.79      7959

0.5013262599469496 0.5013192612137203 0.433345897725845 0.41678257671050867


## Fixing the year issue

In [105]:
# Load all the data into the duck database
train_fixed_year = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
 select tconst, 
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_train})

""").df()

In [104]:
# Load all the data into the duck database
test_fixed_year = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
    select tconst, 
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_test})

""").df()

In [106]:
X_train = train_fixed_year.drop(['tconst',  'label'], axis=1).values
y_train = train_fixed_year.loc[:, 'label'].values

In [107]:
X_test = test_fixed_year.drop(['tconst',  'label'], axis=1).values
y_test = test_fixed_year.loc[:, 'label'].values

In [108]:
# Preprocessing pipeline
preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Base model pipelines
models = {
    'RandomForest': Pipeline(steps=[('preprocessing', preprocessing), ('classifier', RandomForestClassifier())])
}

hyperparameters = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [1, 3, 5, 7, 10],
    }
}

best_models = {}
for model_name, model_pipeline in models.items():
    # Create a GridSearchCV object
    grid_search = GridSearchCV(model_pipeline, hyperparameters[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print the best parameters and the best score
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")

    # Use the best estimator to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Test if adjusting the threshold helps improve the accuracy (knowing dataset is balanced)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:,1]
    threshold = np.median(y_proba)
    y_pred_th = (y_proba >= threshold).astype(int)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% {accuracy_score(y_test, y_pred_th) * 100:.2f}%")

    # Print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

    # See how balanced the predictions are
    print (y_train.mean(), y_test.mean(), y_pred.mean(), threshold)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 100}
RandomForest best score: 0.7283
Accuracy: 80.37% 79.47%
Confusion Matrix:
[[3462  507]
 [1055 2935]]
Classification Report:
              precision    recall  f1-score   support

       False       0.77      0.87      0.82      3969
        True       0.85      0.74      0.79      3990

    accuracy                           0.80      7959
   macro avg       0.81      0.80      0.80      7959
weighted avg       0.81      0.80      0.80      7959

0.5013262599469496 0.5013192612137203 0.4324663902500314 0.4144205803399021


# Adding additional features

In [114]:
# Combine all films listed in the writer/director files with the year of the film 
# (from training, validation and test data).
movie_year = con.execute('''
    SELECT tconst,
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
    FROM '../data/train-[1-8].parquet'
    UNION
    SELECT tconst,
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
    FROM '../data/test_hidden.parquet'
    UNION
    SELECT tconst,
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS Year,
    FROM '../data/validation_hidden.parquet'
    ORDER BY Year, tconst
''').df()

Unnamed: 0,tconst,Year
0,tt0003740,1914
1,tt0008663,1917
2,tt0009369,1918
3,tt0010307,1919
4,tt0010600,1919
...,...,...
9995,tt9766294,2021
9996,tt9769668,2021
9997,tt9784798,2021
9998,tt9808510,2021


In [116]:
# Calculate the number of films worked on previously by the writers and directors
# Includes the current film

experience = con.execute('''
    with writers as (
        select *
        from '../data/writing.parquet'
    ), 
    directors as (
        select * 
        from '../data/directing.parquet'
    
    )
    SELECT tconst, Year, writer, director,
    COUNT(DISTINCT tconst) OVER(PARTITION BY writer ORDER BY Year, tconst) AS writer_experience,
    COUNT(DISTINCT tconst) OVER(PARTITION BY director ORDER BY Year, tconst) AS director_experience,
    FROM movie_year my
    LEFT JOIN writers ON writers.movie == my.tconst
    LEFT JOIN directors ON directors.movie == my.tconst
    ORDER BY Year, tconst
''').df()

Unnamed: 0,tconst,Year,writer,director,writer_experience,director_experience
0,tt0003740,1914,nm0195339,nm0665163,1,1
1,tt0003740,1914,nm0665163,nm0665163,1,1
2,tt0003740,1914,nm0758215,nm0665163,1,1
3,tt0003740,1914,nm0515385,nm0665163,1,1
4,tt0008663,1917,nm0803705,nm0803705,1,1
...,...,...,...,...,...,...
27883,tt9784798,2021,nm0077768,nm3489851,1,1
27884,tt9808510,2021,nm9925241,nm8904180,1,1
27885,tt9808510,2021,nm8904180,nm8904180,1,1
27886,tt9808510,2021,nm10260663,nm8904180,1,1


In [123]:
# Load all the data into the duck database
train_features = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
 select tconst, 
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_train})

""").df()

In [124]:
# Load all the data into the duck database
test_features = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
    select tconst, 
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_test})

""").df()

In [130]:
# Merge movie data with the experience level of the writers and directors.
# For each film, calculate the number of writers/directors, average experience and total experience

train_all_features = con.execute('''
    SELECT 
      ANY_VALUE(td.tconst) AS tconst, 
      ANY_VALUE(td.Year) AS Year, 
      ANY_VALUE(runtime) AS runtimeMinutes,
      ANY_VALUE(ForeignFilm) AS ForeignFilm, 
      ANY_VALUE(n_words) AS n_words, 
      ANY_VALUE(votes) AS numVotes,
      COUNT(DISTINCT writer) AS n_writers,
      AVG(DISTINCT e.writer_experience) AS avgexp_writers,
      SUM(DISTINCT e.writer_experience) AS totexp_writers,
      COUNT(DISTINCT director) AS n_directors,
      AVG(DISTINCT e.director_experience) AS avgexp_directors,
      SUM(DISTINCT e.director_experience) AS totexp_directors,
      ANY_VALUE(label) AS label
    FROM train_features td
    LEFT JOIN experience e ON e.tconst == td.tconst
    GROUP BY td.tconst
''').df()

In [131]:
# Merge movie data with the experience level of the writers and directors.
# For each film, calculate the number of writers/directors, average experience and total experience

test_all_features = con.execute('''
    SELECT 
      ANY_VALUE(td.tconst) AS tconst, 
      ANY_VALUE(td.Year) AS Year, 
      ANY_VALUE(runtime) AS runtimeMinutes,
      ANY_VALUE(ForeignFilm) AS ForeignFilm, 
      ANY_VALUE(n_words) AS n_words, 
      ANY_VALUE(votes) AS numVotes,
      COUNT(DISTINCT writer) AS n_writers,
      AVG(DISTINCT e.writer_experience) AS avgexp_writers,
      SUM(DISTINCT e.writer_experience) AS totexp_writers,
      COUNT(DISTINCT director) AS n_directors,
      AVG(DISTINCT e.director_experience) AS avgexp_directors,
      SUM(DISTINCT e.director_experience) AS totexp_directors,
      ANY_VALUE(label) AS label
    FROM test_features td
    LEFT JOIN experience e ON e.tconst == td.tconst
    GROUP BY td.tconst
''').df()

In [135]:
X_train = train_all_features.drop(['tconst',  'label'], axis=1).values
y_train = train_all_features.loc[:, 'label'].values

In [136]:
X_test = test_all_features.drop(['tconst',  'label'], axis=1).values
y_test = test_all_features.loc[:, 'label'].values

In [137]:
# Preprocessing pipeline
preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Base model pipelines
models = {
    'RandomForest': Pipeline(steps=[('preprocessing', preprocessing), ('classifier', RandomForestClassifier())])
}

hyperparameters = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [1, 3, 5, 7, 10],
    }
}

best_models = {}
for model_name, model_pipeline in models.items():
    # Create a GridSearchCV object
    grid_search = GridSearchCV(model_pipeline, hyperparameters[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print the best parameters and the best score
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")

    # Use the best estimator to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Test if adjusting the threshold helps improve the accuracy (knowing dataset is balanced)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:,1]
    threshold = np.median(y_proba)
    y_pred_th = (y_proba >= threshold).astype(int)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% {accuracy_score(y_test, y_pred_th) * 100:.2f}%")

    # Print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

    # See how balanced the predictions are
    print (y_train.mean(), y_test.mean(), y_pred.mean(), threshold)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 200}
RandomForest best score: 0.7190
Accuracy: 81.38% 80.58%
Confusion Matrix:
[[3473  496]
 [ 986 3004]]
Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.88      0.82      3969
        True       0.86      0.75      0.80      3990

    accuracy                           0.81      7959
   macro avg       0.82      0.81      0.81      7959
weighted avg       0.82      0.81      0.81      7959

0.5013262599469496 0.5013192612137203 0.4397537379067722 0.4174755437366656


**All features**

In [138]:
# Load all the data into the duck database
train_features = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
 select tconst, 
    -- Clean up the movie title text. Remove excess whitespace, convert to lowercase, convert non-ascii to ascii equivalent, 
        -- remove everything that is non-alpanumeric or a space.
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(primaryTitle)), 'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS pTitle,
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(originalTitle)),'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS oTitle,
        
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_train})

""").df()

In [139]:
# Load all the data into the duck database
test_features = con.execute(f"""
    with train as (
        select * 
        from {train_data}
    )
    select tconst, 
       -- Clean up the movie title text. Remove excess whitespace, convert to lowercase, convert non-ascii to ascii equivalent, 
        -- remove everything that is non-alpanumeric or a space.
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(primaryTitle)), 'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS pTitle,
        REGEXP_REPLACE(TRANSLATE(LOWER(TRIM(originalTitle)),'áàãäåæßçéèêíîïñòóôöøớúûüý','aaaaaabceeeiiinoooooouuuy'),'[^a-zA-Z0-9 ]','','g') AS oTitle,
        
        -- Merge start year and end year into single column
        CASE
            WHEN startYear LIKE '%N%' THEN endYear
            ELSE startYear
        END AS year,
        try_cast(runtimeMinutes as integer) as runtime, 
        try_cast(numVotes as integer) as votes,
        CASE
           WHEN originalTitle IS NULL THEN 0
           ELSE 1
        END AS ForeignFilm,
        -- Count number of words in title
        LENGTH(primaryTitle) - LENGTH(REPLACE(primaryTitle, ' ', '')) + 1 AS n_words,
        label
    from train
    where train.tconst in (select tconst from {ids_test})

""").df()

In [140]:
# Merge movie data with the experience level of the writers and directors.
# For each film, calculate the number of writers/directors, average experience and total experience

train_fixed_title = con.execute('''
    SELECT 
    
      ANY_VALUE(td.tconst) AS tconst, 
      ANY_VALUE(td.Year) AS Year, 
      ANY_VALUE(runtime) AS runtimeMinutes,
      ANY_VALUE(ForeignFilm) AS ForeignFilm, 
      ANY_VALUE(n_words) AS n_words, 
      ANY_VALUE(votes) AS numVotes,
      COUNT(DISTINCT writer) AS n_writers,
      AVG(DISTINCT e.writer_experience) AS avgexp_writers,
      SUM(DISTINCT e.writer_experience) AS totexp_writers,
      COUNT(DISTINCT director) AS n_directors,
      AVG(DISTINCT e.director_experience) AS avgexp_directors,
      SUM(DISTINCT e.director_experience) AS totexp_directors,
      ANY_VALUE(label) AS label
    FROM train_features td
    LEFT JOIN experience e ON e.tconst == td.tconst
    GROUP BY td.tconst
''').df()

In [141]:
# Merge movie data with the experience level of the writers and directors.
# For each film, calculate the number of writers/directors, average experience and total experience

test_fixed_title = con.execute('''
    SELECT 
      ANY_VALUE(td.tconst) AS tconst, 
      ANY_VALUE(td.Year) AS Year, 
      ANY_VALUE(runtime) AS runtimeMinutes,
      ANY_VALUE(ForeignFilm) AS ForeignFilm, 
      ANY_VALUE(n_words) AS n_words, 
      ANY_VALUE(votes) AS numVotes,
      COUNT(DISTINCT writer) AS n_writers,
      AVG(DISTINCT e.writer_experience) AS avgexp_writers,
      SUM(DISTINCT e.writer_experience) AS totexp_writers,
      COUNT(DISTINCT director) AS n_directors,
      AVG(DISTINCT e.director_experience) AS avgexp_directors,
      SUM(DISTINCT e.director_experience) AS totexp_directors,
      ANY_VALUE(label) AS label
    FROM test_features td
    LEFT JOIN experience e ON e.tconst == td.tconst
    GROUP BY td.tconst
''').df()

In [142]:
X_train = train_fixed_title.drop(['tconst',  'label'], axis=1).values
y_train = train_fixed_title.loc[:, 'label'].values

In [143]:
X_test = test_fixed_title.drop(['tconst',  'label'], axis=1).values
y_test = test_fixed_title.loc[:, 'label'].values

In [144]:
# Preprocessing pipeline
preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Base model pipelines
models = {
    'RandomForest': Pipeline(steps=[('preprocessing', preprocessing), ('classifier', RandomForestClassifier())])
}

hyperparameters = {
    'RandomForest': {
        'classifier__n_estimators': [50, 100, 150, 200],
        'classifier__max_depth': [1, 3, 5, 7, 10],
    }
}

best_models = {}
for model_name, model_pipeline in models.items():
    # Create a GridSearchCV object
    grid_search = GridSearchCV(model_pipeline, hyperparameters[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)

    # Fit the model
    grid_search.fit(X_train, y_train)

    # Store the best model
    best_models[model_name] = grid_search.best_estimator_

    # Print the best parameters and the best score
    print(f"{model_name} best parameters: {grid_search.best_params_}")
    print(f"{model_name} best score: {grid_search.best_score_:.4f}")

    # Use the best estimator to make predictions
    y_pred = grid_search.best_estimator_.predict(X_test)

    # Test if adjusting the threshold helps improve the accuracy (knowing dataset is balanced)
    y_proba = grid_search.best_estimator_.predict_proba(X_test)[:,1]
    threshold = np.median(y_proba)
    y_pred_th = (y_proba >= threshold).astype(int)

    # Calculate and print the accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}% {accuracy_score(y_test, y_pred_th) * 100:.2f}%")

    # Print the confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:")
    print(conf_matrix)

    # Print the classification report
    class_report = classification_report(y_test, y_pred)
    print("Classification Report:")
    print(class_report)

    # See how balanced the predictions are
    print (y_train.mean(), y_test.mean(), y_pred.mean(), threshold)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
RandomForest best parameters: {'classifier__max_depth': 10, 'classifier__n_estimators': 150}
RandomForest best score: 0.7247
Accuracy: 81.45% 80.55%
Confusion Matrix:
[[3478  491]
 [ 985 3005]]
Classification Report:
              precision    recall  f1-score   support

       False       0.78      0.88      0.82      3969
        True       0.86      0.75      0.80      3990

    accuracy                           0.81      7959
   macro avg       0.82      0.81      0.81      7959
weighted avg       0.82      0.81      0.81      7959

0.5013262599469496 0.5013192612137203 0.43925116220630733 0.41210108169594517
