In [45]:
from readers.readers import DuckData
from unidecode import unidecode
from duckdb.typing import *
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from pathlib import Path

# 1. Convert to parquet for faster handling with DuckDb

In [2]:
duck = DuckData()
all_training_data = duck.all_training_data

# 2. Cleaning

In [3]:
# enable automatic query parallelization
duck.conn.execute("PRAGMA threads=2")
# enable caching of parquet metadata
duck.conn.execute("PRAGMA enable_object_cache")

<duckdb.duckdb.DuckDBPyConnection at 0x13ffbe9b0>

In [4]:
def convert_to_ascii(input_string):
    return unidecode(input_string)

In [5]:
duck.conn.create_function('to_ascii', convert_to_ascii, [VARCHAR], VARCHAR)

<duckdb.duckdb.DuckDBPyConnection at 0x13ffbe9b0>

In [6]:
clean_data = duck.conn.query(
    """
    select 
            try_cast(tconst as varchar) as tconst
            , try_cast(primaryTitle as varchar) as primaryTitle
            , try_cast(originalTitle as varchar) as originalTitle
            , try_cast(runtimeMinutes as integer) as runtimeMinutes
            , try_cast(numVotes as integer) as numVotes
            , to_ascii(primaryTitle) as title
            , try_cast(startYear as integer) as yearStart, 
            try_cast(endYear as integer) as yearEnd,
            case 
                when yearStart is not null and yearEnd is null then yearStart
                when yearStart is NULL and yearEnd is not NULL then yearEnd
                else yearStart
                end as year,
                label 
                , case
                    when director = '\\N' then null
                    else try_cast(director as varchar)
                    end as director
                , case
                    when writer = '\\N' then null
                    else try_cast(writer as varchar)
                    end as writer
            ,case
                when title != originalTitle and originalTitle != '' then True
                else False
            end as foreign_movie
            , case 
                when label = True then 1
                else 0
                end as label_int
        from all_training_data
    """
)

In [7]:
director_rating = duck.conn.query( 
        """
        select 
            director, 
            count(distinct tconst) as n_successes_dir
        from clean_data
        where label=True
        group by director
        """
    )

In [8]:
writer_rating  = duck.conn.query(
       """
        select writer, 
        count(distinct tconst) as n_successes_wri
        from clean_data
        where label=True
        group by writer
       """
    )

In [9]:
aggregated = duck.conn.query(
    """ 
    select 
        tconst
        ,title 
        ,year
        , foreign_movie
        , runtimeMinutes
        , numVotes
        ,listagg(distinct clean_data.director, ', ') as directors
        ,listagg(distinct clean_data.writer, ', ') as writers
        ,len(directors) as n_directors
        ,len(writers) as n_writers
        , len(title) as title_length
        ,label
        , sum(n_successes_dir) as n_successes_dir
        , sum(n_successes_wri) as n_successes_wri
    from clean_data
    left outer join director_rating on clean_data.director = director_rating.director
    left outer join writer_rating on clean_data.writer = writer_rating.writer
    group by tconst, title, year, foreign_movie, label, runtimeMinutes, numVotes 
"""
)

In [10]:
aggregated.df()

Unnamed: 0,tconst,title,year,foreign_movie,runtimeMinutes,numVotes,directors,writers,n_directors,n_writers,title_length,label,n_successes_dir,n_successes_wri
0,tt0010600,The Doll,1919,True,66.0,1898.0,nm0523932,"nm0006782, nm0523932, nm0932559, nm0473134",9.0,42.0,8,True,32.0,7.0
1,tt0011607,The Parson's Widow,1920,True,94.0,1264.0,nm0003433,"nm0418114, nm0003433",9.0,20.0,18,True,6.0,4.0
2,tt0014109,The Saga of Gosta Berling,1924,False,183.0,1231.0,nm0830249,"nm0481248, nm0405147, nm0830249",9.0,31.0,25,True,3.0,3.0
3,tt0014358,The Pilgrim,1923,False,47.0,4891.0,nm0000122,nm0000122,9.0,9.0,11,True,6.0,7.0
4,tt0014945,Girl Shy,1924,False,87.0,3327.0,"nm0628345, nm0853130","nm0928514, nm0924065, nm0516001, nm0853130, nm...",20.0,53.0,8,True,30.0,26.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7954,tt6774212,Aiyaary,2018,False,157.0,4552.0,nm3109770,nm3109770,9.0,9.0,7,False,,1.0
7955,tt6067752,Lucifer,2019,False,175.0,9086.0,nm1335387,nm1729382,9.0,9.0,7,True,1.0,1.0
7956,tt6472976,Five Feet Apart,2019,False,116.0,57187.0,nm1682573,"nm3037068, nm5657522",9.0,20.0,15,True,4.0,2.0
7957,tt0041386,The Fountainhead,1949,False,114.0,,nm0896542,nm0709446,9.0,9.0,16,True,7.0,2.0


In [11]:
non_agg = clean_data.df()

# 3. Get train set

In [12]:
train_data = duck.conn.query(
    f"""
    select 
        tconst
        , year
        , foreign_movie
        , runtimeMinutes
        , numVotes
        , title_length
        , label
        , case 
            when n_successes_dir is null then 0
            else n_successes_dir
            end as n_successes_dir
        , case 
            when n_successes_wri is null then 0
            else n_successes_wri
            end as n_successes_wri
    from aggregated
"""
).df()

In [13]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7959 entries, 0 to 7958
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           7959 non-null   object 
 1   year             7959 non-null   int32  
 2   foreign_movie    7959 non-null   bool   
 3   runtimeMinutes   7946 non-null   float64
 4   numVotes         7169 non-null   float64
 5   title_length     7959 non-null   int64  
 6   label            7959 non-null   bool   
 7   n_successes_dir  7959 non-null   float64
 8   n_successes_wri  7959 non-null   float64
dtypes: bool(2), float64(4), int32(1), int64(1), object(1)
memory usage: 419.8+ KB


In [14]:
df_train_data = train_data.dropna()

In [15]:
X = df_train_data.drop(['tconst', 'label'], axis=1).values
y = df_train_data.loc[:, 'label'].values

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "Decision Tree",
    "Random Forest",
    "AdaBoost",
    "Logistic Regression",
]

In [18]:
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025, random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    AdaBoostClassifier(algorithm="SAMME", random_state=42),
    LogisticRegression(random_state=0),
]

In [19]:
datasets = [(X, y)]

In [20]:
# preprocess dataset, split into training and test part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

for name, clf in zip(names, classifiers):
    print("Training ", name)

    clf = make_pipeline(StandardScaler(), clf)
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(score)


Training  Nearest Neighbors
0.77645826056584
Training  Linear SVM
0.7963674467341949
Training  Decision Tree
0.9406217254628012
Training  Random Forest
0.9353824659448131
Training  AdaBoost
0.9406217254628012
Training  Logistic Regression
0.8281522878099895


In [21]:
clf = AdaBoostClassifier(algorithm="SAMME", random_state=42)
clf = make_pipeline(StandardScaler(), clf)
clf.fit(X_train, y_train)

# Predictions

In [26]:
validation = duck.validation_hidden_file
test = duck.testing_file
directing = duck.directing
writing = duck.writing

In [28]:
clean_test_data = duck.conn.query(
    f"""
        select 
            try_cast(tconst as varchar) as tconst
            , try_cast(primaryTitle as varchar) as primaryTitle
            , try_cast(originalTitle as varchar) as originalTitle
            , try_cast(runtimeMinutes as integer) as runtimeMinutes
            , try_cast(numVotes as integer) as numVotes
            , to_ascii(primaryTitle) as title
            , try_cast(startYear as integer) as yearStart, 
            try_cast(endYear as integer) as yearEnd,
            case 
                when yearStart is not null and yearEnd is null then yearStart
                when yearStart is NULL and yearEnd is not NULL then yearEnd
                else yearStart
                end as year
                , case
                    when director = '\\N' then null
                    else try_cast(director as varchar)
                    end as director
                , case
                    when writer = '\\N' then null
                    else try_cast(writer as varchar)
                    end as writer
            ,case
                when title != originalTitle and originalTitle != '' then True
                else False
            end as foreign_movie
        from test
        left outer join directing
        on test.tconst = directing.movie
        left outer join writing
        on test.tconst = writing.movie
"""
)

In [29]:
aggregated_test = duck.conn.query(
    """
    select 
            tconst
            ,title 
            ,year
            , foreign_movie
            , runtimeMinutes
            , numVotes
            ,listagg(distinct clean_test_data.director, ', ') as directors
            ,listagg(distinct clean_test_data.writer, ', ') as writers
            ,len(directors) as n_directors
            ,len(writers) as n_writers
            , len(title) as title_length
            --,label
            , sum(n_successes_dir) as n_successes_dir
            , sum(n_successes_wri) as n_successes_wri
        from clean_test_data
        left outer join director_rating on clean_test_data.director = director_rating.director
        left outer join writer_rating on clean_test_data.writer = writer_rating.writer
        group by tconst, title, year, foreign_movie, runtimeMinutes, numVotes
    """
)

In [41]:
test_data = duck.conn.query(
    """
    select 
        tconst
        , year
        , foreign_movie
        , runtimeMinutes
        , numVotes
        , title_length
        , case 
            when n_successes_dir is null then 0
            else n_successes_dir
            end as n_successes_dir
        , case 
            when n_successes_wri is null then 0
            else n_successes_wri
            end as n_successes_wri
    from aggregated_test
    """
)

In [42]:
test_data = test_data.df()
test_data

Unnamed: 0,tconst,year,foreign_movie,runtimeMinutes,numVotes,title_length,n_successes_dir,n_successes_wri
0,tt0050381,1958,False,104.0,2022.0,8,0.0,26.0
1,tt0051020,1958,False,75.0,1135.0,29,0.0,0.0
2,tt0053643,1960,False,105.0,1509.0,13,0.0,6.0
3,tt0054734,1961,False,98.0,1221.0,27,0.0,9.0
4,tt0055093,1961,False,90.0,6239.0,4,0.0,0.0
...,...,...,...,...,...,...,...,...
1081,tt2795078,2013,False,125.0,,15,1.0,1.0
1082,tt5849148,2016,False,124.0,1329.0,4,2.0,1.0
1083,tt7485048,2019,False,154.0,27714.0,8,1.0,2.0
1084,tt0036891,1944,False,101.0,4612.0,24,2.0,7.0


**imput missing values**

In [33]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1086 entries, 0 to 1085
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tconst           1086 non-null   object 
 1   year             1086 non-null   int32  
 2   foreign_movie    1086 non-null   bool   
 3   runtimeMinutes   1085 non-null   float64
 4   numVotes         967 non-null    float64
 5   title_length     1086 non-null   int64  
 6   n_successes_dir  1086 non-null   float64
 7   n_successes_wri  1086 non-null   float64
dtypes: bool(1), float64(4), int32(1), int64(1), object(1)
memory usage: 56.3+ KB


In [35]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

In [36]:
X_test = test_data.drop('tconst', axis=1).values

In [37]:
X_test = imp_mean.fit_transform(X_test)

In [38]:
y_pred = clf.predict(X_test)

In [39]:
y_pred

array([False, False, False, ..., False,  True,  True])

In [44]:
predictions_folder = Path(".").resolve() / "predictions"

In [46]:
Y = pd.DataFrame(y_pred)
Y.to_csv( predictions_folder / "test_hidden_martin.txt", sep="\t", index=False)

In [47]:
Y = pd.DataFrame(y_pred)
Y.to_csv(predictions_folder / "validation_hidden_martin.txt", sep="\t", index=False)