# Imports & Settings


In [2]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb
from xgboost import XGBClassifier
# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# Lift every pandas display limit (columns, column width, rows, sequence items)
# by setting each option to None, so frames are rendered in full.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)

In [3]:
# Force the 'type' column to str on read. The original note says this avoids
# dtype confusion on large data sets (presumably pandas' mixed-dtype inference
# -- TODO confirm).
TYPE_AS_STR = {'type': str}
# Shared keyword arguments for the files that are indexed by their 'id' column.
ID_INDEXED = {'index_col': 'id', 'dtype': TYPE_AS_STR}

# Raw competition data.
train = pd.read_csv('data/train.csv', **ID_INDEXED)
test = pd.read_csv('data/test.csv', **ID_INDEXED)

# Translated variants.
# NOTE(review): train_translated is the only 'id'-style file read without
# index_col='id' -- confirm this is intended.
train_translated = pd.read_csv('data/train_translated.csv', dtype=TYPE_AS_STR)
test_translated = pd.read_csv('data/test_translated.csv', **ID_INDEXED)

# Combined train/test frames at different translation stages.
combined_data = pd.read_csv('data/combined_data.csv', **ID_INDEXED)
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', **ID_INDEXED)
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', **ID_INDEXED)

# Prepared (feature-engineered) frames.
prep = pd.read_csv('data/prep.csv', **ID_INDEXED)
test_prep = pd.read_csv('data/test_prepared.csv', **ID_INDEXED)
train_prep = pd.read_csv('data/train_prepared.csv', **ID_INDEXED)

# Resampled training frames are read with pandas defaults (no index column,
# inferred dtypes).
train_prepROS = pd.read_csv('data/train_prepROS.csv')
train_prepSMOTE = pd.read_csv('data/train_prepSMOTE.csv')


# setup data for classifier

In [4]:
# Start from train_prepSMOTE and discard the 'Unnamed: 0' column; drop() returns
# a new frame, so train_prepSMOTE itself is left untouched.
data = train_prepSMOTE.drop(columns=['Unnamed: 0'])

# Everything except 'type' is a feature; 'type' is the target.
features = data.drop(columns=['type'])
labels = data['type']
# Labels are used exactly as stored. An explicit LabelEncoder step would only be
# needed for string labels (xgboost, at least, cannot deal with those).

# Hold out 30% of the rows for validation with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

# setup rf classifier

In [8]:
# Hyperparameters shared by the evaluation fit and the final fit.
rf_params = dict(
    n_estimators=1000,
    max_depth=2000,
    max_features=None,
    min_samples_leaf=2,
    min_samples_split=2,
    random_state=42,
)

# Evaluate with a model that has only seen the training split. Fitting on
# `features, labels` (all rows) and then scoring on X_test would leak the
# validation rows into training and make val_acc meaningless.
eval_rfc = RandomForestClassifier(**rf_params)
eval_rfc.fit(X_train, y_train)

val_acc = accuracy_score(y_test, eval_rfc.predict(X_test))
train_acc = accuracy_score(y_train, eval_rfc.predict(X_train))

print(train_acc, val_acc)
# For reference: a sweep run with this same configuration, fit on X_train only,
# logged 0.9823210962612079 0.9385237813301757.

# Final model: refit on all labelled rows. `rfc` is the object the later cells
# pickle and use for the submission, so it keeps its full-data meaning.
rfc = RandomForestClassifier(**rf_params)
rfc.fit(features, labels)

# submission (taken from xgboost)

In [None]:
# Predict the class of every row in the prepared test set (its 'type' column is
# dropped so only feature columns are passed to the model).
test_set = test_prep.drop('type', axis=1)
results = rfc.predict(test_set)  # predict once and reuse below
submission = pd.DataFrame({'id': test_set.index, 'type': results})

# Map predicted class ids (type_lookup.id) to their labels (type_lookup.estonian).
# The replacement is restricted to the 'type' column: DataFrame.replace on the
# whole frame would also rewrite any 'id' value that equals a lookup id.
type_lookup = pd.read_csv('data/type_lookup.csv')
submission = submission.assign(
    type=submission['type'].replace(
        type_lookup.id.to_list(),
        type_lookup.estonian.to_list(),
    )
)
submission.to_csv('submissions/submission_rf_full_SMOTE.csv', index=False)

# sweep

In [5]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

# Candidate values per hyperparameter. The keys are the names read back from
# wandb.config inside main().
search_space = {
    'split': [2, 5, 7, 10, 12, 15, 20],
    'depth': [3, 6, 10, 50, 100, 500, 1000, 2000],
    'leaf': [2],
    'estimators': [100, 200, 350, 500, 1000, 1500, 2000],
    'features': [None],
}

# Bayesian sweep that maximises the logged 'val_acc' metric.
sweep_configuration = {
    'method': 'bayes',
    'name': 'sweep',
    'metric': {'goal': 'maximize', 'name': 'val_acc'},
    # wandb expects every discrete parameter as {'values': [...]}.
    'parameters': {
        param_name: {'values': candidates}
        for param_name, candidates in search_space.items()
    },
}

project = 'rf_balanced'
# Register the sweep under `project`; the returned id is what the agent runs.
sweep_id = wandb.sweep(sweep=sweep_configuration, project=project)

def main():
    """Train and score one random forest for a single sweep run.

    Hyperparameters come from ``wandb.config`` (split, depth, leaf, estimators,
    features). The forest is fit on the module-level ``X_train``/``y_train``
    and scored on both the training split and ``X_test``/``y_test``. Both
    accuracies are printed and logged to wandb as ``train_acc`` and ``val_acc``.
    """
    run = wandb.init(project=project)

    # Values are chosen by the sweep controller, not hard-coded here.
    cfg = wandb.config

    forest = RandomForestClassifier(
        n_estimators=cfg.estimators,
        max_depth=cfg.depth,
        min_samples_leaf=cfg.leaf,
        max_features=cfg.features,
        min_samples_split=cfg.split,
        random_state=42,
    )
    forest.fit(X_train, y_train)

    # Accuracy on the held-out split, then on the split the forest was fit on.
    val_acc = accuracy_score(y_test, forest.predict(X_test))
    train_acc = accuracy_score(y_train, forest.predict(X_train))

    print(train_acc, val_acc)

    metrics = {
      'train_acc': train_acc,
      'val_acc': val_acc,
    }
    wandb.log(metrics)

# Start sweep job. `count` caps the number of runs: without it the agent keeps
# requesting new configurations until it is interrupted by hand, which makes a
# Restart & Run All of this notebook impossible to complete.
wandb.agent(sweep_id, function=main, count=30)

Create sweep with ID: 1apybvok
Sweep URL: https://wandb.ai/museum-item-classification/rf_balanced/sweeps/1apybvok


[34m[1mwandb[0m: Agent Starting Run: 76ap91h4 with config:
[34m[1mwandb[0m: 	depth: 10
[34m[1mwandb[0m: 	estimators: 500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 5
[34m[1mwandb[0m: Currently logged in as: [33mtillwenke[0m ([33mmuseum-item-classification[0m). Use [1m`wandb login --relogin`[0m to force relogin


0.40843342919979697 0.392342609038879


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.40843
val_acc,0.39234


[34m[1mwandb[0m: Agent Starting Run: e9l8998q with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 2


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668609616681353, max=1.0…

0.9823210962612079 0.9385237813301757


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.98232
val_acc,0.93852


[34m[1mwandb[0m: Agent Starting Run: 6cn3xnel with config:
[34m[1mwandb[0m: 	depth: 1000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 2


0.9823210962612079 0.9383264258930334


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.98232
val_acc,0.93833


[34m[1mwandb[0m: Agent Starting Run: mc2jbhfe with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 10


0.9732701742513957 0.9337872508387606


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97327
val_acc,0.93379


[34m[1mwandb[0m: Agent Starting Run: p4ukul5h with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 5


0.9805870411097953 0.9379317150187487


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.98059
val_acc,0.93793


[34m[1mwandb[0m: Agent Starting Run: 25bczs4k with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 2


0.9822788022331247 0.9379317150187487


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.98228
val_acc,0.93793


[34m[1mwandb[0m: Agent Starting Run: 9wdvow12 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9577482659448486 0.9236234458259325


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95775
val_acc,0.92362


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jwsztkjv with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 12


0.9701404161732363 0.9319123741859088


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97014
val_acc,0.93191


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: fv4bxbgh with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.957832854001015 0.9238208012630748


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95783
val_acc,0.92382


[34m[1mwandb[0m: Agent Starting Run: 86lqmfvu with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.957832854001015 0.9238208012630748


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95783
val_acc,0.92382


[34m[1mwandb[0m: Agent Starting Run: u3v7cpzj with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9577482659448486 0.9236234458259325


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95775
val_acc,0.92362


[34m[1mwandb[0m: Agent Starting Run: m571vnr4 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9577482659448486 0.9236234458259325


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95775
val_acc,0.92362


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lqfktg0j with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 5


0.9806716291659617 0.9374383264258931


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.98067
val_acc,0.93744


[34m[1mwandb[0m: Agent Starting Run: dkdovcb1 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9580443241414313 0.9233274126702191


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95804
val_acc,0.92333


[34m[1mwandb[0m: Agent Starting Run: ukv8csl4 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 15


0.9653188969717476 0.9287546871916321


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.96532
val_acc,0.92875


[34m[1mwandb[0m: Agent Starting Run: zcucxrtd with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 10


0.9732278802233124 0.934083283994474


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97323
val_acc,0.93408


[34m[1mwandb[0m: Agent Starting Run: 37rp1xmk with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 12


0.9701404161732363 0.9319123741859088


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97014
val_acc,0.93191


[34m[1mwandb[0m: Agent Starting Run: pfz49rwr with config:
[34m[1mwandb[0m: 	depth: 1000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.957832854001015 0.9238208012630748


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95783
val_acc,0.92382


[34m[1mwandb[0m: Agent Starting Run: swkzybgf with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 12


0.9701404161732363 0.9319123741859088


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97014
val_acc,0.93191


[34m[1mwandb[0m: Agent Starting Run: 7e9hka0j with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 7


0.9779648113686348 0.9368462601144661


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97796
val_acc,0.93685


[34m[1mwandb[0m: Agent Starting Run: y8997b08 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9579597360852647 0.9227353463587922


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95796
val_acc,0.92274


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2wfuxh84 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9577482659448486 0.9236234458259325


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95775
val_acc,0.92362


[34m[1mwandb[0m: Agent Starting Run: ru4mrfmx with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 12


0.9701404161732363 0.9316163410301954


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97014
val_acc,0.93162


[34m[1mwandb[0m: Agent Starting Run: g1gylhcd with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 10


0.9732278802233124 0.934083283994474


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97323
val_acc,0.93408


[34m[1mwandb[0m: Agent Starting Run: chepr2t5 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.9577482659448486 0.9236234458259325


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95775
val_acc,0.92362


[34m[1mwandb[0m: Agent Starting Run: 2tgem958 with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 10


0.9732278802233124 0.934083283994474


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.97323
val_acc,0.93408


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 9a3zipkb with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.957832854001015 0.9238208012630748


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95783
val_acc,0.92382


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6zws8jvf with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 1500
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


0.957832854001015 0.9238208012630748


0,1
train_acc,▁
val_acc,▁

0,1
train_acc,0.95783
val_acc,0.92382


[34m[1mwandb[0m: Agent Starting Run: 0j8i1tay with config:
[34m[1mwandb[0m: 	depth: 2000
[34m[1mwandb[0m: 	estimators: 2000
[34m[1mwandb[0m: 	features: None
[34m[1mwandb[0m: 	leaf: 2
[34m[1mwandb[0m: 	split: 20


[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


# save model

In [5]:
import pickle

# Persist the fitted forest. The context manager guarantees the file is flushed
# and closed even if pickling fails.
with open('./models/rf/rf_0.7_SMOTE_best_sweep_no_crossval', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# NOTE(review): this loads a different file than the one written above, and
# `loaded_model` is not used by the following cells (they predict with `rfc`)
# -- confirm which model the submission is meant to use.
# pickle.load can execute arbitrary code; only load model files you trust.
with open('./models/rf/rf_full_SMOTE', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# submission from model

In [6]:
# Feature matrix for the submission: the prepared test frame without 'type'.
test_set = test_prep.drop(columns=['type'])

In [7]:
# Predicted class for every row of the test feature matrix.
results = rfc.predict(X=test_set)

In [8]:
# Pair each test id with its prediction. Reuses `results` from the previous
# cell instead of running the forest over the whole test set a second time.
submission = pd.DataFrame({'id': test_set.index, 'type': results})

In [9]:
# Lookup table whose 'id' and 'estonian' columns are used to translate the
# predictions in the next cell.
type_lookup = pd.read_csv(filepath_or_buffer='data/type_lookup.csv')

In [10]:
# Translate predicted class ids (type_lookup.id) into their labels
# (type_lookup.estonian). Only the 'type' column is translated: calling
# DataFrame.replace on the whole frame would also rewrite every 'id' value that
# happens to equal a lookup id, corrupting the submission's id column.
submission = submission.assign(
    type=submission['type'].replace(
        type_lookup.id.to_list(),
        type_lookup.estonian.to_list(),
    )
)

In [11]:
# Write the submission without the frame's positional index. This is the same
# output path the earlier submission cell writes to, so that file is overwritten.
submission.to_csv(path_or_buf='submissions/submission_rf_full_SMOTE.csv', index=False)