# Imports & Settings


In [1]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from deep_translator import GoogleTranslator
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier

# Remove every pandas display limit so that columns, cell contents, rows and
# long sequences are shown in full instead of being truncated.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)

In [2]:
# Every file is read with the 'type' column forced to str, so pandas does not
# have to infer that column's dtype.
_type_as_str = {'type': str}
# Options shared by all files that use their 'id' column as the row index.
_indexed_by_id = {'index_col': 'id', 'dtype': _type_as_str}

train = pd.read_csv('data/train.csv', **_indexed_by_id)
test = pd.read_csv('data/test.csv', **_indexed_by_id)
# NOTE(review): train_translated is the only frame read without index_col='id' — confirm this is intentional.
train_translated = pd.read_csv('data/train_translated.csv', dtype=_type_as_str)
test_translated = pd.read_csv('data/test_translated.csv', **_indexed_by_id)
combined_data = pd.read_csv('data/combined_data.csv', **_indexed_by_id)
combined_data_translated = pd.read_csv('data/combined_data_translated.csv', **_indexed_by_id)
combined_data_fully_translated = pd.read_csv('data/combined_data_fully_translated.csv', **_indexed_by_id)
prep = pd.read_csv('data/prep.csv', **_indexed_by_id)
test_prep = pd.read_csv('data/test_prepared.csv', **_indexed_by_id)
train_prep = pd.read_csv('data/train_prepared.csv', **_indexed_by_id)

# XGBoost


In [3]:
import xgboost as xgb
from xgboost import XGBClassifier
# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [4]:
# Work on an independent (deep) copy so that train_prep itself stays unchanged.
data = train_prep.copy(deep=True)

In [5]:
# Split the frame into the target column and the remaining feature columns.
labels = data['type']
features = data.drop(columns='type')
# XGBoost cannot work with string labels, so encode each class name as an
# integer; the fitted encoder stays available as `label_encoder`.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

In [6]:
# Hold out only a 0.0001 fraction of the rows, so practically all data is used
# for fitting; random_state=0 makes the split reproducible.
# NOTE(review): a hold-out this small cannot give a meaningful validation score — confirm it is intentional.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.0001,
    random_state=0,
)

In [7]:
# Fit an XGBoost classifier (default hyper-parameters, seeded with 0) on the
# training split; fit() returns the estimator itself, so it can be chained.
bst = XGBClassifier(random_state=0).fit(X_train, y_train)
# Predicted class codes for the held-out rows.
preds = bst.predict(X_test)

In [8]:
# Fraction of held-out rows whose predicted class code equals the true one.
val_acc = accuracy_score(y_true=y_test, y_pred=preds)
val_acc

0.9007142857142857

In [11]:
# Persist the fitted classifier to a JSON file.
# NOTE(review): presumably the 'models/xg/' directory has to exist beforehand — confirm.
bst.save_model('models/xg/xgboost_full.json')

# submission from model

In [86]:
# Build the submission file: predict a class for every row of the prepared test
# set and write the row ids together with the Estonian class names.
test_set = test_prep.drop('type', axis=1)
# Predict once and reuse the array (the prediction used to be computed twice).
results = bst.predict(test_set)
submission = pd.DataFrame({'id': test_set.index, 'type': results})
# Lookup table with one row per class; its 'id' and 'estonian' columns are used here.
# NOTE(review): data/type_lookup.csv is written by the class-name-mapping cells
# further down in this notebook, so it must already exist when this cell runs.
type_lookup = pd.read_csv('data/type_lookup.csv')
id_to_estonian = dict(zip(type_lookup.id, type_lookup.estonian))
# Translate the 'type' column only: a frame-wide replace would also rewrite any
# test id in the 'id' column that happens to equal a class id.
# Predicted codes without an entry in the lookup table become NaN.
submission['type'] = submission['type'].map(id_to_estonian)
submission.to_csv('data/submission_xgboost_full.csv', index=False)

# class name mapping

In [29]:
# Distinct values of the 'type' column of train, in order of first appearance
# (presumably the original Estonian class names — confirm).
estonian = train['type'].unique()

In [30]:
# Distinct values of the 'type' column of train_prep, in order of first
# appearance (presumably the translated English class names — confirm).
english = train_prep['type'].unique()

In [48]:
# Pair the two name arrays position by position, one row per class.
# NOTE(review): assumes both arrays list the same classes in the same order — confirm.
type_lookup = pd.DataFrame(dict(estonian=estonian, english=english))

In [49]:
# Order the rows by the english name.
# NOTE(review): presumably so that row positions line up with the class codes
# produced by the label encoder — confirm.
type_lookup = type_lookup.sort_values('english')

In [50]:
# Renumber the sorted rows 0..n-1 and expose that position as the 'id' column.
type_lookup = type_lookup.reset_index(drop=True)
type_lookup = type_lookup.assign(id=type_lookup.index)

In [52]:
# Write the lookup table to disk. With the default to_csv settings the row index
# is written as an extra unnamed first column, which here repeats the 'id' values.
type_lookup.to_csv('data/type_lookup.csv')