# Imports & Settings


In [1]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import pickle
from pytorch_tabnet.tab_model import TabNetClassifier

# Lift pandas' display truncation limits so wide frames, long cell values and
# long sequences are rendered in full.
for _display_option in (
    "display.max_columns",
    "display.max_colwidth",
    "display.max_rows",
    "display.max_seq_items",
):
    pd.set_option(_display_option, None)

In [2]:
# Every prepared CSV is indexed by its `id` column; the `type` column is read
# explicitly as `str` so pandas does not have to infer a dtype for it.
_read_options = {'index_col': 'id', 'dtype': {'type': str}}
prep = pd.read_csv('data/prep.csv', **_read_options)
test_prep = pd.read_csv('data/test_prepared.csv', **_read_options)
train_prep = pd.read_csv('data/train_prepared.csv', **_read_options)

# combine models via hard voting


In [3]:
# Restore the pre-trained tabular classifiers that take part in the ensemble.
xgb = XGBClassifier()
xgb.load_model('models/xg/xgboost_full.json')

# NOTE(review): pickle.load can execute arbitrary code stored in the file;
# only load model files that come from a trusted source.
# The context manager closes the file handle, which a bare open() call leaks.
with open('./models/rf/first_try', 'rb') as rf_file:
    rf = pickle.load(rf_file)

nn = TabNetClassifier()
nn.load_model('models/nn/first_try.zip')
# emb: the embedding-based model (`boost_emb`) is loaded in a later cell.



In [4]:
# Work on an independent copy so the loaded training frame stays untouched.
data = train_prep.copy(deep=True)

In [5]:
# Separate the target column from the feature columns.
features = data.drop(columns='type')
labels = data['type']
# XGBoost cannot handle string labels, so map them to integer class ids.
label_encoder = LabelEncoder().fit(labels)
labels = label_encoder.transform(labels)

In [6]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=0,
)

In [7]:
# One row per held-out sample: its id, the encoded true label and the
# prediction of each tabular model.
results = pd.DataFrame({
    'id': X_test.index,
    'type': y_test,
    'rf': rf.predict(X_test),
    # the TabNet model is given the underlying ndarray instead of the DataFrame
    'nn': nn.predict(X_test.values),
    'xg': xgb.predict(X_test),
})



In [8]:
# Key the results by sample id so rows can be matched against other frames by id.
results = results.set_index('id')

In [11]:
# Start every row at -2, the sentinel for "the keyword filter found no match".
results['words'] = -2

In [12]:
# Start every row at -1, the sentinel for "no embedding-model prediction".
results['emb'] = -1

In [13]:
# Load the XGBoost model that is applied to the `curie_similarity` features below.
boost_emb = XGBClassifier()
boost_emb.load_model('models/nlp/xgboost.json')

df = pd.read_csv('data/train_curie.csv', index_col='id', dtype={'type': str})
# NOTE(review): eval() executes whatever expression is stored in each CSV cell;
# this is only safe while data/train_curie.csv is a trusted, locally generated file.
# Each parsed cell is converted to a numpy array (presumably one embedding
# vector per row — TODO confirm).
df['curie_similarity'] = df.curie_similarity.apply(eval).apply(np.array)

# NOTE: `features`, `labels`, `label_encoder`, `X_train`, `y_train` and `y_test`
# are rebound from here on and replace the values created for the tabular models.
features = list(df.curie_similarity.values)
labels = df.type
# at least xgboost cannot deal with string labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(labels)
labels = label_encoder.transform(labels)
# Same test_size and seed as the tabular split.
X_train, emb_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0)


In [14]:
# Predict for every row of `df` (the full `features` list), not only for the held-out split.
pred = boost_emb.predict(features)

In [15]:
# Keep each prediction next to its row so it can be looked up by id.
df['pred'] = pred

In [16]:
# Copy the embedding-model prediction into `results` for every id that is
# present in both frames; ids missing from `df` keep their current `emb` value.
# Writing through `.loc[row, column]` updates `results` itself. The former
# `results.loc[index].emb = ...` assigned to a temporary row object, which
# pandas does not guarantee to write back (and never does under copy-on-write).
for index, item in df.iterrows():
    if index in results.index:
        results.loc[index, 'emb'] = item.pred
        

In [17]:
# English type names searched for by the keyword filter.
# Order matters: `filtering` returns the last entry that matches.
types = [
    'photo', 'photo negative', 'photographic negative, photographic negative',
    'photographic material', 'digital image',
    'archaeological find',
    'graphics', 'drawing', 'design/drawing/sketch', 'caricature', 'slide',
    'poster', 'plan', 'paper', 'notes', 'document', 'certificate',
    'medal', 'coin', 'label/sign',
    'manuscript', 'script, song/vocal music', 'music sheet', 'musical instrument',
    'manuscript, musical composition', 'manuscript, sheet music',
    'postcard', 'photo, postcard', 'letter, postcard',
    'letter', 'letter of honor/honorary address',
    'seal', 'seal/imprint',
    'printed notes', 'small print',
    'packaging', 'crate/box',
    'audio recording', 'telegram',
    'invitation', 'calendar',
    'book', 'magazines', 'album', 'newspaper', 'folder/booklet',
    'country',
    'bag', 'suit', 'doll', 'sheet/linen', 'dish/vessel', 'jewel', 'tape/ribbon',
    'sculpture',
]

In [18]:
def filtering(text, candidates=None):
    """Guess a type name from free text by substring matching.

    Parameters
    ----------
    text : str
        Text to search. Any non-string value (e.g. the NaN pandas uses for a
        missing cell, or None) is treated as "no match" instead of raising.
    candidates : iterable of str, optional
        Type names to look for as substrings of ``text``. Defaults to the
        notebook-level ``types`` list.

    Returns
    -------
    str or int
        'design/drawing/sketch' whenever the text contains 'drawing', 'sketch'
        or 'design'; otherwise the last candidate (in iteration order) that
        occurs in ``text``; ``-2`` when nothing matches.
    """
    if not isinstance(text, str):
        return -2
    if candidates is None:
        candidates = types
    # This rule was always appended last, so it wins over any candidate match.
    if ('drawing' in text) or ('sketch' in text) or ('design' in text):
        return 'design/drawing/sketch'
    matched = -2
    for candidate in candidates:
        if candidate in text:
            # keep overwriting: the last matching candidate is the result
            matched = candidate
    return matched


In [19]:
# Run the keyword filter over each row's text; rows without a match get -2.
df['filter'] = df['text_features'].apply(filtering)

In [28]:
# Lookup table of type names. Its `english` column is matched against the
# keyword-filter output, and the row index of the match is used as the code
# (presumably the same numbering the label encoder produces — TODO confirm).
type_lookup = pd.read_csv('data/type_lookup.csv')

In [31]:
# Translate the keyword-filter output (an English type name) into the row
# index of that name in `type_lookup`.
# The name -> index table is built once instead of scanning `type_lookup`
# for every row; `setdefault` keeps the first row when a name is listed more
# than once, like the former `.index[0]` lookup did.
# Non-string values (the -2 "no match" sentinel, or codes that were already
# translated) pass through unchanged, so re-running this cell is harmless.
# A name that is missing from `type_lookup` raises KeyError.
_english_to_code = {}
for _code, _english in zip(type_lookup.index, type_lookup['english']):
    _english_to_code.setdefault(_english, _code)
df['filter'] = df['filter'].apply(lambda x: _english_to_code[x] if isinstance(x, str) else x)

In [34]:
# Copy the keyword-filter code into `results` for every id that is present in
# both frames; ids missing from `df` keep their current `words` value.
# Writing through `.loc[row, column]` updates `results` itself. The former
# `results.loc[index].words = ...` assigned to a temporary row object, which
# pandas does not guarantee to write back (and never does under copy-on-write).
for index, item in df.iterrows():
    if index in results.index:
        results.loc[index, 'words'] = item['filter']

In [45]:
# Preview the first rows to sanity-check the collected per-model predictions.
results.head()

Unnamed: 0_level_0,type,rf,nn,xg,prediction,words,emb
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2080460,37,37,37,37,37,-2,37
2773190,37,37,37,37,37,-2,-1
3195806,11,11,11,11,11,11,11
3671158,12,12,12,12,12,-2,12
4092912,43,43,43,43,43,-2,43


# evaluate

In [43]:
from statistics import mode
def vote(predictions):
    """Hard-vote over per-model predictions, ignoring "no prediction" sentinels.

    Parameters
    ----------
    predictions : iterable
        Class ids predicted by the individual models. ``-1`` and ``-2`` mark
        a model that produced no prediction; they take no part in the vote.
        The argument is not modified.

    Returns
    -------
    The most common remaining class id, as chosen by ``statistics.mode``.

    Raises
    ------
    statistics.StatisticsError
        If nothing is left to vote on after the sentinels are dropped.
    """
    # A filtered copy drops every sentinel occurrence (list.remove only took
    # out the first one) and leaves the caller's list untouched.
    valid_votes = [p for p in predictions if p != -1 and p != -2]
    return mode(valid_votes)

In [57]:
def _ensemble_vote(row):
    """Vote over one result row; only xg, rf, words and emb take part (nn is not included)."""
    return vote([row['xg'], row['rf'], row['words'], row['emb']])

# One ensemble prediction per held-out sample.
results['prediction'] = results.apply(_ensemble_vote, axis=1)

In [58]:
# Share of held-out rows whose voted prediction equals the encoded true label.
# 0.9038095238095238 was noted here before; presumably the score of an
# earlier configuration (TODO confirm).
accuracy_score(results['type'], results['prediction'])

0.9157142857142857