# Imports & Settings


In [2]:
# Imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re 
from math import isnan
import wandb
import random
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
# utilities
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import pickle
from pytorch_tabnet.tab_model import TabNetClassifier

# Lift every pandas display limit so wide frames and long cell values are shown in full.
# Caution: with max_rows unset, displaying a large frame prints all of its rows.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None
pd.options.display.max_rows = None
pd.options.display.max_seq_items = None

In [3]:
# Every table is indexed by `id`. The `type` column is read explicitly as str
# (original note: this prevents it being confused with a data type on large data sets).
csv_options = dict(index_col='id', dtype={'type': str})

prep = pd.read_csv('data/prep.csv', **csv_options)
test_prep = pd.read_csv('data/test_prepared.csv', **csv_options)
train_prep = pd.read_csv('data/train_prepared.csv', **csv_options)
train_curie = pd.read_csv('data/train_curie.csv', **csv_options)
test_curie = pd.read_csv('data/test_curie.csv', **csv_options)

# `curie_similarity` is stored as text in the CSV; evaluate it back into a Python
# object and wrap it in a numpy array.
# SECURITY: eval() runs whatever expression the file contains - only load trusted CSVs.
for curie_frame in (train_curie, test_curie):
    curie_frame['curie_similarity'] = curie_frame.curie_similarity.apply(eval).apply(np.array)

In [4]:
# Lookup table between type encodings; later cells read its `english`, `id` and
# `estonian` columns as well as its row index.
type_lookup = pd.read_csv('data/type_lookup.csv')

In [5]:
def _load_indicators(path):
    """Parse an indicator file into ``{type name: [indicator, ...]}``.

    On each line the text between the first two single quotes is the type
    name, and the text after the second quote (up to the next quote, if any)
    is split on whitespace into that type's indicator words. Lines without a
    quoted name, such as blank lines, are skipped instead of raising
    IndexError. A type that occurs more than once keeps its last line.
    """
    table = {}
    with open(path, 'r') as handle:
        for line in handle:
            parts = line.split('\'')
            if len(parts) < 3:
                continue
            table[parts[1]] = parts[2].split()
    return table


# Parsing lives in a helper so that no module-level variable named `type`
# is left behind to shadow the builtin for the rest of the notebook.
type_indicators = _load_indicators('data/type_ind_cut.txt')
save_indicators = _load_indicators('data/save_indicator.txt')

In [6]:
# First candidate ordering of the English type names. Order matters for the
# matchers below (the last matching entry wins); this list is kept for
# reference and is not the one bound to `types`.
types1 = [
    'photo', 'photo negative', 'photographic negative, photographic negative',
    'photographic material', 'digital image',
    'archaeological find',
    'graphics', 'drawing', 'design/drawing/sketch', 'caricature', 'slide',
    'poster', 'plan', 'paper', 'notes', 'document', 'certificate',
    'medal', 'coin', 'label/sign',
    'manuscript', 'script, song/vocal music', 'music sheet', 'musical instrument',
    'manuscript, musical composition', 'manuscript, sheet music',
    'postcard', 'photo, postcard', 'letter, postcard',
    'letter', 'letter of honor/honorary address',
    'seal', 'seal/imprint',
    'printed notes', 'small print',
    'packaging', 'crate/box',
    'audio recording', 'telegram',
    'invitation', 'calendar',
    'book', 'magazines', 'album', 'newspaper', 'folder/booklet',
    'country',
    'bag', 'suit', 'doll', 'sheet/linen', 'dish/vessel', 'jewel', 'tape/ribbon',
    'sculpture',
]

In [7]:
# categorize types
# Second ordering of the same English type names. The matchers return the
# last entry that matches, so names further down this list take priority.
types2 = [
    'sculpture',
    'bag', 'suit', 'doll', 'sheet/linen', 'dish/vessel', 'jewel', 'tape/ribbon',
    'country',
    'paper', 'book', 'magazines', 'album', 'newspaper', 'folder/booklet',
    'invitation', 'calendar',
    'audio recording', 'telegram',
    'packaging', 'crate/box',
    'printed notes', 'small print',
    'seal', 'seal/imprint',
    'letter', 'letter of honor/honorary address',
    'postcard', 'photo, postcard', 'letter, postcard',
    'manuscript', 'script, song/vocal music', 'music sheet', 'musical instrument',
    'manuscript, musical composition', 'manuscript, sheet music',
    'medal', 'coin', 'label/sign',
    'poster', 'plan', 'notes', 'document', 'certificate',
    'graphics', 'drawing', 'design/drawing/sketch', 'caricature', 'slide',
    'archaeological find',
    'photo', 'photo negative', 'photographic negative, photographic negative',
    'photographic material', 'digital image',
]

In [8]:
# Active priority order for the text matchers: they iterate `types` in order
# and return the last entry that matched, so later entries win.
types = types2

In [9]:
def filtering(text, candidates=None):
    """Return the highest-priority type whose English name occurs in ``text``.

    Parameters
    ----------
    text : str
        Free text to scan. ``None`` or a float (pandas reads empty CSV cells
        as float NaN) yields ``-1`` instead of raising TypeError.
    candidates : sequence of str, optional
        Type names in ascending priority. Defaults to the module-level
        ``types`` list, looked up at call time.

    Returns
    -------
    str or int
        ``'design/drawing/sketch'`` when ``text`` contains 'drawing', 'sketch'
        or 'design'; otherwise the last candidate that is a substring of
        ``text``; ``-1`` when nothing matches.
    """
    if text is None or isinstance(text, float):
        return -1
    # Generic drawing vocabulary overrides every literal type-name match.
    if any(keyword in text for keyword in ('drawing', 'sketch', 'design')):
        return 'design/drawing/sketch'
    if candidates is None:
        candidates = types
    # Only the last match is ever used, so scan from the back and stop early.
    for candidate in reversed(list(candidates)):
        if candidate in text:
            return candidate
    return -1


In [10]:
def indicating(text, candidates=None, indicator_map=None):
    """Return the highest-priority type one of whose indicator words occurs in ``text``.

    Parameters
    ----------
    text : str
        Free text to scan. ``None`` or a float (pandas reads empty CSV cells
        as float NaN) yields ``-1`` instead of raising TypeError.
    candidates : sequence of str, optional
        Type names in ascending priority. Defaults to the module-level
        ``types`` list, looked up at call time.
    indicator_map : dict, optional
        Maps a type name to its indicator words. Defaults to the module-level
        ``type_indicators``. A candidate without an entry is skipped rather
        than raising KeyError, as ``save_indicating`` already does.

    Returns
    -------
    str or int
        The last candidate with at least one indicator that is a substring of
        ``text``, or ``-1`` when there is none.
    """
    if text is None or isinstance(text, float):
        return -1
    if candidates is None:
        candidates = types
    if indicator_map is None:
        indicator_map = type_indicators
    # Only the last match is ever used, so scan from the back and stop early.
    for candidate in reversed(list(candidates)):
        if any(indicator in text for indicator in indicator_map.get(candidate, ())):
            return candidate
    return -1

In [11]:
def save_indicating(text, candidates=None, indicator_map=None):
    """Return the highest-priority type matched by the ``save_indicators`` word lists.

    Parameters
    ----------
    text : str
        Free text to scan. ``None`` or a float (pandas reads empty CSV cells
        as float NaN) yields ``-1`` instead of raising TypeError.
    candidates : sequence of str, optional
        Type names in ascending priority. Defaults to the module-level
        ``types`` list, looked up at call time.
    indicator_map : dict, optional
        Maps a type name to its indicator words. Defaults to the module-level
        ``save_indicators``. Candidates without an entry are skipped.

    Returns
    -------
    str or int
        The last candidate with at least one indicator that is a substring of
        ``text``, or ``-1`` when there is none.
    """
    if text is None or isinstance(text, float):
        return -1
    if candidates is None:
        candidates = types
    if indicator_map is None:
        indicator_map = save_indicators
    # Only the last match is ever used, so scan from the back and stop early.
    for candidate in reversed(list(candidates)):
        if any(indicator in text for indicator in indicator_map.get(candidate, ())):
            return candidate
    return -1

# combine models via hard voting


In [12]:
# XGBoost model applied to the prepared tabular features.
# Other checkpoints: 'models/xg/xgboost_0.3.json', 'models/xg/full_smote100.json', 'models/xg/03_smote100.json'
xgb = XGBClassifier()
xgb.load_model('models/xg/xgboost_full.json')

# Pickled random-forest model applied to the same tabular features.
# Other checkpoints: './models/rf/train_prep_03', './models/rf/train_prep_full', './models/rf/train_prep_03_best',
# './models/rf/smote_full_best', './models/rf/smote_03_best'
# SECURITY: unpickling executes arbitrary code - only load model files you trust.
# The `with` block closes the file handle, which the bare open() call leaked.
with open('./models/rf/train_prep_full_best', 'rb') as model_file:
    rf = pickle.load(model_file)

# XGBoost model applied to the curie embedding vectors.
# Other checkpoint: 'models/nlp/xgboost_full.json'
boost_emb = XGBClassifier()
boost_emb.load_model('models/nlp/xgboost_0.3.json')

# TabNet model applied to the tabular features as a raw array.
nn = TabNetClassifier()
nn.load_model('models/nn/first_try.zip')



In [13]:
# Score the prepared test split; swap in `train_prep.copy()` to score the training data.
data = test_prep.copy()

# Everything except the target column is model input. The label-encoding and
# train/test-split experiment that used to be parked here inside a string
# literal was dead code and has been removed.
features = data.drop('type', axis=1)
labels = data.type

# The tabular models predict on the full feature frame, so the drop is not repeated.
X_test = features

In [14]:
# One row per item id; each column holds one model's vote.
results = pd.DataFrame({'id': X_test.index}).set_index('id')
#results['type'] = y_test

# Votes from the tabular models.
results['rf'] = rf.predict(X_test)
results['nn'] = nn.predict(X_test.values)
results['xg'] = xgb.predict(X_test)

# Text- and embedding-based votes start as -1 ("no vote") and are filled in later.
# Note: the 'filter' column shares its name with the DataFrame.filter method, so it
# has to be accessed with brackets (results['filter']), never as an attribute.
for vote_column in ('filter', 'indi', 'save', 'emb'):
    results[vote_column] = [-1] * len(results)

In [15]:
# Embedding view of the items; use `train_curie.copy()` to score the training data instead.
df = test_curie.copy()

# One embedding vector per row, handed to the model as a plain Python list.
features = list(df.curie_similarity.values)

# at least xgboost cannot deal with string labels, so integer-encode them
label_encoder = LabelEncoder().fit(df.type)
labels = label_encoder.transform(df.type)
# NOTE(review): none of the four split outputs is read by the cells visible in this notebook.
X_train, emb_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=0
)

# The embedding model predicts on every row, not on the split.
pred = boost_emb.predict(features)
df['pred'] = pred

In [16]:
# Run the three text matchers over the free-text column; each yields an
# English type name or -1 when nothing matched.
df = df.assign(
    filter=df.text_features.apply(filtering),
    indicating=df.text_features.apply(indicating),
    save=df.text_features.apply(save_indicating),
)

In [17]:
# Build the english-name -> row-label table once instead of filtering
# `type_lookup` again for every row of every column.
english_to_code = {}
for code, english_name in zip(type_lookup.index, type_lookup.english):
    # setdefault keeps the first row for a name, matching the former `.index[0]`.
    english_to_code.setdefault(english_name, code)

# Replace each matched English name by its code; -1 ("no match") passes through.
# An unknown name raises KeyError (the per-row scan used to raise IndexError).
# NOTE(review): the code is the row label of `type_lookup`, whereas the submission
# step translates through `type_lookup.id` - confirm the two coincide.
# Like the original cell, this is not idempotent: re-running it on already
# encoded values fails.
for column in ('filter', 'indicating', 'save'):
    df[column] = df[column].apply(
        lambda label: -1 if label == -1 else english_to_code[label]
    )

In [18]:
# Copy the text/embedding votes from `df` into `results`, aligned on item id.
#
# The former row loop assigned through `results.loc[index].<name> = value`.
# That writes to a temporary row Series, so it only reached `results` when
# pandas happened to return a view, and for `filter` it merely rebound the
# Series.filter method, so the 'filter' column was never filled. Label-based
# .loc assignment on the frame itself is reliable.
text_votes = df.loc[
    ~df.index.duplicated(keep='last'),  # last row per id wins, as in the loop
    ['filter', 'indicating', 'save', 'pred'],
]
matched = results.index.isin(text_votes.index)
matched_ids = results.index[matched]

# (column in df, column in results); ids missing from df keep their -1.
for source_column, target_column in (
    ('filter', 'filter'),
    ('indicating', 'indi'),
    ('save', 'save'),
    ('pred', 'emb'),
):
    results.loc[matched, target_column] = (
        text_votes[source_column].reindex(matched_ids).to_numpy()
    )

In [19]:
results.head()

Unnamed: 0_level_0,rf,nn,xg,filter,indi,save,emb
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2652198,36,36,36,-1,-1,-1,-1
3851731,22,22,22,-1,-1,-1,1
1211338,52,52,52,-1,8,-1,31
231244,21,21,21,-1,45,-1,50
2523607,8,8,8,-1,8,-1,8


# evaluate

In [20]:
from statistics import mode
def vote(predictions):
    """Combine per-model predictions into one label by hard voting.

    Parameters
    ----------
    predictions : sequence
        Model outputs, with ``-1`` meaning "no vote". The first entry acts as
        an override: when it is not ``-1`` it is returned unchanged.

    Returns
    -------
    The override value, else the most common of the remaining non-abstaining
    votes (``statistics.mode``; on Python 3.8+ a tie goes to the value that
    appears first). ``-1`` when the input is empty or every model abstained.

    The input sequence is not modified.
    """
    if len(predictions) == 0:
        return -1
    if predictions[0] != -1:
        return predictions[0]
    # Drop every abstention. list.remove() dropped only the first -1, so the
    # leftover -1 entries could out-vote the real predictions.
    ballots = [prediction for prediction in predictions if prediction != -1]
    if not ballots:
        return -1
    return mode(ballots)

In [21]:
# Column order is the vote order: `save` sits in the override slot (vote() returns
# the first entry whenever it is not -1), and the rest are counted as a majority vote.
# The columns are selected by label because `row.filter` resolves to the
# Series.filter method rather than the 'filter' column, so that vote was never counted.
results['prediction'] = results[['save', 'xg', 'rf', 'emb', 'filter']].apply(
    lambda row: vote(row.tolist()), axis=1
)

In [22]:
#accuracy_score(results.type, results.prediction)
#0.9038095238095238
#0.9157142857142857
#0.9130952380952381
#0.9145238095238095

#0.9233333333333333

#0.9516666666666667 double emb smote normal xg rf
#0.9597619047619048 filter

#0.9564285714285714 double emb smote smote xg rf

# submission

In [23]:
results.head()

Unnamed: 0_level_0,rf,nn,xg,filter,indi,save,emb,prediction
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2652198,36,36,36,-1,-1,-1,-1,36
3851731,22,22,22,-1,-1,-1,1,22
1211338,52,52,52,-1,8,-1,31,52
231244,21,21,21,-1,45,-1,50,21
2523607,8,8,8,-1,8,-1,8,8


In [24]:
# Translate numeric type ids into their Estonian names in the `type` column only.
# A frame-wide replace would also rewrite any item id in the `id` column that
# happens to equal a type id. Values without an entry in the lookup are left as is.
id_to_estonian = dict(zip(type_lookup.id, type_lookup.estonian))
submission = pd.DataFrame({'id': results.index, 'type': results.prediction})
submission['type'] = submission['type'].replace(id_to_estonian)
submission.to_csv('submissions/2.csv', index=False)