# Setup


In [1]:
from setup_general import *
from setup_embedding import *

In [2]:
type_indicators = {}
with open('data/type_indicators/type_ind_cut.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        type = a[1]
        indicators = a[2].split()
        type_indicators[type] = indicators
save_indicators = {}
with open('data/type_indicators/save_indicator.txt', 'r') as f:
    for line in f:
        a = line.split('\'')
        type = a[1]
        indicators = a[2].split()
        save_indicators[type] = indicators

In [3]:
# naive functions for type from text keywords

def filtering(text):
    pred = []
    for type in types:            
        if type in text:
            pred.append(type)
    if ('drawing' in text) or ('sketch' in text) or ('design' in text):
        pred.append('design/drawing/sketch')
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1
    
def indicating(text):
    pred = []
    for type in types:
        for indicator in type_indicators[type]:
            if indicator in text:
                pred.append(type)
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1

def save_indicating(text):
    pred = []
    for type in types:
        if type in save_indicators.keys():
            for indicator in save_indicators[type]:
                if indicator in text:
                    pred.append(type)
    if len(pred) > 0:
        return pred[-1]
    else:
        return -1


# combine models via class-probability combination (soft-voting)


In [4]:
# is the full ds used for submission?
full = False
# submit to 
sub_name = 'class_adding_best_kaggle.csv'

In [5]:
#define models to be used for testing use 03 for submission use full
import pickle
xgb = XGBClassifier()
xgb.load_model('models/xg/03_smote100.json')

rf = pickle.load(open('./models/rf/train_prep_full_best' , 'rb'))

boost_emb = XGBClassifier()
boost_emb.load_model('models/nlp/xgboost_03.json')

nn = TabNetClassifier()
nn.load_model('models/nn/tabnet_03.zip')



In [6]:
data = test_prep.copy() if full else train_prep.copy()

features = data.drop('type', axis=1)
labels = data.type

# at least xgboost cannot deal with string labels
label_encoder = LabelEncoder()
label_encoder = label_encoder.fit(labels)
labels = label_encoder.transform(labels)

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.3, random_state=0)
if full: X_test = features

In [7]:
results = pd.DataFrame()
results['id'] = X_test.index
results.set_index('id', inplace=True)
if not full: results['type'] = y_test

#results['rf'] = rf.predict(X_test)
results['nn'] = nn.predict(X_test.values)
#results['xg'] = xgb.predict_proba(X_test)

results['filter'] = [-1] * len(results)
results['indi'] = [-1] * len(results)
results['save'] = [-1] * len(results)
results['emb'] = [[-1]] * len(results)


results['xg'] = [[-1]] * len(results)
results['rf'] = [[-1]] * len(results)

In [8]:
for i,item in enumerate(xgb.predict_proba(X_test)):
    results['xg'].iloc[i] = np.array(item)

for i,item in enumerate(rf.predict_proba(X_test)):
    results['rf'].iloc[i] = np.array(item)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['xg'].iloc[i] = np.array(item)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results['rf'].iloc[i] = np.array(item)


In [9]:
text = test_curie.copy() if full else train_curie.copy()

features = list(text.curie_similarity.values)
labels = text.type

#text['pred'] = boost_emb.predict_proba(features)
text['pred'] = [[-1]] * len(features)

In [10]:
for i,item in enumerate(boost_emb.predict_proba(features)):
    text['pred'].iloc[i] = np.array(item)

In [11]:
# establish our own rules to determine type from text - eventually not beneficial
text['filter'] = text.text_features.apply(filtering) # check for the type in the text
text['indicating'] = text.text_features.apply(indicating) # check for other often occuring type indicating words
text['save'] = text.text_features.apply(save_indicating) # only check for words that (almost) only occur with a certain type

text['filter'] = text['filter'].apply(lambda x: type_lookup[type_lookup.english == x].index[0] if x != -1 else -1)
text['indicating'] = text['indicating'].apply(lambda x: type_lookup[type_lookup.english == x].index[0] if x != -1 else -1)
text['save'] = text['save'].apply(lambda x: type_lookup[type_lookup.english == x].index[0] if x != -1 else -1)

for index, item in text.iterrows():
        if index in results.index:
            results.at[index, 'filter'] = item['filter']
            results.at[index, 'indi'] = item['indicating']
            results.at[index, 'save'] = item['save']
            results.at[index, 'emb'] = item['pred']

# evalaluate

In [12]:
# from previous approach for hard voting on predicted classes
"""
from statistics import mode
def vote(predictions):
    if predictions[0] != -1:
        return predictions[0]
    if -1 in predictions: predictions.remove(-1)
    return mode(predictions)

results['prediction'] = results.apply(lambda row: vote([row.save,row.xg,row.rf,row.emb]), axis=1)
"""

"\nfrom statistics import mode\ndef vote(predictions):\n    if predictions[0] != -1:\n        return predictions[0]\n    if -1 in predictions: predictions.remove(-1)\n    return mode(predictions)\n\nresults['prediction'] = results.apply(lambda row: vote([row.save,row.xg,row.rf,row.emb]), axis=1)\n"

In [13]:
from operator import add
def vote(preds):
    xg = np.array(preds[0])
    emb = np.array(preds[1])
    if emb[0] != -1:
        res = xg + emb
    else:
        res = xg
    return np.argmax(res)

In [14]:
results['prediction'] = results.apply(lambda row: vote([row.xg,row.emb]), axis=1)
if not full:
    print(accuracy_score(results.type, results.prediction))
    print(classification_report(results.type, results.prediction))

0.9783333333333334
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        13
           1       1.00      1.00      1.00       240
           2       1.00      1.00      1.00        20
           3       1.00      1.00      1.00         1
           4       0.98      0.99      0.99       162
           5       1.00      1.00      1.00        11
           6       1.00      0.94      0.97        17
           7       1.00      1.00      1.00         2
           8       1.00      1.00      1.00        52
           9       1.00      0.86      0.92         7
          10       1.00      1.00      1.00         3
          11       1.00      0.98      0.99       270
          12       1.00      1.00      1.00        57
          13       1.00      1.00      1.00        12
          14       0.95      0.89      0.92       125
          15       1.00      1.00      1.00         3
          16       1.00      0.95      0.97        20
        

# submission

In [15]:
if full:
    submission = pd.DataFrame({'id': results.index ,'type': results.prediction})
    submission = submission.replace(type_lookup.id.to_list(), type_lookup.estonian.to_list())
    submission.to_csv(f'submissions/{sub_name}', index=False)