In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from scipy.special import softmax
from sklearn.preprocessing import MinMaxScaler, minmax_scale

## VALID DATA

In [2]:
# Load the full training table from CSV and preview the first three rows.
trainDataFull = pd.read_csv("trainData.csv")
trainDataFull.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v91,v92,v93,v94,v95,v96,v97,v98,v99,target
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4,9
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2,6
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0,3


In [3]:
# Label-based column slice: keeps every column from 'v1' through 'v99'
# (inclusive) in the frame's existing column order.
# NOTE(review): this relies on 'v99' being the last feature column before
# 'target' in the CSV — confirm against the file header.
trainData = trainDataFull.loc[:,'v1':'v99']
trainData.head(3)

Unnamed: 0,v1,v10,v100,v101,v102,v103,v11,v12,v13,v14,...,v90,v91,v92,v93,v94,v95,v96,v97,v98,v99
0,1.4,0.0,0.2,1.0,4.2,0.4,0.0,0.0,0.0,1.2,...,0.2,0.6,0.2,0.0,3.2,1.0,0.2,0.0,1.6,0.4
1,0.0,0.0,0.0,2.8,0.0,0.8,0.0,0.2,1.2,1.4,...,0.0,0.0,0.0,1.2,0.0,1.2,0.2,0.2,2.6,2.2
2,0.0,0.0,0.0,0.4,0.0,0.6,0.8,0.0,0.0,0.2,...,0.0,0.0,0.0,0.0,0.0,0.8,0.2,0.8,1.4,0.0


In [4]:
# Pull out the class-label column and list the distinct classes it contains.
trainLabels = trainDataFull['target']
trainLabels.unique()

array([9, 6, 3, 4, 2, 8, 7, 1, 5])

In [5]:
# Fit the encoder on the class labels, then map every label to its integer code.
label_encoder = LabelEncoder().fit(trainLabels)
label_encoded_y = label_encoder.transform(trainLabels)
label_encoded_y

array([8, 5, 2, ..., 7, 4, 2])

In [6]:
# Hold out 5% of the rows as a stratified split; the fixed random_state makes
# the partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    trainData.values,
    label_encoded_y,
    test_size=0.05,
    random_state=33,
    shuffle=True,
    stratify=label_encoded_y,
)

## FUSION

In [8]:
# Per-class scores (columns c1..c9) of the first model on the held-out split.
# NOTE(review): the variable is named "RFC" but the file name says xgboost-1 —
# confirm which model produced this file.
# NOTE(review): presumably the rows are in the same order as y_test; verify.
valid_RFC = pd.read_csv("./results/valid-submission-xgboost-1-optimal.csv")
valid_RFC.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000293,0.000461,0.000714,0.000118,2.5e-05,0.001526,0.000245,0.996494,0.000124
1,0.000115,2e-05,2e-05,1e-05,5e-06,0.000596,0.000219,0.998951,6.4e-05
2,0.003406,8.5e-05,0.000136,0.000131,0.000117,0.001847,0.007751,0.98611,0.000418
3,0.002974,0.000331,0.000283,0.000517,2.5e-05,0.002343,0.000145,0.000472,0.992909
4,0.001298,0.000996,0.000835,0.006854,0.000119,0.968363,0.003845,0.015608,0.002083


In [10]:
# Per-class scores (columns c1..c9) of the second model on the held-out split.
# NOTE(review): presumably the rows are in the same order as y_test; verify.
valid_xgboost = pd.read_csv("./results/valid-submission-xgboost-2-optimal.csv")
valid_xgboost.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000101,0.000115,0.000221,4.2e-05,1.7e-05,0.000661,3.3e-05,0.998791,1.8e-05
1,5.1e-05,7e-06,9e-06,2e-06,1.2e-05,0.000577,9.8e-05,0.999205,3.8e-05
2,0.004228,3.7e-05,4.5e-05,7.9e-05,0.000183,0.002042,0.006516,0.986677,0.000193
3,0.001666,8e-05,0.000136,0.000307,6e-05,0.002291,7.7e-05,0.000434,0.99495
4,0.001465,0.000501,0.000218,0.002832,0.000322,0.967463,0.003802,0.021743,0.001654


### Standard Fusion

In [11]:
# Exhaustive search over the fusion weight on the held-out split.
# The grid is built with np.linspace so it stays inside [0, 1]: a float-step
# np.arange with stop 1.1 would run up to 1.09 and hand the second model a
# negative weight, which is no longer a convex combination of the two scores.
best_weight, best_accuracy = None, -1.0
for weight in np.linspace(0.0, 1.0, 101):
    fusionX = (weight*valid_RFC.values + (1-weight)*valid_xgboost.values)
    predictions = np.argmax(fusionX,1)
    accuracy = accuracy_score(y_test, predictions)
    print("Weight: %.2f Accuracy: %.2f%%" % (weight ,accuracy * 100.0))
    # Strict '>' keeps the smallest weight when several weights tie.
    if accuracy > best_accuracy:
        best_weight, best_accuracy = weight, accuracy
print("Best weight: %.2f Accuracy: %.2f%%" % (best_weight, best_accuracy * 100.0))

Weight: 0.00 Accuracy: 83.48%
Weight: 0.01 Accuracy: 83.48%
Weight: 0.02 Accuracy: 83.52%
Weight: 0.03 Accuracy: 83.52%
Weight: 0.04 Accuracy: 83.52%
Weight: 0.05 Accuracy: 83.52%
Weight: 0.06 Accuracy: 83.48%
Weight: 0.07 Accuracy: 83.45%
Weight: 0.08 Accuracy: 83.48%
Weight: 0.09 Accuracy: 83.58%
Weight: 0.10 Accuracy: 83.58%
Weight: 0.11 Accuracy: 83.55%
Weight: 0.12 Accuracy: 83.55%
Weight: 0.13 Accuracy: 83.58%
Weight: 0.14 Accuracy: 83.55%
Weight: 0.15 Accuracy: 83.58%
Weight: 0.16 Accuracy: 83.58%
Weight: 0.17 Accuracy: 83.61%
Weight: 0.18 Accuracy: 83.61%
Weight: 0.19 Accuracy: 83.61%
Weight: 0.20 Accuracy: 83.61%
Weight: 0.21 Accuracy: 83.65%
Weight: 0.22 Accuracy: 83.68%
Weight: 0.23 Accuracy: 83.61%
Weight: 0.24 Accuracy: 83.65%
Weight: 0.25 Accuracy: 83.65%
Weight: 0.26 Accuracy: 83.65%
Weight: 0.27 Accuracy: 83.68%
Weight: 0.28 Accuracy: 83.68%
Weight: 0.29 Accuracy: 83.74%
Weight: 0.30 Accuracy: 83.65%
Weight: 0.31 Accuracy: 83.61%
Weight: 0.32 Accuracy: 83.52%
Weight: 0.

In [12]:
# Blend the two validation score matrices with fixed 0.29 / 0.71 weights and
# wrap the result in a frame carrying the c1..c9 class columns.
fusion_standart = (0.29*valid_RFC.values + 0.71*valid_xgboost.values)
fusion_standart_score = pd.DataFrame(
    fusion_standart,
    columns=['c%d' % k for k in range(1, 10)],
)
fusion_standart_score

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000157,0.000215,0.000364,0.000064,0.000019,0.000912,0.000095,0.998125,0.000049
1,0.000070,0.000011,0.000012,0.000004,0.000010,0.000583,0.000134,0.999131,0.000046
2,0.003990,0.000051,0.000071,0.000094,0.000164,0.001985,0.006874,0.986512,0.000258
3,0.002046,0.000153,0.000178,0.000368,0.000050,0.002306,0.000097,0.000445,0.994358
4,0.001416,0.000644,0.000397,0.003998,0.000263,0.967724,0.003815,0.019964,0.001778
...,...,...,...,...,...,...,...,...,...
3089,0.001220,0.699533,0.121080,0.046234,0.000441,0.002600,0.126199,0.002391,0.000301
3090,0.000210,0.082008,0.051539,0.860543,0.000701,0.000532,0.001301,0.000455,0.002710
3091,0.001883,0.000013,0.000014,0.000166,0.000032,0.996389,0.000699,0.000741,0.000063
3092,0.000850,0.088157,0.885764,0.021641,0.000605,0.000867,0.000554,0.000394,0.001169


In [13]:
# Persist the blended validation scores; index = False leaves the row index out of the CSV.
fusion_standart_score.to_csv('./results/valid-fusion-optimal.csv', index = False)

### Normalized Fusion

In [None]:
def tanh_normalize(x):
    """Apply tanh-estimator score normalization to each row of ``x``.

    Every row (one sample's scores) is standardized with its own mean and
    standard deviation and squashed into the open interval (0, 1):

        0.5 * (tanh(0.01 * (score - mean) / std) + 1)

    Parameters
    ----------
    x : array-like
        Score matrix with one row per sample and one column per class.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x`` holding the normalized scores.
        A row whose scores are all identical maps to 0.5 everywhere.
    """
    # Work on the transposed matrix so that axis 0 runs over the classes of a
    # single sample and the per-sample statistics broadcast against it.
    y = np.asarray(x, dtype=float).transpose()
    m = np.mean(y, axis=0)
    std = np.std(y, axis=0)
    # A constant row has zero spread; divide by 1 instead so the result is the
    # neutral value 0.5 rather than NaN.
    safe_std = np.where(std == 0, 1.0, std)
    # Center first, then scale, and add 1 *outside* the tanh.
    normalized = 0.5 * (np.tanh(0.01 * (y - m) / safe_std) + 1.0)

    return normalized.transpose()

In [None]:
# Run both validation score matrices through the same tanh normalization.
RFC_normalized, xgboost_normalized = (
    tanh_normalize(frame.values) for frame in (valid_RFC, valid_xgboost)
)

In [None]:
# Exhaustive search over the fusion weight for the tanh-normalized scores.
# np.linspace gives exactly 11 weights from 0.0 to 1.0 inclusive; a float-step
# np.arange only lands on the intended endpoint by rounding luck.
best_weight, best_accuracy = None, -1.0
for weight in np.linspace(0.0, 1.0, 11):
    fusionX = (weight*RFC_normalized + (1-weight)*xgboost_normalized)
    predictions = np.argmax(fusionX,1)
    accuracy = accuracy_score(y_test, predictions)
    print("Weight: %.2f Accuracy: %.2f%%" % (weight ,accuracy * 100.0))
    # Strict '>' keeps the smallest weight when several weights tie.
    if accuracy > best_accuracy:
        best_weight, best_accuracy = weight, accuracy
print("Best weight: %.2f Accuracy: %.2f%%" % (best_weight, best_accuracy * 100.0))

In [None]:
# Equal-weight blend of the two normalized score matrices, wrapped in a frame
# carrying the c1..c9 class columns.
fusion_normalized = (0.5*RFC_normalized + 0.5*xgboost_normalized)
fusion_normalized_score = pd.DataFrame(
    fusion_normalized,
    columns=['c%d' % k for k in range(1, 10)],
)
fusion_normalized_score.head()

In [None]:
# Fit a min-max scaler on the fused scores and apply it to the same matrix.
# NOTE(review): MinMax_reconstruct is reassigned by the following cell, so this
# result appears to be unused — confirm whether this cell is still needed.
scaler = MinMaxScaler().fit(fusion_normalized)
MinMax_reconstruct = scaler.transform(fusion_normalized)

In [None]:
# Rescale the fused scores with minmax_scale along axis=1 (per row), then apply
# softmax along axis=1 so that each row sums to 1 again.
# The first 100 rows are displayed for inspection.
MinMax_reconstruct = softmax(minmax_scale(fusion_normalized,axis=1),axis=1)
fusion_reconstruct_score = pd.DataFrame(MinMax_reconstruct, columns=['c1','c2','c3','c4','c5','c6','c7','c8','c9'])
fusion_reconstruct_score.head(100)

In [None]:
# Persist the normalized-fusion validation scores; index = False leaves the row index out of the CSV.
fusion_reconstruct_score.to_csv('./results/valid-fusion-normalized.csv', index = False)

## TEST DATA

In [8]:
# Per-class scores (columns c1..c9) of the first model on the test set.
# NOTE(review): the variable is named "RFC" but the file name says model-nn —
# confirm which model produced this file.
test_RFC = pd.read_csv("./results/test-submission-model-nn.csv")
test_RFC.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.0001000881,0.2973343,0.3732213,0.3219612,8.985221e-08,6.293251e-05,0.007315,5.150354e-06,4.378657e-07
1,0.0002459792,0.0003173853,1.044408e-05,3.075071e-06,2.177567e-07,0.7232735,0.000262,0.2753139,0.000573809
2,3.064437e-07,4.021253e-08,1.25542e-08,1.021791e-08,2.078018e-08,0.9999857,3e-06,1.154281e-05,1.248012e-07
3,1.762037e-07,0.7405864,0.2571583,0.002252082,4.259134e-08,6.713095e-08,2e-06,2.728781e-07,9.686786e-07
4,0.3927379,2.821333e-06,3.308079e-07,1.022413e-07,1.701025e-07,0.001261078,0.000283,0.02064538,0.5850698


In [9]:
# Per-class scores (columns c1..c9) of the second model on the test set.
# NOTE(review): presumably the rows line up with those of test_RFC; verify.
test_xgboost = pd.read_csv("./results/submission-model-2-optimal.csv")
test_xgboost.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000446,0.078695,0.108564,0.808224,0.000431,7.1e-05,0.003252,0.000257,6e-05
1,0.00429,0.025319,0.001983,0.001405,0.001005,0.137486,0.00349,0.82305,0.001972
2,5.8e-05,1.4e-05,4.1e-05,2.2e-05,2.2e-05,0.999038,6.3e-05,0.00071,3.1e-05
3,0.000563,0.763614,0.221859,0.008738,0.000502,0.000344,0.00045,0.00122,0.002711
4,0.042744,0.000797,0.00035,0.000127,0.001041,0.002581,0.00074,0.029084,0.922536


In [10]:
# Blend the two test-set score matrices with the fixed 0.29 / 0.71 weights and
# wrap the result in a frame carrying the c1..c9 class columns.
# NOTE(review): the validation blend used the xgboost-1 / xgboost-2 files,
# while the first input here is read from the model-nn file — confirm that
# these weights are meant to carry over to this model pair.
test_fusion_standart = (0.29*test_RFC.values + 0.71*test_xgboost.values)
test_fusion_standart_score = pd.DataFrame(
    test_fusion_standart,
    columns=['c%d' % k for k in range(1, 10)],
)
test_fusion_standart_score.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,0.000346,0.1421,0.185315,0.667208,0.000306,6.9e-05,0.00443,0.000184,4.3e-05
1,0.003117,0.018069,0.001411,0.000998,0.000714,0.307364,0.002554,0.664207,0.001566
2,4.1e-05,1e-05,2.9e-05,1.6e-05,1.6e-05,0.999313,4.6e-05,0.000507,2.2e-05
3,0.0004,0.756936,0.232096,0.006857,0.000357,0.000244,0.00032,0.000866,0.001925
4,0.144242,0.000566,0.000249,9e-05,0.000739,0.002198,0.000608,0.026637,0.824671


In [11]:
# Persist the blended test-set scores; index = False leaves the row index out of the CSV.
test_fusion_standart_score.to_csv('./results/test-fusion-nn.csv', index = False)