In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold

In [None]:
from sklearn.neural_network import MLPClassifier
# from keras.utils import to_categorical

def to_categorical(y, num_classes=None, dtype='float32'):
    """One-hot encode an array of integer class indices.

    Parameters
    ----------
    y : array-like
        Class indices; converted to an integer array before encoding.
    num_classes : int, optional
        Length of the one-hot axis. When falsy (``None`` or ``0``) it is
        inferred as ``max(y) + 1``.
    dtype : str or numpy dtype
        Element type of the returned array.

    Returns
    -------
    numpy.ndarray
        Array whose shape is the shape of ``y`` (with a trailing axis of
        length 1 dropped when ``y`` has more than one axis) followed by a
        new axis of length ``num_classes``.
    """
    labels = np.array(y, dtype='int')
    leading_shape = labels.shape
    # A column vector of shape (..., 1) is treated like its flattened form.
    if leading_shape and len(leading_shape) > 1 and leading_shape[-1] == 1:
        leading_shape = tuple(leading_shape[:-1])

    flat_labels = labels.ravel()
    if not num_classes:
        num_classes = np.max(flat_labels) + 1

    count = flat_labels.shape[0]
    one_hot = np.zeros((count, num_classes), dtype=dtype)
    # Fancy indexing sets exactly one entry per row.
    one_hot[np.arange(count), flat_labels] = 1
    return np.reshape(one_hot, leading_shape + (num_classes,))

In [None]:
# Load the raw data: the first training column holds the sequences, the
# second the labels; every row of the test file is treated as one sequence.
df_train = pd.read_csv('train.csv')
df_test  = pd.read_csv('test.csv')

x_train_raw = df_train.values[:,0]
y_true = df_train.values[:,1].astype(float)
x_test_raw = df_test.values

# convert the characters to numbers
# NOTE(review): "B" and "D" both map to 1, so the two letters are
# indistinguishable after encoding -- confirm this is intended.
Dict = {"A":0, "B":1, "C":2, "D":1, "E":3, "F":4, "G":5, "H":6, "I":7, "K":8, "L":9, "M":10, "N":11, "P":12,
     "Q":13, "R":14, "S":15, "T":16, "U":17, "V":18, "W":19, "Y":20}


def encode_sequences(sequences, mapping, seq_len=4):
    """Map the first ``seq_len`` characters of every sequence to integer codes.

    Parameters
    ----------
    sequences : array-like
        Each entry is a string, or a length-1 array wrapping one (as produced
        by ``DataFrame.values`` on a single-column frame).
    mapping : dict
        Character -> integer code.
    seq_len : int
        Number of leading characters to encode per sequence.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(sequences), seq_len)``; ``to_categorical``
        casts it to int before one-hot encoding.

    Raises
    ------
    ValueError
        If a sequence has fewer than ``seq_len`` characters or contains a
        character that is missing from ``mapping``.
    """
    encoded = np.zeros((len(sequences), seq_len))
    for row, entry in enumerate(sequences):
        # np.squeeze unwraps the length-1 rows of the 2-D test array.
        molecule = str(np.squeeze(entry))
        if len(molecule) < seq_len:
            raise ValueError(
                "sequence %r at row %d has fewer than %d characters"
                % (molecule, row, seq_len))
        try:
            encoded[row, :] = [mapping[ch] for ch in molecule[:seq_len]]
        except KeyError as err:
            raise ValueError(
                "sequence %r at row %d contains an unknown character %s"
                % (molecule, row, err)) from err
    return encoded


x_train = encode_sequences(x_train_raw, Dict)
x_test = encode_sequences(x_test_raw, Dict)

In [None]:
# Stratified 10-fold cross-validation of the MLP on one-hot encoded sequences.
# random_state is an explicit integer: np.random.seed(0) returns None, so
# passing its result left the estimator unseeded and only reseeded the global
# NumPy RNG as a side effect. hidden_layer_sizes is written as a real tuple.
mlp = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam', alpha=0.0001,
                    batch_size='auto', learning_rate_init=0.001, max_iter=500, shuffle=True,
                    random_state=0, tol=0.0001, warm_start=False, early_stopping=True,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10)

# 10 fold cross-validation
num_fold = 10
skf = StratifiedKFold(n_splits=num_fold, random_state=None, shuffle=False)

# Fix the one-hot width from the full training set. Letting to_categorical
# infer it per fold can give the training and validation matrices different
# numbers of columns when a fold lacks the largest code.
onehot_width = int(x_train.max()) + 1

score = 0
for cnt, (train_index, val_index) in enumerate(skf.split(x_train, y_true)):
    # split the data into validation and training sets
    print(cnt, 'starting .. ', end="")

    X_train, X_val = x_train[train_index], x_train[val_index]
    Y_train, Y_val = y_true[train_index], y_true[val_index]

    # One-hot encode every position, then flatten to one feature row per sample.
    X_train_cat = to_categorical(X_train, num_classes=onehot_width).reshape(len(train_index), -1)
    X_val_cat = to_categorical(X_val, num_classes=onehot_width).reshape(len(val_index), -1)

    mlp.fit(X_train_cat, Y_train) # apply model to training data
    Y_est = mlp.predict(X_val_cat) # predict labels using validation data

    # Compute the fold's F1 once and reuse it for the total and the log line.
    fold_score = f1_score(Y_val, Y_est, pos_label=1)
    score += fold_score
    print(fold_score)
    print("done")

# EVALUATION of model
# Calculate average score for all 10 folds
avg_score = score/num_fold
print(avg_score)

In [None]:
# Fit on the whole training set
# Each of the sequence positions is one-hot encoded and the result is
# flattened to a single feature row per sample.
x_total_cat = to_categorical(x_train).reshape(len(x_train_raw), -1)

# random_state is an explicit integer: np.random.seed(0) returns None, so
# passing its result left the estimator unseeded and only reseeded the global
# NumPy RNG as a side effect. hidden_layer_sizes is written as a real tuple.
mlp = MLPClassifier(hidden_layer_sizes=(256,), activation='relu', solver='adam', alpha=0.0001,
                    batch_size='auto', learning_rate_init=0.001, max_iter=500, shuffle=True,
                    random_state=0, tol=0.0001, warm_start=False, early_stopping=True,
                    validation_fraction=0.1, beta_1=0.9, beta_2=0.999, epsilon=1e-08, n_iter_no_change=10)

mlp.fit(x_total_cat, y_true)

In [None]:
# Predict the Labels for the test features
# Use the same one-hot width as the training matrix. Inferring it from the
# test data alone can produce a different number of columns than the model
# was fitted on, which makes predict() fail.
n_symbols = x_total_cat.shape[1] // x_test.shape[1]
x_test_cat = to_categorical(x_test, num_classes=n_symbols).reshape(len(x_test_raw), -1)
y_pred = mlp.predict(x_test_cat)
print(y_pred.shape)

In [None]:
# Cast the predicted labels to integers and print their sum.
output = np.asarray(y_pred).astype(int)
print(output.sum())

In [None]:
# Write the integer predictions to predict.csv using the '%d' format.
np.savetxt(fname='predict.csv', X=output, delimiter='', fmt='%d')