In [None]:
import pandas as pd
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import * 
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import pickle

In [None]:
# Load the full dataset; the first CSV column is used as the DataFrame index.
total_df = pd.read_csv(
    'total_sequences.csv',
    index_col=0,
)

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the
# 'label' column and seeded so it is reproducible across runs.
total_train, total_test = model_selection.train_test_split(
    total_df,
    test_size=0.3,
    stratify=total_df['label'],
    random_state=1,
)

In [None]:
# Separate features from the target: every column except the last is a
# feature, the last column is the target.
# NOTE(review): assumes the last column is the 'label' column that the
# train/test split stratifies on — confirm against the CSV layout.
X_train = total_train.iloc[:, :-1]
y_train = total_train.iloc[:, -1]
X_test = total_test.iloc[:, :-1]
y_test = total_test.iloc[:, -1]

In [None]:
# Function of model fitting, GridSearchCV, and evaluation
def model_fit(model, params, X_train, y_train, X_test, y_test, cv=5, n_jobs=50):
    """Tune ``model`` with a grid search, refit it, and score it on the test set.

    The estimator passed in is mutated: after the call it carries the best
    hyper-parameters found and is fitted on ``(X_train, y_train)``.

    Parameters
    ----------
    model : estimator
        Classifier exposing ``set_params``, ``fit``, ``predict`` and
        ``predict_proba``.
    params : dict or list of dict
        Parameter grid forwarded to ``GridSearchCV``.
    X_train, y_train : training features and labels.
    X_test, y_test : held-out features and labels used for the final metrics.
    cv : int, default 5
        Number of cross-validation folds for the grid search.
    n_jobs : int, default 50
        Parallel workers for the grid search (default kept from the original
        hard-coded value; pass ``-1`` to use all available cores).

    Returns
    -------
    dict
        ``cv_results``: one-row DataFrame (``params``, ``mean_test_score``,
        ``std_test_score``) for the best grid point;
        ``acc``: test accuracy; ``f1``: test F1 score as a plain float;
        ``fpr``/``tpr``/``thr``: ROC curve arrays; ``roc_auc``: area under it.

    Notes
    -----
    The metrics use ``f1_score`` defaults and ``y_prob[:, 1]`` as the positive
    class score, so this presumably expects a binary problem — confirm before
    using it with multi-class labels.
    """
    # refit=False: the winning configuration is fitted exactly once (below, on
    # the caller's estimator) instead of once by GridSearchCV and once here.
    # best_params_ / best_index_ are still populated for single-metric scoring.
    gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, refit=False)
    gs.fit(X_train, y_train)
    best_params = gs.best_params_

    # Select the winning row by position rather than by comparing dicts
    # element-wise against a Series, which is fragile.
    cv_table = pd.DataFrame(gs.cv_results_)
    cv_results = cv_table[['params', 'mean_test_score', 'std_test_score']].iloc[[gs.best_index_]]

    model.set_params(**best_params)
    model.fit(X_train, y_train)

    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    # No trailing comma here: the original accidentally stored a 1-tuple.
    f1 = f1_score(y_test, y_pred)
    fpr, tpr, thr = roc_curve(y_test, y_prob[:, 1])
    roc_auc = auc(fpr, tpr)

    return {'cv_results': cv_results, 'acc': acc, 'f1': f1, 'fpr': fpr, 'tpr': tpr, 'thr': thr, 'roc_auc': roc_auc}

In [None]:
# 1. Logistic regression
clf = LogisticRegression(random_state=1)
# The grid is split into sub-grids so that only valid penalty/solver
# combinations are evaluated:
#   * 'l1' and 'l2' work with both 'liblinear' and 'saga';
#   * 'elasticnet' is only supported by 'saga' and requires an l1_ratio.
# A single crossed grid would include combinations that fail at fit time and
# are scored as NaN, wasting CV fits.
params = [
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear', 'saga']
    },
    {
        'C': [0.1, 1, 10, 100],
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        # Mixing ratios between pure L2 (0) and pure L1 (1).
        'l1_ratio': [0.25, 0.5, 0.75]
    }
]
lr_results = model_fit(clf, params, X_train, y_train, X_test, y_test)
with open('lr_results.pkl', 'wb') as f:
    pickle.dump(lr_results, f)

In [None]:
# 2. Gaussian Naive Bayes
# Only the variance-smoothing term is tuned, over four orders of magnitude.
params = dict(
    var_smoothing=[1e-09, 1e-08, 1e-07, 1e-06],
)
clf = GaussianNB()
gn_results = model_fit(clf, params, X_train, y_train, X_test, y_test)

# Persist the tuning and evaluation summary.
with open('gn_results.pkl', 'wb') as f:
    pickle.dump(gn_results, f)

In [None]:
# 3. Random Forest
# Grid over forest size, per-split feature subsampling, tree depth and
# split criterion.
params = dict(
    n_estimators=[20, 60, 100],
    max_features=['sqrt', 'log2'],
    max_depth=[2, 4, 6, 8],
    criterion=['gini', 'entropy', 'log_loss'],
)
clf = RandomForestClassifier(n_jobs=3, random_state=1)
rf_results = model_fit(clf, params, X_train, y_train, X_test, y_test)

# Persist the tuning and evaluation summary.
with open('rf_results.pkl', 'wb') as f:
    pickle.dump(rf_results, f)

In [None]:
# 4. Multi-Layer Perceptron
# Two hidden layers: the first has 1, 4 or 7 units, the second always has 2.
params = dict(
    hidden_layer_sizes=[(first_layer, 2) for first_layer in (1, 4, 7)],
    learning_rate=['constant', 'invscaling', 'adaptive'],
    alpha=[0.0001, 0.001, 0.005],
    solver=['lbfgs', 'sgd', 'adam'],
)
clf = MLPClassifier(max_iter=300, random_state=1)
mlp_results = model_fit(clf, params, X_train, y_train, X_test, y_test)

# Persist the tuning and evaluation summary.
with open('mlp_results.pkl', 'wb') as f:
    pickle.dump(mlp_results, f)