In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearnex import svm
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
from lightgbm import LGBMClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# --- Data loading and shared cross-validation building blocks ---
# CSV file that holds the feature columns together with the label column.
datasetPath = "datasets/phrase-all-0-120.csv"
# Name of the label column to predict.
target = "healthy"

dataframe = pd.read_csv(datasetPath)
# Features are every column except the label; the label is kept on its own.
X = dataframe.drop(columns=[target])
y = dataframe[target]

# Objects shared by the grid-search cells below: the preprocessing steps that
# go into each Pipeline and the 5-fold stratified splitter passed as `cv`.
scaler = StandardScaler()
pca = PCA()
cv = StratifiedKFold(n_splits=5)

In [3]:
##RIDGE CLASSIFIER
# Grid-search RidgeClassifier hyper-parameters inside a scaler -> PCA -> model
# pipeline, scored by accuracy with the shared stratified CV splitter.
print("RIDGE CLASSIFIER")

param_grid = [
    {
        'pca__n_components': [30],
        'model__fit_intercept': [True, False],
        # Each alpha appears exactly once: a repeated value only makes the
        # search fit identical candidates twice.
        # NOTE(review): 5e-3 used to be listed twice; if the second entry was
        # meant to be another value (e.g. 5e-2), add it here.
        'model__alpha': [1e-3, 5e-3, 1e-4, 1e-2],
        'model__solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    }
]

model = RidgeClassifier()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)


RIDGE CLASSIFIER


In [None]:
##LOGISTIC REGRESSION
# Grid-search LogisticRegression inside a scaler -> PCA -> model pipeline,
# scored by accuracy with the shared stratified CV splitter.
print("LOGISTIC REGRESSION")

# Not every solver implements every penalty, so the grid is split into
# sub-grids that only contain supported (penalty, solver) pairs. A single
# cross-product grid would spend most of its fits on combinations that raise
# inside the estimator and are recorded as NaN scores.
C_VALUES = [1, 10, 100, 1000]
param_grid = [
    {
        # l2 is supported by every solver listed here.
        'pca__n_components': [30],
        'model__penalty': ['l2'],
        'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga', 'newton-cholesky'],
        'model__C': C_VALUES,
    },
    {
        # l1 is only implemented by liblinear and saga.
        'pca__n_components': [30],
        'model__penalty': ['l1'],
        'model__solver': ['liblinear', 'saga'],
        'model__C': C_VALUES,
    },
    {
        # elasticnet is saga-only and needs an explicit l1_ratio.
        # NOTE(review): 0.5 is a placeholder mixing ratio; widen the list if
        # the elastic-net branch matters for this study.
        'pca__n_components': [30],
        'model__penalty': ['elasticnet'],
        'model__solver': ['saga'],
        'model__l1_ratio': [0.5],
        'model__C': C_VALUES,
    },
]
model = LogisticRegression()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##PERCEPTRON
# Search over the Perceptron's penalty type and intercept handling. The
# estimator sits at the end of a scaler -> PCA -> model pipeline and every
# candidate is scored by accuracy on the shared CV splitter.
print("PERCEPTRON")

model = Perceptron()
pipe = Pipeline(
    steps=[
        ("scaler", scaler),
        ("pca", pca),
        ("model", model),
    ]
)
param_grid = [
    dict(
        pca__n_components=[30],
        model__penalty=['l2', 'l1', 'elasticnet'],
        model__fit_intercept=[True, False],
    )
]
clf = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=cv)
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##PASSIVE AGGRESSIVE
# Search over C, the loss function and intercept handling of the
# PassiveAggressiveClassifier, placed after scaling and PCA in a pipeline.
print("PASSIVE AGRESSIVE")

model = PassiveAggressiveClassifier()
steps = [("scaler", scaler), ("pca", pca), ("model", model)]
pipe = Pipeline(steps=steps)

param_grid = [
    dict(
        pca__n_components=[30],
        model__fit_intercept=[True, False],
        model__C=[1, 0.1, 0.01, 1.5, 2],
        model__loss=['hinge', 'squared_hinge'],
    )
]

clf = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=cv)
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##SVM
# Grid-search an SVC inside the same scaler -> PCA -> model pipeline used by
# the other cells, scored by accuracy with the shared stratified CV splitter.
print("SVM")

# Scaling and PCA are pipeline steps rather than a manual fit/transform of X:
#  * GridSearchCV then re-fits them on the training part of each fold, so the
#    validation part never influences the preprocessing (no leakage);
#  * X, scaler and pca are not overwritten, so the cells that follow still see
#    the raw features and re-running this cell does not transform X twice.
param_grid = [
    {
        'pca__n_components': [30],
        'model__kernel': ['linear', 'rbf', 'sigmoid'],
        'model__gamma': ['auto'],
        'model__C': [1, 10, 100, 1000],
    }
]
model = svm.SVC()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
# Search over the pipeline (not the bare model) and reuse the shared splitter.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##SGD CLASSIFIER
# Grid-search SGDClassifier inside a scaler -> PCA -> model pipeline, scored
# by accuracy with the shared stratified CV splitter.
print("SGD CLASSIFIER")
param_grid = [
    {
        'pca__n_components': [30],
        # 'log_loss' is the current name of the logistic loss; the old 'log'
        # alias is rejected by recent scikit-learn releases.
        'model__loss': ['hinge', 'log_loss', 'modified_huber', 'squared_hinge', 'perceptron'],
        # "No penalty" is spelled None; the string 'none' is not accepted by
        # recent scikit-learn releases.
        'model__penalty': [None, 'l2', 'l1'],
        # Each alpha appears once; a repeated value only re-fits identical
        # candidates.
        'model__alpha': [1e-3, 5e-3, 1e-4, 1e-2],
        'model__fit_intercept': [True, False],
        'model__max_iter': [2000],
    }
]
model = SGDClassifier()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##RANDOM FOREST
# Grid-search RandomForestClassifier inside a scaler -> PCA -> model pipeline,
# scored by accuracy with the shared stratified CV splitter.
print("RANDOM FOREST")
param_grid = [
    {
        'pca__n_components': [30],
        'model__n_estimators': [200, 500, 700],
        # 'auto' is intentionally absent: for classifiers it was a synonym of
        # 'sqrt' (so it duplicated that candidate) and recent scikit-learn
        # releases reject it.
        'model__max_features': ['sqrt', 'log2'],
        'model__max_depth': [4, 5, 6, 7, 8],
        'model__criterion': ['gini', 'entropy'],
    }
]
model = RandomForestClassifier()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##ADA BOOST
# Grid-search AdaBoost over decision-tree weak learners inside a
# scaler -> PCA -> model pipeline, scored by accuracy with the shared CV.
print("ADA BOOST")

DTC = DecisionTreeClassifier()

# The weak learner is passed as `estimator`; the former `base_estimator`
# keyword was renamed and is no longer accepted by recent scikit-learn
# releases. Nested grid keys follow the same name.
model = AdaBoostClassifier(estimator=DTC)
param_grid = [
    {
        'pca__n_components': [30],
        'model__n_estimators': [5, 10, 50, 100, 200, 500, 700],
        'model__learning_rate': [1, 4, 5, 6, 7, 8],
        'model__estimator__criterion': ["gini", "entropy"],
        'model__estimator__splitter': ["best", "random"],
        'model__estimator__max_depth': [1, 10, 100],
    }
]
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)


In [None]:
##GRADIENT BOOST
# Grid-search GradientBoostingClassifier inside a scaler -> PCA -> model
# pipeline, scored by accuracy with the shared stratified CV splitter.
print("GRADIENT BOOST")
model = GradientBoostingClassifier()
# NOTE(review): this grid is the full cross product of every list below
# (7 * 12 * 12 * 3 * 2 * 2 * 7 * 3 = 254,016 candidates, times 5 folds).
# Consider trimming it or switching to a randomized search before running.
param_grid = [
    {
        'pca__n_components': [30],
        # 'log_loss' is the current name of the former 'deviance' loss.
        "model__loss": ["log_loss"],
        "model__learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
        "model__min_samples_split": np.linspace(0.1, 0.5, 12),
        "model__min_samples_leaf": np.linspace(0.1, 0.5, 12),
        "model__max_depth": [3, 5, 8],
        "model__max_features": ["log2", "sqrt"],
        # 'mae' is no longer a valid criterion; the supported choices are
        # 'friedman_mse' and 'squared_error'.
        "model__criterion": ["friedman_mse", "squared_error"],
        "model__subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
        "model__n_estimators": [10, 20, 30],
    }
]
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='accuracy')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##XGBOOST
# Search tree depth, number of boosting rounds and learning rate for an
# XGBClassifier that follows scaling and PCA in a pipeline; candidates are
# ranked by accuracy on the shared CV splitter.
print("XBOOST")

model = XGBClassifier()
pipe = Pipeline(
    steps=[
        ("scaler", scaler),
        ("pca", pca),
        ("model", model),
    ]
)

# Depths 2..9 and 60, 100, 140, 180 estimators.
depth_choices = range(2, 10)
estimator_choices = range(60, 220, 40)
param_grid = [
    dict(
        pca__n_components=[30],
        model__max_depth=depth_choices,
        model__n_estimators=estimator_choices,
        model__learning_rate=[0.1, 0.01, 0.05],
    )
]

clf = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=cv)
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
##LGBM CLASSIFIER
# Search leaf count, learning rate, depth limit and number of estimators for
# an LGBMClassifier placed after scaling and PCA; candidates are ranked by
# accuracy on the shared CV splitter.
print("LGBM CLASSIFIER")

model = LGBMClassifier()
steps = [("scaler", scaler), ("pca", pca), ("model", model)]
pipe = Pipeline(steps=steps)

param_grid = [
    dict(
        pca__n_components=[30],
        model__num_leaves=[7, 14, 21, 28, 31, 50],
        model__learning_rate=[0.1, 0.03, 0.003],
        model__max_depth=[-1, 3, 5],
        model__n_estimators=[50, 100, 200, 500],
    )
]

clf = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=cv)
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)

In [None]:
#MULTILAYER PERCEPTRON
# Grid-search MLPClassifier inside a scaler -> PCA -> model pipeline using the
# shared stratified CV splitter.
print("MULTILAYER PERCEPTRON")
model = MLPClassifier()
param_grid = [
    {
        # NOTE(review): the other searches in this notebook use 30 components;
        # confirm that 20 is intentional here.
        'pca__n_components': [20],
        'model__hidden_layer_sizes': [
            (1,), (2,), (3,), (4,), (5,), (6,), (7,), (8,),
            (9,), (10,), (11,), (12,), (13,), (14,), (15,), (16,),
            (50, 50, 50), (50, 100, 50), (100,),
        ],
        'model__activation': ['identity', 'logistic', 'tanh', 'relu'],
        'model__solver': ['lbfgs', 'sgd', 'adam'],
        # Each alpha appears once; a repeated value only re-fits identical
        # candidates.
        # NOTE(review): 0.001 used to be listed twice; if one entry was meant
        # to be 0.0001, add it here.
        'model__alpha': [0.001, 0.05, 0.1],
        'model__learning_rate': ['constant', 'adaptive'],
        'model__max_iter': [1000, 2000],
    }
]
pipe = Pipeline(steps=[("scaler", scaler), ("pca", pca), ("model", model)])
# NOTE(review): this search is scored by recall while every other search in
# this notebook is scored by accuracy, so the results are not directly
# comparable; confirm this is intended.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=cv, scoring='recall')
clf.fit(X, y)
print("Best parameters set found on development set:")
print(clf.best_params_)
