In [None]:
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

import tensorflow as tf
import keras

from sklearn.metrics import confusion_matrix, classification_report


In [None]:
!python --version

Python 3.10.12


In [None]:
# importing the models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# New Models
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Mount Google Drive at /content/drive; the project CSVs read further down live under that path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_split_data(csv_file):
  """Read a CSV and return standardized features plus the 'category' target.

  Args:
    csv_file: Path (or buffer) accepted by ``pd.read_csv``. The file must
      contain a 'category' column, which is used as the target.

  Returns:
    Tuple ``(X, y)`` where ``X`` is the array produced by
    ``StandardScaler.fit_transform`` on every non-target column and ``y`` is
    a copy of the 'category' column.
  """
  frame = pd.read_csv(csv_file)

  # Split the target off first so it is never passed through the scaler.
  target = frame['category'].copy()
  feature_frame = frame.drop(columns=['category']).copy()

  # The scaler is fitted on all rows of the file.
  scaled_features = StandardScaler().fit_transform(feature_frame)
  return scaled_features, target


# X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=123)


In [None]:
# Default locations of the feature table and its labels on the mounted Drive.
FEATURES_CSV = "/content/drive/My Drive/Colab Notebooks/Schizophrenia Classification Project/Features/relevant_timeseries_features.csv"
LABELS_CSV = "/content/drive/My Drive/Colab Notebooks/Schizophrenia Classification Project/Data/labels.csv"


def shuffle_feature_and_label(features_csv=FEATURES_CSV, labels_csv=LABELS_CSV, random_state=123):
  """Load features and labels, join them row by row, and shuffle the rows.

  Args:
    features_csv: Path to the CSV holding one row of features per sample.
    labels_csv: Path to a single-column CSV holding one label per sample, in
      the same row order as ``features_csv``.
    random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle (and
      therefore the unshuffled CV folds built from it) is repeatable. Pass
      ``None`` for a different order on every call.

  Returns:
    DataFrame with every feature column plus a 'labels' column, rows shuffled
    and the index reset to 0..n-1.

  Raises:
    ValueError: If the labels file does not have exactly one column, or if
      the two files have a different number of rows.
  """
  features = pd.read_csv(features_csv)
  labels = pd.read_csv(labels_csv)

  # Fail loudly instead of relying on pandas' frame-to-column assignment rules.
  if labels.shape[1] != 1:
    raise ValueError(
        f"expected exactly one label column in {labels_csv!r}, found {labels.shape[1]}")
  # A length mismatch would otherwise produce NaN or silently dropped labels.
  if len(labels) != len(features):
    raise ValueError(
        f"row count mismatch: {len(features)} feature rows vs {len(labels)} labels")

  # Attach labels positionally: row i of the labels file belongs to row i of the features.
  features['labels'] = labels.iloc[:, 0].to_numpy()

  # Shuffle the dataset and discard the original row order from the index.
  shuffled_df = features.sample(frac=1, random_state=random_state).reset_index(drop=True)
  return shuffled_df

In [None]:
# Load the features together with their labels and shuffle the rows before any
# splitting or cross-validation is done.
df = shuffle_feature_and_label()

In [None]:
# Separate the predictors from the target. The target is dropped by name rather
# than sliced off by position, so X stays label-free even if the column order
# of df ever changes.
X = df.drop(columns=['labels'])
y = df['labels']

In [None]:



# 70/30 train/test split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=123,
)

In [None]:
# X, y = load_split_data("/content/drive/My Drive/Colab Notebooks/Data/model_data.csv")

In [None]:
# Baseline comparison: each candidate runs mostly with library defaults
# (LogisticRegression gets a raised max_iter, SVC uses a linear kernel,
# RandomForest is seeded).
models = [
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

def compare_models_cross_validation(list_of_models=None, features=None, labels=None, cv=5):
  """Print the cross-validation accuracy of each model.

  Args:
    list_of_models: Iterable of unfitted scikit-learn estimators. ``None``
      (the default) is treated as an empty collection.
    features: Feature matrix to evaluate on. Defaults to the notebook-level ``X``.
    labels: Target vector. Defaults to the notebook-level ``y``.
    cv: Cross-validation setting forwarded to ``cross_val_score`` (default 5).

  Returns:
    None. For every model the per-fold scores and the mean accuracy (as a
    percentage rounded to two decimals) are printed.
  """
  # A None sentinel avoids sharing one mutable default list between calls.
  if list_of_models is None:
    list_of_models = []

  # Fall back to the notebook globals so existing one-argument calls keep working.
  if features is None:
    features = X
  if labels is None:
    labels = y

  for model in list_of_models:

    cv_score = cross_val_score(model, features, labels, cv=cv)
    mean_accuracy = round(cv_score.mean() * 100, 2)

    print('Cross Validation accuracies for the',model,'=', cv_score)
    print('Accuracy score of the ',model,'=',mean_accuracy,'%')
    print('---------------------------------------------------------------')

In [None]:
# Cross-validate the baseline candidates collected in `models`.
compare_models_cross_validation(models)

Cross Validation accuracies for the LogisticRegression(max_iter=1000) = [0.6122449  0.57142857 0.6122449  0.63265306 0.59183673]
Acccuracy score of the  LogisticRegression(max_iter=1000) = 60.41 %
---------------------------------------------------------------
Cross Validation accuracies for the SVC(kernel='linear') = [0.63265306 0.6122449  0.6122449  0.63265306 0.53061224]
Acccuracy score of the  SVC(kernel='linear') = 60.41 %
---------------------------------------------------------------
Cross Validation accuracies for the KNeighborsClassifier() = [0.67346939 0.57142857 0.6122449  0.57142857 0.48979592]
Acccuracy score of the  KNeighborsClassifier() = 58.37 %
---------------------------------------------------------------
Cross Validation accuracies for the RandomForestClassifier(random_state=0) = [0.67346939 0.59183673 0.59183673 0.57142857 0.51020408]
Acccuracy score of the  RandomForestClassifier(random_state=0) = 58.78 %
----------------------------------------------------------

In [None]:
# Two additional candidates, both constructed with their default settings.
more_models = [
    GaussianNB(),
    GradientBoostingClassifier(),
]

In [None]:
# Cross-validate the additional candidates collected in `more_models`.
compare_models_cross_validation(more_models)

Cross Validation accuracies for the GaussianNB() = [0.6122449  0.59183673 0.57142857 0.59183673 0.48979592]
Acccuracy score of the  GaussianNB() = 57.14 %
---------------------------------------------------------------
Cross Validation accuracies for the GradientBoostingClassifier() = [0.65306122 0.57142857 0.6122449  0.57142857 0.51020408]
Acccuracy score of the  GradientBoostingClassifier() = 58.37 %
---------------------------------------------------------------


In [None]:
# Candidates to tune with GridSearchCV. Their order matters: ModelSelection
# pairs each model with a hyperparameter grid by position.
models_list = [
    LogisticRegression(max_iter=10000),
    SVC(),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=0),
]

In [None]:
# Hyperparameter grids to search, one entry per tuned model, listed in the
# order: logistic regression, SVC, k-nearest neighbours, random forest.
model_hyperparameters = dict(
    log_reg_hyperparameters=dict(C=[1, 5, 10, 20]),
    svc_hyperparameters=dict(
        kernel=['linear', 'poly', 'rbf', 'sigmoid'],
        C=[1, 5, 10, 20],
    ),
    KNN_hyperparameters=dict(n_neighbors=[3, 5, 10]),
    random_forest_hyperparameters=dict(n_estimators=[10, 20, 50, 100]),
)

In [None]:
# Grid names in insertion order; ModelSelection indexes into this list by position.
model_keys = [*model_hyperparameters]
print(model_keys)

['log_reg_hyperparameters', 'svc_hyperparameters', 'KNN_hyperparameters', 'random_forest_hyperparameters']


In [None]:
# Applying GridSearchCV

def ModelSelection(list_of_models, hyperparameters_dictionary, features=None, labels=None, cv=5):
  """Grid-search every model over its hyperparameter grid and tabulate the best runs.

  Models and grids are paired by position: the i-th model is tuned with the
  i-th value of ``hyperparameters_dictionary`` (dictionary insertion order).

  Args:
    list_of_models: Iterable of unfitted scikit-learn estimators.
    hyperparameters_dictionary: Mapping whose values are the parameter grids
      passed to ``GridSearchCV``, in the same order as ``list_of_models``.
    features: Feature matrix to fit on. Defaults to the notebook-level ``X``.
    labels: Target vector. Defaults to the notebook-level ``y``.
    cv: Cross-validation setting forwarded to ``GridSearchCV`` (default 5).

  Returns:
    DataFrame with one row per model and the columns 'model used',
    'highest score' and 'best hyperparameters'.

  Raises:
    ValueError: If there are more models than hyperparameter grids.
  """
  # Fall back to the notebook globals so existing two-argument calls keep working.
  if features is None:
    features = X
  if labels is None:
    labels = y

  # Take the grids from the dictionary that was actually passed in, instead of
  # going through the notebook-level `model_keys` list and a manual counter.
  candidate_models = list(list_of_models)
  param_grids = list(hyperparameters_dictionary.values())
  if len(candidate_models) > len(param_grids):
    raise ValueError(
        f"got {len(candidate_models)} models but only {len(param_grids)} hyperparameter grids")

  result = []

  for model, params in zip(candidate_models, param_grids):

    print(model)
    print(params)
    print('---------------------------------')

    classifier = GridSearchCV(model, params, cv=cv)

    # fitting the data to classifier
    classifier.fit(features, labels)

    result.append({
        'model used' : model,
        'highest score' : classifier.best_score_,
        'best hyperparameters' : classifier.best_params_
    })

  result_dataframe = pd.DataFrame(result, columns = ['model used','highest score','best hyperparameters'])

  return result_dataframe

In [None]:
# Tune every model in `models_list` over its grid; the returned summary table is the cell output.
# NOTE(review): the saved output below stops after the SVC grid is printed and shows
# no summary table — confirm this cell ran to completion before relying on its result.
ModelSelection(models_list, model_hyperparameters)

LogisticRegression(max_iter=10000)
{'C': [1, 5, 10, 20]}
---------------------------------
SVC()
{'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'C': [1, 5, 10, 20]}
---------------------------------


### The best of these classifiers is the Support Vector Classifier, with 68% accuracy


