In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import joblib
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
import os

# Create a directory for saving models and scaler
if not os.path.exists('model_data'):
    os.makedirs('model_data')

# Create a directory for output images if it does not exist
if not os.path.exists('output_images'):
    os.makedirs('output_images')


def load_data(data_path, training_data_name):
    # Load the dataset
    data = pd.read_csv(data_path)

    # Split the data into X (the input features) and y (the target)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Save feature names
    feature_names = X.columns.tolist()
    joblib.dump(feature_names, f'model_data/{training_data_name}/feature_names.pkl')

    return X, y


def scale_and_split_data(X, y, training_data_name, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Save the trained scaler
    joblib.dump(scaler, f'model_data/{training_data_name}/trained_scaler.pkl')

    return X_train_scaled, X_test_scaled, y_train, y_test


def train_logreg(X_train_scaled, y_train, training_data_name):
    logreg = LogisticRegression(random_state=42)
    param_grid = {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga', 'liblinear'],
        'penalty': ['l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # Inverse of regularization strength
    }

    param_grid_l1 = {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # Inverse of regularization strength
    }

    param_grid_elasticnet = {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [i / 10.0 for i in range(11)],  # Increments of 0.1 from 0 to 1
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]  # Inverse of regularization strength
    }

    param_grid = [param_grid, param_grid_l1, param_grid_elasticnet]
    grid = GridSearchCV(logreg, param_grid, cv=5, verbose=True, n_jobs=-1)

    grid.fit(X_train_scaled, y_train)
    # Save the trained model
    model_file = f'model_data/{training_data_name}/trained_model_logreg.pkl'
    if os.path.exists(model_file):
        overwrite = input("Model file already exists. Do you want to overwrite it? (yes/no) ")
        if overwrite.lower() != "yes":
            return
    joblib.dump(grid.best_estimator_, f'model_data/{training_data_name}/trained_model_logreg.pkl')

    return grid.best_estimator_


def load_model_and_scaler(model_file, scaler_file):
    # Load the model and scaler
    model = joblib.load(model_file)
    scaler = joblib.load(scaler_file)

    return model, scaler


def train_svm(X_train_scaled, y_train, training_data_name):
    svm = SVC(random_state=42,probability=True)
    # Define the parameter grid for SVM
    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                  'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

    grid = GridSearchCV(svm, param_grid, refit=True, verbose=3)

    grid.fit(X_train_scaled, y_train)

    model_file = f'model_data/{training_data_name}/trained_model_svm.pkl'
    if os.path.exists(model_file):
        overwrite = input("Model file already exists. Do you want to overwrite it? (yes/no) ")
        if overwrite.lower() != "yes":
            return

    joblib.dump(grid.best_estimator_, f'model_data/{training_data_name}/trained_model_svm.pkl')
    return grid.best_estimator_


def train_rf(X_train_scaled, y_train, training_data_name):
    rf = RandomForestClassifier(random_state=42)
    param_grid = {
        'n_estimators': [10, 50, 100, 200],  # The number of trees in the forest
        'max_features': ['sqrt', 'log2'],  # The number of features to consider when looking for the best split
        'max_depth': [None, 10, 20, 30, 40, 50],  # The maximum depth of the tree
        'min_samples_split': [2, 5, 10],  # The minimum number of samples required to split an internal node
        'min_samples_leaf': [1, 2, 4],  # The minimum number of samples required to be at a leaf node
        'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
    }
    grid = GridSearchCV(rf, param_grid, cv=5, verbose=True, n_jobs=-1)
    grid.fit(X_train_scaled, y_train)
    model_file = f'model_data/{training_data_name}/trained_model_rf.pkl'
    if os.path.exists(model_file):
        overwrite = input("Model file already exists. Do you want to overwrite it? (yes/no) ")
        if overwrite.lower() != "yes":
            return

    joblib.dump(grid.best_estimator_, f'model_data/{training_data_name}/trained_model_rf.pkl')
    return grid.best_estimator_


def train_ensemble(X_train_scaled, y_train, training_data_name):
    # Load the trained models and scalers
    svm = joblib.load(f'model_data/{training_data_name}/trained_model_svm.pkl')
    rf = joblib.load(f'model_data/{training_data_name}/trained_model_rf.pkl')
    log_reg = joblib.load(f'model_data/{training_data_name}/trained_model_logreg.pkl')

    # Create a list of tuples, each tuple containing the string identifier and the model
    models = [('svm', svm), ('rf', rf), ('log_reg', log_reg)]

    # Create the ensemble model
    ensemble = VotingClassifier(estimators=models, voting='soft')

    # Fit the ensemble model on the scaled training data
    ensemble.fit(X_train_scaled, y_train)
    model_file = f'model_data/{training_data_name}/trained_model_ensemble.pkl'
    if os.path.exists(model_file):
        overwrite = input("Model file already exists. Do you want to overwrite it? (yes/no) ")
        if overwrite.lower() != "yes":
            return
    joblib.dump(ensemble, f'model_data/{training_data_name}/trained_model_ensemble.pkl')
    return ensemble


def make_prediction(model, X_test_scaled):
    # Make predictions
    y_pred = model.predict(X_test_scaled)

    return y_pred


In [2]:
# specify the name of your training dataset and load your data
training_data_name = "gpt-j1x"
data_path = "data_matrix_gpt-j1x.csv"  # update this to your actual file path

# Load the data
X, y = load_data(data_path, training_data_name)

# Create sub-folder for each training set
if not os.path.exists(f'model_data/{training_data_name}'):
    os.makedirs(f'model_data/{training_data_name}')

# Scale and split the data
X_train_scaled, X_test_scaled, y_train, y_test = scale_and_split_data(X, y, training_data_name)

# Train logistic regression
print("Training logistic regression model...")
logreg_model = train_logreg(X_train_scaled, y_train, training_data_name)




Training logistic regression model...
Fitting 5 folds for each of 126 candidates, totalling 630 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model file already exists. Do you want to overwrite it? (yes/no) YES


In [3]:
def print_classification_report(y_test, y_pred):
    print('Classification Report: \n', classification_report(y_test, y_pred))

In [4]:
logreg_predictions = make_prediction(logreg_model, X_test_scaled)


In [6]:
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report


In [8]:
print_classification_report(y_test,logreg_predictions)

Classification Report: 
               precision    recall  f1-score   support

           0       0.85      0.87      0.86       178
           1       0.84      0.82      0.83       152

    accuracy                           0.85       330
   macro avg       0.85      0.85      0.85       330
weighted avg       0.85      0.85      0.85       330



In [10]:
# Train SVM
print("Training SVM model...")
svm_model = train_svm(X_train_scaled, y_train, training_data_name)

# Train random forest
print("Training random forest model...")
rf_model = train_rf(X_train_scaled, y_train, training_data_name)

# Train ensemble
print("Training ensemble model...")
ensemble_model = train_ensemble(X_train_scaled, y_train, training_data_name)


Training SVM model...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.826 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.864 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.845 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.864 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.864 total time=   0.0s
[CV 1/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.841 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.864 total time=   0.0s
[CV 4/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.852 total time=   0.1s
[CV 5/5] END .......C=0.1, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.519 total time=   0.2s
[CV 2/5] END ........C=0

[CV 2/5] END .......C=1, gamma=1, kernel=linear;, score=0.864 total time=   0.2s
[CV 3/5] END .......C=1, gamma=1, kernel=linear;, score=0.848 total time=   0.2s
[CV 4/5] END .......C=1, gamma=1, kernel=linear;, score=0.886 total time=   0.2s
[CV 5/5] END .......C=1, gamma=1, kernel=linear;, score=0.864 total time=   0.2s
[CV 1/5] END .........C=1, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 2/5] END .........C=1, gamma=1, kernel=poly;, score=0.841 total time=   0.1s
[CV 3/5] END .........C=1, gamma=1, kernel=poly;, score=0.864 total time=   0.1s
[CV 4/5] END .........C=1, gamma=1, kernel=poly;, score=0.852 total time=   0.1s
[CV 5/5] END .........C=1, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 1/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.652 total time=   0.2s
[CV 2/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.678 total time=   0.2s
[CV 3/5] END ..........C=1, gamma=1, kernel=rbf;, score=0.689 total time=   0.2s
[CV 4/5] END ..........C=1, 

[CV 4/5] END ......C=10, gamma=1, kernel=linear;, score=0.886 total time=   1.7s
[CV 5/5] END ......C=10, gamma=1, kernel=linear;, score=0.864 total time=   1.2s
[CV 1/5] END ........C=10, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 2/5] END ........C=10, gamma=1, kernel=poly;, score=0.841 total time=   0.1s
[CV 3/5] END ........C=10, gamma=1, kernel=poly;, score=0.864 total time=   0.1s
[CV 4/5] END ........C=10, gamma=1, kernel=poly;, score=0.852 total time=   0.1s
[CV 5/5] END ........C=10, gamma=1, kernel=poly;, score=0.833 total time=   0.1s
[CV 1/5] END .........C=10, gamma=1, kernel=rbf;, score=0.659 total time=   0.3s
[CV 2/5] END .........C=10, gamma=1, kernel=rbf;, score=0.701 total time=   0.3s
[CV 3/5] END .........C=10, gamma=1, kernel=rbf;, score=0.716 total time=   0.3s
[CV 4/5] END .........C=10, gamma=1, kernel=rbf;, score=0.693 total time=   0.3s
[CV 5/5] END .........C=10, gamma=1, kernel=rbf;, score=0.720 total time=   0.3s
[CV 1/5] END .....C=10, gamm

[CV 2/5] END .......C=100, gamma=1, kernel=poly;, score=0.841 total time=   0.1s
[CV 3/5] END .......C=100, gamma=1, kernel=poly;, score=0.864 total time=   0.1s
[CV 4/5] END .......C=100, gamma=1, kernel=poly;, score=0.852 total time=   0.1s
[CV 5/5] END .......C=100, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 1/5] END ........C=100, gamma=1, kernel=rbf;, score=0.659 total time=   0.3s
[CV 2/5] END ........C=100, gamma=1, kernel=rbf;, score=0.701 total time=   0.3s
[CV 3/5] END ........C=100, gamma=1, kernel=rbf;, score=0.716 total time=   0.3s
[CV 4/5] END ........C=100, gamma=1, kernel=rbf;, score=0.693 total time=   0.3s
[CV 5/5] END ........C=100, gamma=1, kernel=rbf;, score=0.720 total time=   0.3s
[CV 1/5] END ....C=100, gamma=1, kernel=sigmoid;, score=0.553 total time=   0.1s
[CV 2/5] END ....C=100, gamma=1, kernel=sigmoid;, score=0.576 total time=   0.1s
[CV 3/5] END ....C=100, gamma=1, kernel=sigmoid;, score=0.500 total time=   0.1s
[CV 4/5] END ....C=100, gamm

[CV 4/5] END ......C=1000, gamma=1, kernel=poly;, score=0.852 total time=   0.1s
[CV 5/5] END ......C=1000, gamma=1, kernel=poly;, score=0.833 total time=   0.0s
[CV 1/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.659 total time=   0.2s
[CV 2/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.701 total time=   0.2s
[CV 3/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.716 total time=   0.2s
[CV 4/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.693 total time=   0.2s
[CV 5/5] END .......C=1000, gamma=1, kernel=rbf;, score=0.720 total time=   0.2s
[CV 1/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.553 total time=   0.1s
[CV 2/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.576 total time=   0.1s
[CV 3/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.500 total time=   0.1s
[CV 4/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.534 total time=   0.1s
[CV 5/5] END ...C=1000, gamma=1, kernel=sigmoid;, score=0.610 total time=   0.1s
[CV 1/5] END ..C=1000, gamma

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
svm_predictions = make_prediction(svm_model, X_test_scaled)


In [12]:
print_classification_report(y_test,svm_predictions)

Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.90      0.90       178
           1       0.89      0.88      0.88       152

    accuracy                           0.89       330
   macro avg       0.89      0.89      0.89       330
weighted avg       0.89      0.89      0.89       330



In [13]:
rf_predictions = make_prediction(rf_model, X_test_scaled)


In [14]:
print_classification_report(y_test,rf_predictions)

Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.92      0.91       178
           1       0.90      0.89      0.89       152

    accuracy                           0.90       330
   macro avg       0.90      0.90      0.90       330
weighted avg       0.90      0.90      0.90       330



In [15]:
ensemble_predictions = make_prediction(ensemble_model, X_test_scaled)


In [16]:
print_classification_report(y_test,ensemble_predictions)

Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.94      0.91       178
           1       0.92      0.86      0.89       152

    accuracy                           0.90       330
   macro avg       0.90      0.90      0.90       330
weighted avg       0.90      0.90      0.90       330

