# Masterthesis
## Machine learning

This script runs the following machine learning algorithms, either with or without PCA, using the hyperparameters specified for each scenario:

- RandomForest

- LogisticRegression

- KNeighborsClassifier

- SVM

- GradientBoostingClassifier

**Imports and Definitions**
- The necessary libraries are loaded here and important variables are defined

**Imports and settings for this script**
- Import libraries and set variables for this script

**Preparation for Machine Learning**
- Set path and values

**Machine Learning**
- Run the machine learning algorithms either with or without PCA


## Imports and Definitions

In [3]:
# Import sklearn
import sklearn

# Import pandas
import pandas as pd

# Import numpy
import numpy as np

# To calculate amplitude and phase
import math

# Measure runtime of a Jupyter notebook code cell
from timeit import default_timer as timer

# Used to check if file exists
import os

# Used to check if directory exists
import pathlib

# Import Operation System Calls
import SubOperationSystem

# Choose the path separator depending on the operating system
if os.name != 'nt':
    print("OS is Linux")
    Delimiter = '/'

else:
    print("OS is Windows")
    Delimiter = '\\'

# Root directory of the datasets
PathDataset = f'Dataset{Delimiter}'

# Sub directory 'CsiFilesRah' below the dataset root
PathDatasetSub = f'{PathDataset}CsiFilesRah{Delimiter}'

# Directory of the converted files
PathConverted = f'{PathDataset}Converted{Delimiter}'

# Directory of the scenario files
PathScenario = f'{PathDataset}Scenario{Delimiter}'

# Directory 'Result' below the dataset root
PathResult = f'{PathDataset}Result{Delimiter}'

# Directory 'Plot' below the dataset root
PathPlot = f'{PathDataset}Plot{Delimiter}'

# Directory 'FilesConfig' (not located below the dataset root)
PathConfig = f'FilesConfig{Delimiter}'

# Scenario file (file with info about the ten scenarios)
FileScenario = 'FileScenario.csv'

# Mapping file (file with info about original and converted filenames)
FileMapping = 'FileMapping.csv'

OS is Windows


# Imports and settings for this script

In [None]:
# Import Train-Test
from sklearn.model_selection import train_test_split

# Import Classifier RF and GBC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Import Classifier SVC
from sklearn.svm import SVC

# Import Classifier LG
from sklearn.linear_model import LogisticRegression

# Import Classifier KNC
from sklearn.neighbors import KNeighborsClassifier

# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Import PCA
from sklearn.decomposition import PCA

# Import Pipeline
from sklearn.pipeline import Pipeline

# Import the sklearn metrics module
from sklearn import metrics

# Import Metrics - accuracy score, confusion matrix, classification report and confusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay

# Import seaborn
import seaborn as sns

# Render matplotlib figures inline in the notebook output
%matplotlib inline

# import pickle to save learned model
import pickle

# Import set_visualize_provider from interpret
from interpret import set_visualize_provider

# Import InlineProvider from interpret.provider
from interpret.provider import InlineProvider

# Import ExplainableBoostingClassifier
from interpret.glassbox import ExplainableBoostingClassifier

# Import show
from interpret import show

# Import roc_auc_score
from sklearn.metrics import roc_auc_score

# Import average_precision_score, precision_recall_curve, auc
from sklearn.metrics import average_precision_score, precision_recall_curve, auc

# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Preparation for Machine Learning

In [2]:
# Row of the hyperparameter table to use; starts at the first row
CounterHp = 0

# Number of components handed to the PCA step
n_components = 30

# Switch for the run: True = with PCA, False = without PCA
# NOTE: this flag has the same name as the PCA class imported from
# sklearn.decomposition and therefore replaces that name once this cell ran
PCA = True

# Machine learning

In [None]:
# Import the PCA transformer under an alias. The boolean switch `PCA` used below
# has the same name as the class imported from sklearn.decomposition, so calling
# `PCA(...)` would call the boolean and raise a TypeError.
from sklearn.decomposition import PCA as PcaTransformer


def _selectHyperparameters(param_grid):
    """Convert a single-valued parameter grid into keyword arguments.

    Parameters
    ----------
    param_grid : dict
        Maps pipeline parameter names (e.g. 'model__C') to one-element lists.

    Returns
    -------
    dict
        The same names mapped to the single value stored in each list.
    """
    selected = {}
    for name, values in param_grid.items():
        value = values[0]

        # Values read from the dataframe may be numpy scalars; unwrap them
        if hasattr(value, 'item'):
            value = value.item()

        # NOTE(review): an empty CSV cell arrives as NaN; it is passed on as None
        # on the assumption that it means "use None" (e.g. max_depth) - TODO confirm
        if isinstance(value, float) and pd.isna(value):
            value = None

        selected[name] = value
    return selected


# Pca setting
if PCA:
    # Label
    Label = 'PCA'

    # Set file extension
    FileExtension = '_pca'

    # Hyperparameter file
    FileHyperparameter = 'FileHyperparameterWithPca.csv'

else:
    # Label
    Label = ''

    # No file extension without PCA, otherwise the result files would collide
    # with (and be skipped because of) the result files of the PCA run
    FileExtension = ''

    # Hyperparameter file
    FileHyperparameter = 'FileHyperparameterWithoutPca.csv'


# Read scenario mapping file
dfScenario = pd.read_csv(PathConfig + FileScenario, sep=',')

# Read Hyperparameter file
dfHp = pd.read_csv(PathConfig + FileHyperparameter, sep=',')

# Set parameter for confusion matrix (size of figure and font size)
plt.rcParams['figure.figsize'] = (8, 6)
plt.rcParams['font.size'] = 12

# Loop through scenario file
for Scenario in dfScenario['Scenario']:

    # Search the hyperparameter row that belongs to this scenario.
    # If there is none, CounterHp keeps its current value (positional fallback).
    MatchingRows = dfHp.index[(dfHp['Scenario'] == Scenario).to_numpy()]
    if len(MatchingRows) > 0:
        CounterHp = MatchingRows[0]

    print(80 * '=')
    print('Summary')
    print(Scenario)
    print(dfHp.iloc[CounterHp])
    print(80 * '=')

    # Load scenario file (PathScenario already contains the dataset root)
    df = pd.read_csv(PathScenario + Scenario + '_ah.csv')

    # Select data and remove target
    X = df.drop("label", axis=1)

    # Select target
    y = df['label']

    # Split dataset to train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)

    # Define machine learning models with determined hyperparameters
    models = [
        ('RandomForest', RandomForestClassifier(), {
            'model__max_depth': [dfHp.RF__max_depth[CounterHp]],
            'model__min_samples_leaf': [dfHp.RF__min_samples_leaf[CounterHp]],
            'model__min_samples_split': [dfHp.RF__min_samples_split[CounterHp]],
            'model__n_estimators': [dfHp.RF__n_estimators[CounterHp]]
        }),

        # The spelling 'LogisticRegession' is kept because the model name is part
        # of the result file names checked and written below
        ('LogisticRegession', LogisticRegression(), {
            'model__C': [dfHp.LR__C[CounterHp]],
            'model__max_iter': [dfHp.LR__max_iter[CounterHp]],
            'model__penalty': [dfHp.LR__penalty[CounterHp]],
            'model__solver': [dfHp.LR__solver[CounterHp]]
        }),

        ('KNeighborsClassifier', KNeighborsClassifier(), {
            'model__metric': [dfHp.KNN__metric[CounterHp]],
            'model__n_neighbors': [dfHp.KNN__n_neighbors[CounterHp]],
            'model__weights': [dfHp.KNN__weights[CounterHp]]
        }),

        ('SVM', SVC(), {
            'model__C': [dfHp.SVM__C[CounterHp]],
            'model__gamma': [dfHp.SVM__gamma[CounterHp]],
            'model__kernel': [dfHp.SVM__kernel[CounterHp]]
        }),

        ('GradientBoosting', GradientBoostingClassifier(), {
            'model__max_depth': [dfHp.GBC__max_depth[CounterHp]],
            'model__min_samples_leaf': [dfHp.GBC__min_samples_leaf[CounterHp]],
            'model__min_samples_split': [dfHp.GBC__min_samples_split[CounterHp]]
        }),
    ]

    # Print scenario name
    print(Scenario)
    print(80 * '=')

    # Iteration over machine learning models
    for model_name, model, param_grid in models:

        # Common prefix of all result files of this scenario/model combination
        ResultPrefix = PathResult + Scenario + '_' + model_name + FileExtension

        # Check if result file exists then take next model
        if SubOperationSystem.checkIfFileExists(ResultPrefix + '.csv', False):
            continue

        # Print informations
        print(f"Training for {model_name} ...")
        print(60 * '=')

        # Create a pipeline with scaler, optional pca and model
        steps = [('scaler', StandardScaler())]
        if PCA:
            steps.append(('pca', PcaTransformer(n_components=n_components)))
        steps.append(('model', model))
        pipeline = Pipeline(steps)

        # Apply the hyperparameters of this scenario; without this call the
        # models would be trained with their library defaults
        pipeline.set_params(**_selectHyperparameters(param_grid))

        # Train the model (with pipeline)
        pipeline.fit(X_train, y_train)

        # Prediction
        y_pred = pipeline.predict(X_test)

        # Create classification report as dictionary (per label and averages)
        reportLabel = classification_report(y_test, y_pred, output_dict=True)

        # Extract the value stored per label.
        # NOTE(review): the value taken here is the precision of each label although
        # the variable and file are named "accuracy" - confirm which one is intended
        accuracy_per_class = {
            str(int(cls)): scores['precision']
            for cls, scores in reportLabel.items()
            if cls.isdigit()
        }

        # Print accuracy_per_class
        print("accuracy_per_class", accuracy_per_class)

        # Print line
        print(60 * '-')

        # Convert report to pandas format and save it to the acc file
        acc_report = pd.DataFrame([accuracy_per_class]).transpose()
        acc_report.to_csv(ResultPrefix + '.acc', index=True)

        # Create report for classification
        reportClassification = classification_report(y_test, y_pred)

        # Overall accuracy (this value was previously printed as "F1-Score")
        reportAccuracy = accuracy_score(y_test, y_pred)

        # Print reports
        print(f'\nClassification Report:\n{reportClassification}')
        print(f'\nAccuracy: {reportAccuracy}')
        print(60 * '-')

        # Create confusion matrix (rows = actual class, columns = predicted class)
        mcm = metrics.confusion_matrix(y_test, y_pred, normalize='all')

        # Transpose (rows = predicted class) and save confusion matrix
        cm_report = pd.DataFrame(mcm).transpose()
        cm_report.to_csv(ResultPrefix + '.cm', index=True)

        # Title of the plot; the PCA label is only added for runs with PCA
        PlotTitle = 'Konfusionsmatrix - ' + str(Scenario) + ' - ' + str(model_name)
        if Label:
            PlotTitle += ' - ' + Label

        # Plot the transposed matrix so that the rows are the predicted classes,
        # which matches the axis labels and the saved .cm file
        plt.figure()
        sns.heatmap(cm_report, annot=True, cmap="YlGnBu")
        plt.title(PlotTitle, y=1.1)
        plt.ylabel('Prognostizierte Klasse')
        plt.xlabel('Tatsächliche Klasse')
        plt.tight_layout()

        # Save confusion matrix
        plt.savefig(ResultPrefix + '_cm.png', pad_inches=5)

        # Show confusion matrix and release the figure
        plt.show()
        plt.close()

        print(80 * '=')
        print()

        # Convert the report dictionary created above to pandas format
        clsfs_report = pd.DataFrame(reportLabel).transpose()

        # Save report to file (written last, it marks the model as finished)
        clsfs_report.to_csv(ResultPrefix + '.csv', index=True)

    # Increase counter for hyperparameter row
    CounterHp += 1

    