In [1]:
import numpy as np
import pandas as pd

# Parameters Vector

Load, merge, normalization vector of parameters.

In [2]:
# Loading each csv file
df1 = pd.read_csv('malehealthy.csv')
df2 = pd.read_csv('malepathology.csv')
df3 = pd.read_csv('femalehealthy.csv')
df4 = pd.read_csv('femalepathology.csv')

Concatenating all four DataFrames into one.

In [3]:
allpatients = pd.concat([df1,df2,df3,df4])
allpatients.head()

Unnamed: 0,STD,MEAN,MAX,MIN,RMS,ENERGY,POWER,Status
0,6289.628284,10.762239,29241.0,-29320.0,6289.637492,3142491000000.0,19779650.0,healthy
1,5107.064228,-8.317314,22856.0,-29453.0,5107.071001,2332659000000.0,13041010.0,healthy
2,6599.732595,26.978027,22061.0,-23299.0,6599.787734,1791987000000.0,21778330.0,healthy
3,3353.416152,-16.966091,9884.0,-13521.0,3353.45907,823128100000.0,5622805.0,healthy
4,4251.474562,84.546723,13997.0,-19188.0,4252.315146,1523442000000.0,9041038.0,healthy


Features has diffrent rows, so they have to be normalized.

In [4]:
features = ['STD','MEAN','MAX','MIN','RMS','ENERGY','POWER']
for i in features:
    allpatients[i] = (allpatients[i] - np.mean(allpatients[i]))/np.std(allpatients[i])
allpatients.head()

Unnamed: 0,STD,MEAN,MAX,MIN,RMS,ENERGY,POWER,Status
0,0.427009,0.166092,2.282855,-1.734309,0.411114,0.642207,0.251356,healthy
1,-0.25227,0.131723,1.098892,-1.754664,-0.270637,0.117862,-0.369856,healthy
2,0.605136,0.195303,0.951476,-0.812862,0.589916,-0.232209,0.435609,healthy
3,-1.259587,0.116143,-1.306491,0.683552,-1.281596,-0.85952,-1.053716,healthy
4,-0.743731,0.299006,-0.543822,-0.18372,-0.763404,-0.406085,-0.7386,healthy


Afterwards data have to be suffled, in order to avoid overfitting

In [5]:
allpatients = allpatients.sample(frac=1).reset_index(drop=True)
allpatients.head()

Unnamed: 0,STD,MEAN,MAX,MIN,RMS,ENERGY,POWER,Status
0,-0.463581,0.157359,-0.684377,0.440832,-0.482717,-0.535412,-0.536817,healthy
1,2.161011,0.151043,1.180852,-0.766798,2.151418,1.154339,2.421687,pathology
2,0.164834,0.133889,-0.539372,0.965602,0.147983,0.377179,-0.003686,pathology
3,-1.150112,0.163762,-0.572378,1.469559,-1.171741,-0.612636,-0.993132,healthy
4,-0.542493,0.150808,-0.615583,1.569034,-0.561919,-0.702437,-0.595969,pathology


We'll need to convert categorical features (Status) to dummy variables, because our alghoritm can't understant what means 'healthy'.

In [6]:
status = pd.get_dummies(allpatients['Status'],drop_first=True)

In [7]:
allpatients.drop(['Status'],axis=1,inplace=True)

In [8]:
allpatients['Pathology'] = status
allpatients.head()

Unnamed: 0,STD,MEAN,MAX,MIN,RMS,ENERGY,POWER,Pathology
0,-0.463581,0.157359,-0.684377,0.440832,-0.482717,-0.535412,-0.536817,0
1,2.161011,0.151043,1.180852,-0.766798,2.151418,1.154339,2.421687,1
2,0.164834,0.133889,-0.539372,0.965602,0.147983,0.377179,-0.003686,1
3,-1.150112,0.163762,-0.572378,1.469559,-1.171741,-0.612636,-0.993132,0
4,-0.542493,0.150808,-0.615583,1.569034,-0.561919,-0.702437,-0.595969,1


# Detection alghoritm

Building logistic regression algorithm

# Train test split

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(allpatients.drop(['Pathology'],axis=1), 
                                                    allpatients['Pathology'], 
                                                    test_size=0.30, 
                                                    random_state=42)

# Training and predicting

In [11]:
from sklearn.linear_model import LogisticRegression

In [44]:
logModel = LogisticRegression(max_iter=1000, C = 0.00001)
logModel.fit(X_train, y_train)



LogisticRegression(C=1e-05, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [45]:
prediction = logModel.predict(X_test)

# Evaluation
Check precission, recall, f1-score using classification report

In [46]:
from sklearn.metrics import classification_report

In [47]:
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.50      0.31      0.38       212
           1       0.70      0.84      0.76       401

   micro avg       0.65      0.65      0.65       613
   macro avg       0.60      0.57      0.57       613
weighted avg       0.63      0.65      0.63       613

