# Importing all the necessary modules

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides numerical RuntimeWarnings (e.g. sqrt of a
# negative value producing NaN) - consider narrowing the filter.
warnings.filterwarnings('ignore')
# Show every column when a DataFrame is displayed. The canonical option key is
# 'display.max_columns'; the previous 'display.max.columns' only resolved
# because pandas treats the key as a regular expression.
pd.set_option('display.max_columns', None)

from scipy.stats import skew
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_curve, average_precision_score,auc

from copy import copy, deepcopy

# Importing the Dataset

In [3]:
# Read the raw dataset; the path is relative to the current working directory.
data_org = pd.read_csv('parkinsons.csv')
# Trailing expression: the notebook displays the (rows, columns) tuple.
data_org.shape

(195, 23)

# Check_Null_Drop Function

In [4]:
def check_null_drop(data):
    """Report which columns contain nulls, then drop every row that has one.

    Prints the initial row count, the label of each column holding at least
    one missing value, and the row count after the drop.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        ``data`` without the null-containing rows, with the index renumbered
        from 0.
    """
    print("Initial no. of rows : ", data.shape[0])

    # One missing-value count per column, in column order.
    missing_per_column = data.isnull().sum()
    print("The Columns having null values : ")
    for label, n_missing in zip(data.columns, missing_per_column):
        if n_missing != 0:
            print(label)

    print("Dropping the rows having null values!")
    cleaned = data.dropna().reset_index(drop=True)
    print("No. of rows after dropping the NaN values : ", cleaned.shape[0])
    return cleaned

In [5]:
# Drop rows containing nulls; check_null_drop returns a new frame, so data_org is left as loaded.
data2 = check_null_drop(data_org)
data2.shape

Initial no. of rows :  195
The Columns having null values : 
Dropping the rows having null values!
No. of rows after dropping the NaN values :  195


(195, 23)

# Data_Distribution Function

In [6]:
def distribution_plot(data):
    """Draw ``data.hist`` on a 25x16 figure and save it as ``dist_plot.png``.

    The image is written to the current working directory and the figure is
    closed afterwards, so nothing is rendered inline. Returns ``None``.
    """
    print("The Data Distribution is plotted and saved as .png file")
    data.hist(figsize=(25, 16))
    # Grab pyplot's current figure so the same object is saved and then closed.
    figure = plt.gcf()
    figure.savefig('dist_plot.png')
    plt.close(figure)

In [7]:
# Writes dist_plot.png as a side effect; the shape is displayed again after plotting.
distribution_plot(data2)
data2.shape

The Data Distribution is plotted and saved as .png file


(195, 23)

# Skewness_Check Function

In [8]:
def skewness_square_removal(data, skewed_col_name):
    """Square the listed columns of ``data`` and return the same frame.

    ``data`` is modified in place: each column named in ``skewed_col_name``
    is overwritten with its element-wise square. Pass a copy if the original
    values must be kept.
    """
    for column in skewed_col_name:
        squared = np.square(data[column])
        data[column] = squared
    return data

def skewness_sqrt_removal(data, skewed_col_name):
    """Replace the listed columns of ``data`` with their square roots.

    ``data`` is modified in place and also returned. Each column named in
    ``skewed_col_name`` is overwritten with ``np.sqrt`` of its values.
    """
    for column in skewed_col_name:
        rooted = np.sqrt(data[column])
        data[column] = rooted
    return data

def skewness_check_removal(data, threshold=1.5):
    """Transform the skewed feature columns with whichever of x**2 / sqrt(x) works better.

    The last column of ``data`` is dropped first. Every remaining column whose
    skewness magnitude exceeds ``threshold`` is treated as skewed. Two
    candidate frames are built, one with the skewed columns squared and one
    with them square-rooted. The candidate that leaves fewer of those columns
    above ``threshold`` is returned; a tie goes to the squared candidate.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified. Its last column is excluded from the
        result (the caller in this notebook later reads the target from it).
    threshold : float, default 1.5
        A column counts as skewed when ``abs(skewness) > threshold``.

    Returns
    -------
    pandas.DataFrame
        A new frame with all columns of ``data`` except the last, with the
        skewed columns transformed.
    """
    features = data.iloc[:, :-1]

    def exceeds(skew_value):
        # NaN compares False, so a column with undefined skewness is never flagged.
        return abs(skew_value) > threshold

    skewed_col_name = [
        name for name in features.columns if exceeds(features[name].skew())
    ]

    # Each candidate works on its own copy because the helpers write in place.
    data_sq_rem = skewness_square_removal(features.copy(), skewed_col_name)
    # NOTE(review): sqrt of a negative value yields NaN; confirm the skewed
    # columns are non-negative before relying on this candidate.
    data_sqrt_rem = skewness_sqrt_removal(features.copy(), skewed_col_name)

    # How many of the originally skewed columns are still beyond the threshold.
    remaining_sq = sum(
        1 for name in skewed_col_name if exceeds(data_sq_rem[name].skew())
    )
    remaining_sqrt = sum(
        1 for name in skewed_col_name if exceeds(data_sqrt_rem[name].skew())
    )

    if remaining_sqrt < remaining_sq:
        print("Skewness of parameters reduced by squareroot each value.")
        return data_sqrt_rem

    # Squaring is chosen both when it is strictly better and on a tie.
    print("Skewness of parameters reduced by squaring each value.")
    return data_sq_rem
    

In [9]:
# The returned frame omits the last column of data2, with the skewed feature columns transformed.
data3_without_output = skewness_check_removal(data2)
# Print both shapes: data2 is unchanged, the result has one column fewer.
print(data3_without_output.shape)
print(data2.shape)

Skewness of parameters reduced by squareroot each value.
(195, 22)
(195, 23)


# Split Into Training and Testing Data Function

In [10]:
def train_test_split_function(data_without_output, data_with_output,
                              test_size=0.15, random_state=None):
    """Split features and target into train/test sets and standardise the features.

    Parameters
    ----------
    data_without_output : pandas.DataFrame
        Feature columns only.
    data_with_output : pandas.DataFrame
        Frame whose last column is used as the target; must have the same
        number of rows as ``data_without_output``.
    test_size : float, default 0.15
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split on every run.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` as NumPy arrays, with both
        feature arrays standardised.
    """
    # Use the argument that was passed in; the previous body read the
    # notebook-level variable `data3_without_output` and ignored this parameter.
    real_x = data_without_output.values
    real_y = data_with_output.iloc[:, -1].values

    x_train, x_test, y_train, y_test = train_test_split(
        real_x, real_y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only and reuse its statistics for
    # the test rows, so nothing about the test set leaks into preprocessing.
    std_scl = StandardScaler()
    x_train = std_scl.fit_transform(x_train)
    x_test = std_scl.transform(x_test)

    return (x_train, x_test, y_train, y_test)

In [11]:
# Build the train/test arrays used by every classifier cell below.
# No random seed is fixed here, so the split (and the scores that follow) changes between runs.
x_train, x_test, y_train, y_test = train_test_split_function(data3_without_output, data2)
# NOTE(review): these print the full arrays; printing x_test.shape / y_test.shape would keep the output short.
print(x_test)
print(y_test)

[[-1.39682903e-01 -3.84529875e-01  5.91420424e-01 -4.91924116e-01
  -3.36957712e-01 -4.58891493e-01 -3.89204788e-01 -4.61002475e-01
  -6.11532097e-01 -6.45836631e-01 -6.05543798e-01 -5.73132839e-01
  -5.06896826e-01 -6.06027794e-01 -6.19709245e-01  2.22006454e-01
  -8.67496164e-01  9.19414671e-01 -1.80762350e-02 -1.23886852e-01
   7.78801095e-01 -9.83629174e-02]
 [ 1.18582492e+00  2.81444285e-01  1.83911459e+00 -1.34555333e+00
  -1.43701976e+00 -1.18237307e+00 -1.24729352e+00 -1.17995872e+00
  -1.37602985e+00 -1.34909103e+00 -1.39816646e+00 -1.25313655e+00
  -1.36825003e+00 -1.39885880e+00 -1.25939941e+00  2.52832133e+00
  -1.52850505e+00  4.31207320e-01 -2.09726768e+00 -7.57183759e-01
  -2.51047222e+00 -1.80238442e+00]
 [-1.05343115e+00 -1.11564550e+00 -3.72857670e-01 -6.28969703e-01
  -3.36957712e-01 -6.46959036e-01 -5.91633839e-01 -6.47218143e-01
  -8.87110683e-01 -8.90572659e-01 -8.95637013e-01 -7.63985926e-01
  -7.56740031e-01 -8.96181864e-01 -7.88473505e-01  1.05654766e+00
  -1.2

# K-Nearest Neighbour Function

In [12]:
def KNN(x_train, x_test, y_train, y_test):
    """Fit a KNeighborsClassifier with default settings and print its test results.

    Prints the accuracy on ``(x_test, y_test)`` as a fraction, followed by the
    confusion matrix of the test predictions. Returns ``None``.
    """
    classifier = KNeighborsClassifier().fit(x_train, y_train)

    test_accuracy = classifier.score(x_test, y_test)
    test_predictions = classifier.predict(x_test)
    matrix = confusion_matrix(y_test, test_predictions)

    print("KNN Model Accuracy : ", test_accuracy)
    print("\nConfusion Matrix : \n", matrix)

In [13]:
# Evaluate the default-parameter KNN model on the held-out split.
KNN(x_train, x_test, y_train, y_test)

KNN Model Accuracy :  0.8666666666666667

Confusion Matrix : 
 [[ 6  2]
 [ 2 20]]


Support vector classifier test.

In [18]:
from sklearn.svm import SVC
def sv_classifier(x_train, x_test, y_train, y_test):
    """Fit a linear-kernel SVC (C=1) and print its test accuracy and confusion matrix.

    Accuracy on ``(x_test, y_test)`` is printed as a percentage. Returns ``None``.
    """
    svc = SVC(kernel='linear', C=1)
    svc.fit(x_train, y_train)

    test_predictions = svc.predict(x_test)
    accuracy_pct = svc.score(x_test, y_test) * 100
    matrix = confusion_matrix(y_test, test_predictions)

    print("Accuracy of the model=", accuracy_pct, "%")
    print("The confusion matrix: \n", matrix)

In [19]:
# Evaluate the linear SVC on the same split used by the other classifier cells.
sv_classifier(x_train, x_test, y_train, y_test)

Accuracy of the model= 86.66666666666667 %
The confusion matrix: 
 [[ 5  3]
 [ 1 21]]


KNN classifier test.

In [20]:
from sklearn.neighbors import KNeighborsClassifier
def knn_classifier(x_train, x_test, y_train, y_test):
    """Fit a 3-nearest-neighbours classifier and print its test results.

    Differs from ``KNN`` in this notebook by fixing ``n_neighbors=3`` and by
    reporting accuracy as a percentage. Returns ``None``.
    """
    neighbours = KNeighborsClassifier(n_neighbors=3)
    neighbours.fit(x_train, y_train)

    test_predictions = neighbours.predict(x_test)
    accuracy_pct = neighbours.score(x_test, y_test) * 100
    matrix = confusion_matrix(y_test, test_predictions)

    print("Accuracy of the model=", accuracy_pct, "%")
    print("The confusion matrix: \n", matrix)

In [21]:
# Evaluate the 3-neighbour KNN model on the held-out split.
knn_classifier(x_train, x_test, y_train, y_test)

Accuracy of the model= 86.66666666666667 %
The confusion matrix: 
 [[ 6  2]
 [ 2 20]]


Decision Tree classifier test.

In [22]:
from sklearn.tree import DecisionTreeClassifier
def dtr_classifier(x_train, x_test, y_train, y_test, random_state=None):
    """Fit an entropy-criterion decision tree (max depth 3) and print its test results.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Held-out features and labels used for the printed metrics.
    random_state : int or None, default None
        Forwarded to ``DecisionTreeClassifier``. Pass an integer to make the
        fitted tree, and therefore the printed metrics, repeatable.

    Prints the test accuracy as a percentage and the confusion matrix.
    Returns ``None``.
    """
    model = DecisionTreeClassifier(
        criterion='entropy', max_depth=3, random_state=random_state
    ).fit(x_train, y_train)

    pred = model.predict(x_test)
    acc = model.score(x_test, y_test)
    confusion_matrix_dtrc = confusion_matrix(y_test, pred)

    print("Decision tree model accuracy=", acc*100)
    print("The Confusion matrix: \n", confusion_matrix_dtrc)

In [24]:
# Evaluate the depth-3 decision tree on the held-out split.
dtr_classifier(x_train, x_test, y_train, y_test)

Decision tree model accuracy= 90.0
The Confusion matrix: 
 [[ 7  1]
 [ 2 20]]


Random forest classifier test.

In [27]:
from sklearn.ensemble import RandomForestClassifier
def rf_classifier(x_train, x_test, y_train, y_test, random_state=None):
    """Fit a RandomForestClassifier with default settings and print its test results.

    Parameters
    ----------
    x_train, y_train : array-like
        Training features and labels.
    x_test, y_test : array-like
        Held-out features and labels used for the printed metrics.
    random_state : int or None, default None
        Forwarded to ``RandomForestClassifier``. The forest is built with
        randomness, so pass an integer to get the same metrics on every run.

    Prints the test accuracy as a percentage and the confusion matrix.
    Returns ``None``.
    """
    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(x_train, y_train)

    acc = rfc.score(x_test, y_test)
    y_predict_rfc = rfc.predict(x_test)
    confusion_matrix_rfc = confusion_matrix(y_test, y_predict_rfc)

    print("Random Forest Classifier : ", acc*100)
    print("\nConfusion Matrix : \n", confusion_matrix_rfc)

In [28]:
# Evaluate the random forest on the held-out split.
rf_classifier(x_train, x_test, y_train, y_test)

Random Forest Classifier :  93.33333333333333

Confusion Matrix : 
 [[ 6  2]
 [ 0 22]]
