In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns

 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.preprocessing import PowerTransformer
 


import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation and data-conversion warnings from the libraries used below.
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

*Constants*

In [None]:
# Seed passed as `random_state` to train_test_split so the split is reproducible.
state_random = 7

In [None]:
# 2. Import data 2C_weka.csv for 2 Class Classification.

# Extra tokens that should be read as missing values (NaN) when parsing the CSV.
missing_value_formats = [
    "n.a.",
    "?",
    "NA",
    "n/a",
    "na",
    "--",
    " ",
    "  ",
]
TwoC_weka_data = pd.read_csv(
    '/kaggle/input/column-2c-wekacsv/column_2C_weka.csv',
    na_values=missing_value_formats,
)

# Basics Of Data

In [None]:
# Display the loaded DataFrame as the cell's output.
TwoC_weka_data

In [None]:
# Print column names, non-null counts and dtypes.
TwoC_weka_data.info()

*There are 7 columns: 6 are numerical & 1 is categorical.*

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
TwoC_weka_data.describe()

*Except for "degree_spondylolisthesis", all of the columns have a distribution close to normal. "degree_spondylolisthesis" seems to be right-tailed, i.e. positively skewed.*

In [None]:
# (rows, columns) of the dataset.
TwoC_weka_data.shape

*The 2C_weka has 310 rows & 7 columns*

In [None]:
# First five rows of the dataset.
TwoC_weka_data.head(n=5)

In [None]:
# Last five rows of the dataset.
TwoC_weka_data.tail(n=5)

In [None]:
# Checking for skewness
# Restrict to the numeric columns: skewness is undefined for the string-valued
# 'class' column, and pandas >= 2.0 raises a TypeError instead of silently dropping it.
TwoC_weka_data.select_dtypes(include=[np.number]).skew()

*As mentioned above "degree_spondylolisthesis" is positively skewed.*

*Checking for null values and duplicate data*

In [None]:
# Number of missing values in each column.
TwoC_weka_data.isna().sum()

In [None]:
# isnull() is an alias of isna(), so this repeats the per-column missing-value count.
TwoC_weka_data.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
TwoC_weka_data.duplicated().sum()

*There are no null values and there is no duplicate data.*

# Univariate Analysis

*Checking for unique values in target variable "class"*

In [None]:
# Distinct labels present in the target column.
print(TwoC_weka_data.loc[:, 'class'].unique())

*Count of each class*

In [None]:
# Method 1: frequency table of the target via crosstab
pd.crosstab(index=TwoC_weka_data['class'], columns='Count')

In [None]:
# Method 2: per-label record counts via value_counts
print(TwoC_weka_data.loc[:, 'class'].value_counts())

In [None]:
# Method 3: bar chart of the number of records per class
sns.countplot(data=TwoC_weka_data, x='class')
plt.show()

*Records for Abnormal class are more than Normal class*

*Identifying Type Of Features*

*Numerical Features & Categorical Features*

In [None]:
# Keep only the numeric columns; the string-valued target column is excluded.
numerical_features = TwoC_weka_data.select_dtypes(include='number')
print(numerical_features.columns)

*Now we want to segregate discrete variables from continuous variables. So, we count the number of unique values in each feature. If count of unique values is less than 25 then we consider it as discrete variable otherwise it is a continuous variable*

In [None]:
# Empty containers for the feature names, filled by the classification loop.
continuous_numerical_features, discrete_numerical_features = [], []

In [None]:
# Split the numeric features into continuous vs. discrete by their number of
# distinct values. Both lists are rebuilt from scratch here so that re-running
# this cell never appends duplicate names.
UNIQUE_VALUE_THRESHOLD = 25  # more distinct values than this => continuous

continuous_numerical_features = []
discrete_numerical_features = []
for feature in numerical_features.columns:
    n_distinct = len(TwoC_weka_data[feature].unique())
    if n_distinct > UNIQUE_VALUE_THRESHOLD:
        continuous_numerical_features.append(feature)
    else:
        discrete_numerical_features.append(feature)

In [None]:
# Features with more than 25 distinct values.
print(continuous_numerical_features)

In [None]:
# Features with 25 or fewer distinct values.
print(discrete_numerical_features)

*All the independent features are continuous*

In [None]:
# One histogram with a KDE overlay per numeric feature.
for feature, values in numerical_features.items():
    sns.displot(values, kde=True)
    plt.show()

*Features "pelvic_tilt" & "pelvic_radius" are very close to normal. Features "pelvic_incidence", "lumbar_lordosis_angle" & "sacral_slope" have some kind of uniform distribution, but still they look a bit positively skewed. Feature degree_spondylolisthesis is highly positively skewed.*

*Now looking for IQR & Outliers*

In [None]:
# One box plot per numeric feature to inspect the IQR and outliers.
for feature in numerical_features.columns:
    # Pass the values by keyword: positional data arguments are deprecated in
    # seaborn and are interpreted differently across versions.
    sns.boxplot(x=TwoC_weka_data[feature])
    plt.show()

*There are outliers in all of the features.*

*Plotting Barplot. Showing the numbers*

In [None]:
# Per-class bar plot for every numeric feature (seaborn's default estimator is the mean).
for feature in numerical_features.columns:
    sns.barplot(data=TwoC_weka_data, x='class', y=feature)
    plt.show()

*The bar plots show the mean value of each feature per class (not a count). The mean for class "Abnormal" is higher than for "Normal" for almost all of the features.*

# Bivariate Analysis

In [None]:
# Correlation heatmap of the numeric features.
# The string-valued 'class' column is excluded explicitly: pandas >= 2.0 raises
# on non-numeric columns in corr() instead of silently dropping them.
plt.figure(figsize=(5,5))
sns.heatmap(TwoC_weka_data.select_dtypes(include=[np.number]).corr(), annot=True)
plt.show()

*It looks like there is some multicollinearity here. For example: Feature "pelvic_incidence" seems to be correlated with all the other features except "pelvic_radius".*

*Plotting swarmplot*

In [None]:
# Swarm plot of every numeric feature, one swarm per class.
for feature in numerical_features.columns:
    sns.swarmplot(x=TwoC_weka_data['class'], y=feature, data=TwoC_weka_data)
    plt.show()

*The swarmplot shows the distribution of each feature for both classes.*

*For pelvic_incidence, majority of abnormal class points clustered between 40 & 95*

*For pelvic_tilt numeric, majority of abnormal class points clustered between 10 & 25*

*For lumbar_lordosis_angle, majority of abnormal class points clustered between 35 & 80*

*For sacral_slope, majority of abnormal class points clustered between 40 & 60*

*For pelvic_radius, majority of abnormal class points clustered between 100 & 130*

*Plotting pairplot*

In [None]:
# Pairwise scatter plots of all features, coloured by class.
# `size` was renamed to `height` in seaborn 0.9; `height` sets each facet's height in inches.
sns.pairplot(TwoC_weka_data, height=3, hue='class')
plt.show()

*There seems to be good separation between normal and abnormal for all the features except "degree_spondylolisthesis" as it is highly skewed.*

*Good separation indicates that the particular feature could be a good indicator.*

*As mentioned earlier, the independent features are related to each other.*

*For example: Feature 'pelvic_incidence' seems linearly related to 'pelvic_tilt_numeric' & 'lumbar_lordosis_angle' & 'sacral_slope'*

# Preprocessing Dataset

## Splitting Dataset

*First splitting the data into train and test to avoid data leak*

In [None]:
# Feature matrix: the first six columns of the dataset.
X = TwoC_weka_data.iloc[:, :6]
print(X.shape)
X.head()

In [None]:
# Names of the columns selected as features.
X.columns

In [None]:
# Target vector: the 'class' label as a 1-D Series.
# scikit-learn estimators expect y of shape (n_samples,); the previous
# single-column DataFrame (n_samples, 1) triggers a DataConversionWarning on fit.
y = TwoC_weka_data['class']
print(y.shape)
y.head()

In [None]:
# Hold out 30% of the rows for testing; the seed keeps the split reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=state_random,
)

In [None]:
# Shapes of the train/test features and targets, one per line.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

## Transforming data 

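*The features are positively skewed, so a skew-reducing transformation is applied. The usual rule of thumb is a log/root transform for positive skew and a power transform for negative skew; here scikit-learn's `PowerTransformer` with the Yeo-Johnson method is used, which estimates a suitable exponent for each feature from the training data.*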

In [None]:
# Fit the Yeo-Johnson transform on the training features only (no test-set leakage),
# then apply the fitted transform to both splits.
# NOTE(review): standardize=True already yields zero-mean, unit-variance output,
# so the StandardScaler step that follows looks redundant — confirm.
power = PowerTransformer(method='yeo-johnson', standardize=True).fit(X_train)
X_train_transformed, X_test_transformed = (
    power.transform(split) for split in (X_train, X_test)
)

## Scaling data 

In [None]:
# Learn the scaling statistics from the transformed training data only,
# then scale both splits with them.
std_scaler = StandardScaler().fit(X_train_transformed)
X_train_scaled_transformed, X_test_scaled_transformed = (
    std_scaler.transform(split)
    for split in (X_train_transformed, X_test_transformed)
)
    

*Converting train and test scaled data into a dataframe to visualize*

In [None]:
# Wrap the scaled arrays in DataFrames, restoring the original feature names.
X_train_scaled_transformed_dataframe = pd.DataFrame(
    data=X_train_scaled_transformed,
    columns=X.columns,
)
X_test_scaled_transformed_dataframe = pd.DataFrame(
    data=X_test_scaled_transformed,
    columns=X.columns,
)

In [None]:
# First ten rows of the transformed and scaled training features.
X_train_scaled_transformed_dataframe.head(n=10)

In [None]:
# First ten rows of the transformed and scaled test features.
X_test_scaled_transformed_dataframe.head(n=10)

In [None]:
# Count NaN values per column in the transformed and scaled training features.
X_train_scaled_transformed_dataframe.isna().sum()

In [None]:
# Count NaN values per column in the transformed and scaled test features.
X_test_scaled_transformed_dataframe.isna().sum()

In [None]:
# Distribution of every training feature after the power transform and scaling.
for feature, values in X_train_scaled_transformed_dataframe.items():
    sns.displot(values, kde=True)
    plt.show()

In [None]:
# Distribution of every test feature after the power transform and scaling.
for feature, values in X_test_scaled_transformed_dataframe.items():
    sns.displot(values, kde=True)
    plt.show()

# Modelling

## Logistic Regression

In [None]:
# Fit a default logistic-regression classifier on the transformed, scaled training set.
logistic_regression = LogisticRegression().fit(X_train_scaled_transformed, y_train)
LR_y_predict = logistic_regression.predict(X_test_scaled_transformed)

# Test-set diagnostics: confusion matrix and per-class precision/recall/F1.
conf_matrix = metrics.confusion_matrix(y_test, LR_y_predict)
class_report = metrics.classification_report(y_test, LR_y_predict)

# Accuracy on each split, rounded to two decimals, to compare train vs. test fit.
train_score = np.round(logistic_regression.score(X_train_scaled_transformed, y_train), 2)
test_score = np.round(logistic_regression.score(X_test_scaled_transformed, y_test), 2)

print(conf_matrix, class_report, train_score, test_score, sep='\n')

## Gaussian Naive Bayes

In [None]:
# Fit a Gaussian Naive Bayes classifier on the transformed, scaled training set.
gnb = GaussianNB().fit(X_train_scaled_transformed, y_train)
GNB_y_pred = gnb.predict(X_test_scaled_transformed)

# Test-set diagnostics: confusion matrix and per-class precision/recall/F1.
conf_matrix = metrics.confusion_matrix(y_test, GNB_y_pred)
class_report = metrics.classification_report(y_test, GNB_y_pred)

# Accuracy on each split, rounded to two decimals, to compare train vs. test fit.
train_score = np.round(gnb.score(X_train_scaled_transformed, y_train), 2)
test_score = np.round(gnb.score(X_test_scaled_transformed, y_test), 2)

print(conf_matrix, class_report, train_score, test_score, sep='\n')

## KNN Classifier

In [None]:
# k-nearest-neighbours classifier: 5 neighbours, Minkowski metric with p=2
# (i.e. Euclidean distance), fitted on the transformed, scaled training set.
KNN = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    algorithm='auto',
    p=2,
).fit(X_train_scaled_transformed, y_train)
KNN_y_pred = KNN.predict(X_test_scaled_transformed)

# Test-set diagnostics: confusion matrix and per-class precision/recall/F1.
conf_matrix = metrics.confusion_matrix(y_test, KNN_y_pred)
class_report = metrics.classification_report(y_test, KNN_y_pred)

# Accuracy on each split, rounded to two decimals, to compare train vs. test fit.
train_score = np.round(KNN.score(X_train_scaled_transformed, y_train), 2)
test_score = np.round(KNN.score(X_test_scaled_transformed, y_test), 2)

print(conf_matrix, class_report, train_score, test_score, sep='\n')

*n_neighbors is a hyper parameter*

# Final Words...

### In disease classification, the aim is to minimize False Negatives while keeping the testing precision and recall as high as possible. 