In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_excel (r'data.xlsx')
print (df.columns)
df.head()


Index(['ID', 'Gender', 'Nationality', 'Major', 'Level', 'IE1', 'SMSK3', 'RAS1',
       'RAS2', 'SMSK1', 'SMSK4', 'IE2', 'TL1', 'RAS3', 'IE3', 'RAS4', 'RAS5',
       'IE4', 'SMSK2', 'TL2', 'TL3', 'PSD1', 'PSD2', 'PSD3', 'IE5', 'PSD4',
       'PSD5', 'IM1', 'IM2', 'IM3', 'IM4', 'IM5', 'IM6', 'W1', 'W2', 'W3',
       'Employed', 'Score', 'Class'],
      dtype='object')


Unnamed: 0,ID,Gender,Nationality,Major,Level,IE1,SMSK3,RAS1,RAS2,SMSK1,...,IM3,IM4,IM5,IM6,W1,W2,W3,Employed,Score,Class
0,1,0,1,0,2,4,3,4,4,2,...,4,4,4,4,4.0,4.0,4.0,1.0,3.2,1
1,2,0,0,0,2,4,4,4,4,3,...,4,3,4,4,3.0,4.0,3.0,0.0,3.82,1
2,3,0,1,0,1,3,2,3,4,3,...,3,4,4,3,4.0,4.0,2.0,1.0,3.75,1
3,4,1,1,1,3,4,4,4,4,4,...,4,4,4,4,3.0,4.0,4.0,1.0,3.7,1
4,5,0,0,0,2,4,3,3,4,3,...,4,3,3,3,4.0,4.0,2.0,1.0,3.82,0


In [2]:
print("dimension of diabetes data: {}".format(df.shape))
print(df.groupby('Class').size())


dimension of diabetes data: (260, 39)
Class
0    133
1    127
dtype: int64


In [3]:
def preProcessing(df):
    df = df.dropna(axis = 1)
    df.head()
    return df
df = preProcessing(df)
df.head()

Unnamed: 0,ID,Gender,Nationality,Major,Level,IE1,SMSK3,RAS1,RAS2,SMSK1,...,PSD4,PSD5,IM1,IM2,IM3,IM4,IM5,IM6,Score,Class
0,1,0,1,0,2,4,3,4,4,2,...,3,3,4,4,4,4,4,4,3.2,1
1,2,0,0,0,2,4,4,4,4,3,...,4,3,3,4,4,3,4,4,3.82,1
2,3,0,1,0,1,3,2,3,4,3,...,3,3,4,4,3,4,4,3,3.75,1
3,4,1,1,1,3,4,4,4,4,4,...,4,4,4,4,4,4,4,4,3.7,1
4,5,0,0,0,2,4,3,3,4,3,...,3,4,3,3,4,3,3,3,3.82,0


In [4]:
from sklearn.model_selection import train_test_split

X = df.loc[:, df.columns != 'Class']
y = df['Class']


X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y,
    test_size=0.33,
    stratify=df['Class'], # data is split in a stratified fashion, using this as the class labels.
    random_state=66)#Controls the shuffling applied to the data before applying the split

In [5]:
from sklearn.tree import DecisionTreeClassifier
#clf = svm.SVC(kernel="linear") fy course 3
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

Accuracy on training set: 1.000
Accuracy on test set: 0.942


In [6]:
"""
The accuracy on the training set is 100%, while the test set accuracy is much worse. 
This is an indicative that the tree is overfitting and not generalizing well to new data. 
Therefore, we need to apply pre-pruning to the tree.

We set max_depth=3, limiting the depth of the tree decreases overfitting. 
This leads to a lower accuracy on the training set, but an improvement on the test set.

"""
from sklearn.metrics import classification_report
#clf = svm.SVC(kernel="linear") fy course 3
tree = DecisionTreeClassifier(max_depth=3, random_state=0)
tree.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

y_true, y_pred = y_test, tree.predict(X_test)
print(classification_report(y_true, y_pred))
print()

Accuracy on training set: 0.954
Accuracy on test set: 0.895
              precision    recall  f1-score   support

           0       0.93      0.86      0.89        44
           1       0.87      0.93      0.90        42

    accuracy                           0.90        86
   macro avg       0.90      0.90      0.90        86
weighted avg       0.90      0.90      0.90        86




In [7]:
"""
#Feature Importance in Decision Trees
Feature importance rates how important each feature is for the decision a tree makes. 
It is a number between 0 and 1 for each feature, where 0 means “not used at all” and 1 means “
perfectly predicts the target”. 
The feature importances always sum to 1:
"""
print("Feature importances:\n{}".format(tree.feature_importances_))

Feature importances:
[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.133403   0.
 0.         0.73878644 0.12781056 0.        ]
