In [None]:
# Decision Tree Classification template
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from numpy import mean
from numpy import std
from matplotlib import pyplot

In [None]:
# load dataset
df = pd.read_csv('Data_without-72ROWS.csv')
array = df.to_numpy()
print('Shape:', df.shape)
# set input matrix and target column
# The target is addressed as "the last column" (-1) rather than by a fixed
# position so that it always agrees with the feature slice on the line above
# and with the class-distribution summary, which also reads column -1.
X = array[:, :-1]
y = array[:, -1]
# show the first rows and the summary statistics of the data
print(df.head())
print(df.describe())

In [None]:
# data split train/test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=1234)

In [None]:
# taking care of the missing data
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
imputer = imputer.fit(X[:, 1:6]) #upper bound is not included, but lower bound
X[:, 1:6] = imputer.transform(X[:, 1:6])

In [None]:
# encoding the dependent variable
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

In [None]:
# rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

In [None]:
# summarize the class distribution
target = df.values[:, -1]
counter = Counter(target)
for k,v in counter.items():
    per = v / len(target) * 100
    print('Class=%s, Count=%s, Percentage=%.3f%%' % (k, v, per))

In [None]:
# Re-summarize class distribution
print(X.shape, y.shape,Counter(y))

In [None]:
# Implementing SMOTE for the Imbalanced data in Multi-class classification
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)

In [None]:
print(X.shape, y.shape, Counter(y))

In [None]:
# To balance another minority class
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)


In [None]:
# Re-summarize class distribution
print(X.shape, y.shape,Counter(y))

In [None]:
# Feature Scaling - 
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fitting decision tree classifier to the Training set
# DecisionTreeClassifier comes from the imports cell at the top of the notebook,
# so it is not re-imported here.
# criterion='entropy' selects splits by information gain, max_depth=2 caps the
# tree at two levels of splits, and random_state=0 makes the fit reproducible.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=2)
classifier.fit(X_train, y_train)

In [None]:
# Predicting the Test set results (disabled: the evaluation cell below computes `predictions` itself)
#y_pred = classifier.predict(X_test)

In [None]:
# Making the Confusion Matrix (disabled: the evaluation cell below prints it from `predictions`)
#from sklearn.metrics import confusion_matrix
#cm = confusion_matrix(y_test, y_pred)

In [None]:
# Evaluate the fitted classifier on the held-out test set: mean accuracy first,
# then the confusion matrix and the per-class precision/recall/F1 report.
predictions = classifier.predict(X_test)
score = classifier.score(X_test, y_test)
print(score)

cm = confusion_matrix(y_test, predictions)
print("Confusion Matrix:")
print(cm)

report = classification_report(y_test, predictions)
print("Classification Report")
print(report)