In [1]:
import pandas as pd
import numpy as np

In [2]:
cols = ["variance","skewness","curtosis","entropy","class"]
dataset = pd.read_excel("2_data.xlsm", names=cols)


In [3]:
# cleaning data which have space.
def dataclean(d_set):
    for x in d_set:
        data_without_str = []
        for i in d_set[x]:
            if i != "  ":
              data_without_str.append(i)

        data_without_str_ds = pd.Series(data_without_str)
        mode_class = np.array(data_without_str_ds.value_counts().keys())[0]
        d_set[x] = d_set[x].replace("  ",mode_class)


dataclean(dataset)
from sklearn.preprocessing import StandardScaler
StandardScaler(dataset[:-1])  # standardizing all columns except class label


StandardScaler(copy=      variance   skewness   curtosis   entropy  class
0     4.545900   8.167400  -2.458600 -1.462100      0
1     3.866000  -2.638300   1.924200  0.106450      0
2     3.456600   9.522800  -4.011200 -3.594400      0
3     0.329240  -4.455200   4.571800 -0.988800      0
4     4.368400   9.671800...2100 -2.777100      1
1369 -3.563700  -8.382700  12.393000 -1.282300      1

[1370 rows x 5 columns],
        with_mean=True, with_std=True)

In [4]:
# splitting data in train and test
predictors = dataset[['variance', 'skewness', 'curtosis', 'entropy']]
label = dataset['class']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictors, label, random_state=12, test_size=0.20)


In [5]:
# model development and training
from sklearn import tree
cart_clf_model = tree.DecisionTreeRegressor()
c45_clf_model = tree.DecisionTreeClassifier()
id3_clf_model = tree.DecisionTreeClassifier(criterion="entropy")

cart_clf_model.fit(x_train, y_train)
c45_clf_model.fit(x_train, y_train)
id3_clf_model.fit(x_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [6]:
# accuracy and scoring
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score


In [7]:
# model ranking according tp their performance criteria
def modelRange(model_acc_data):
    model_df = pd.DataFrame(model_acc_data)

    print("================ > Model Ranking < =====================")
    # based on Accuracy Score
    print("------------------ Based on Accuracy Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc', 'Cross Validation Mean Score']].sort_values("Accuracy Score", ascending=False))

    # based on ROC AUC
    print("------------------ Based on ROC AUC Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc' ,'Cross Validation Mean Score']].sort_values("Roc Auc",ascending=False))

    # based on Cross Validation Mean Score
    print("------------------ Based on Cross Validation Mean Score ------------------------")
    print(model_df[['model name', 'Accuracy Score', 'Roc Auc', 'Cross Validation Mean Score']].sort_values("Cross Validation Mean Score",ascending=False))



In [8]:
# each model performance measuring
def getmodelAccuracyReport(model_list, predictors, label):
    print("================ > Model Accuracy < =====================")
    model_matrix = []
    for each_model in model_list:
        model_performance_metrix = {}
        model_name = each_model['name']
        model = each_model['model']

        # prediction
        print("=============================================================")
        model_predict = model.predict(x_test)
        model_performance_metrix['model name'] = model_name
        print("Model Name : ", model_name)
        print("-------------------------------------------------------------")
        acu_score = accuracy_score(y_test, model_predict)
        print("Accuracy Score : ",acu_score)
        model_performance_metrix['Accuracy Score'] = acu_score

        roc_auc = roc_auc_score(y_test, model_predict)
        print("ROC AUC Score : ", roc_auc)
        model_performance_metrix['Roc Auc'] = roc_auc

        con_mat = confusion_matrix(y_test, model_predict)
        print("Confusion Matrix \n",con_mat)
        model_performance_metrix['Confusion Matrix'] = con_mat

        clf_report = classification_report(y_test, model_predict)
        print("Classification Report \n", clf_report)
        model_performance_metrix['Classification Report'] = clf_report


        scores = cross_val_score(model, predictors, label, cv=10)
        mean_score = np.mean(scores)
        print("Cross Validation Score : ", scores, "Mean Score : ", mean_score)
        model_performance_metrix['Cross Validation Mean Score'] = mean_score

        model_matrix.append(model_performance_metrix)

    return model_matrix



In [9]:
# All Trained models
models = [
    {
        "name" : "CART",
        "model" : cart_clf_model
    },
    {
        "name": "C4.5",
        "model": c45_clf_model
    },
    {
        "name": "ID3",
        "model": id3_clf_model
    },

]

In [10]:
# calling model performance
model_acc_data = getmodelAccuracyReport(models, x_train, y_train)

# calling the model ranking
modelRange(model_acc_data)

Model Name :  CART
-------------------------------------------------------------
Accuracy Score :  0.9854545454545455
ROC AUC Score :  0.986
Confusion Matrix 
 [[147   3]
 [  1 124]]
Classification Report 
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       150
           1       0.98      0.99      0.98       125

   micro avg       0.99      0.99      0.99       275
   macro avg       0.98      0.99      0.99       275
weighted avg       0.99      0.99      0.99       275

Cross Validation Score :  [0.92570078 0.92570078 0.85377202 0.92717643 0.88214286 1.
 0.81525424 1.         0.81575389 0.96188811] Mean Score :  0.910738909653696
Model Name :  C4.5
-------------------------------------------------------------
Accuracy Score :  0.9890909090909091
ROC AUC Score :  0.9893333333333334
Confusion Matrix 
 [[148   2]
 [  1 124]]
Classification Report 
               precision    recall  f1-score   support

           0       0.99     