In [None]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from matplotlib import pyplot

In [None]:
# load dataset
df = pd.read_csv('Data_without-columns (copy).csv')
array = df.values
print('Shape:', df.shape)
# set input matrix and target column: every column but the last is a feature
# and the last column is the target. Selecting the target with -1 (instead of
# the hard-coded index 6) keeps y in step with the X slice, so the target
# column can never also be part of X.
X = array[:, :-1]
y = array[:, -1]
# show the first rows and the summary statistics of the data
print(df.head())
print(df.describe())

In [None]:
# hold out 25% of the rows as a test set; the fixed seed makes the split repeatable
# NOTE(review): this cell runs before the imputation and label-encoding cells,
# which work on X and y -- confirm the four splits are meant to carry the values
# as they are at this point.
split_parts = train_test_split(X, y, test_size=0.25, random_state=1234)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# taking care of the missing data: replace NaNs with the mean of their column
# `verbose` is no longer passed: it was already at its default (0), and
# scikit-learn deprecated the argument in 1.1 and removed it in 1.3, where
# passing it raises a TypeError.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# columns 1 through 5 of X (the slice's upper bound, 6, is exclusive)
imputer.fit(X[:, 1:6])
X[:, 1:6] = imputer.transform(X[:, 1:6])

In [None]:
# encode the dependent variable: learn the distinct labels of y, then replace
# each label with its integer code
labelencoder_Y = LabelEncoder().fit(y)
y = labelencoder_Y.transform(y)

In [None]:
# rescale every column of X into the [0, 1] range
scaler = MinMaxScaler(feature_range=(0, 1)).fit(X)
rescaledX = scaler.transform(X)

In [None]:
# summarize the class distribution: count and percentage of each value found
# in the last column of the data frame
target = df.values[:, -1]
counter = Counter(target)
n_rows = len(target)
for label, count in counter.items():
    share = count / n_rows * 100
    print('Class=%s, Count=%s, Percentage=%.3f%%' % (label, count, share))

In [None]:
# Re-summarize class distribution: shapes of X and y, then the per-class counts of y
class_counts = Counter(y)
print(X.shape, y.shape, class_counts)

In [None]:
# Implementing SMOTE for the Imbalanced data in Multi-class classification
# sampling_strategy is passed by keyword: imbalanced-learn samplers accept it as
# a keyword-only argument, so SMOTE("minority") raises a TypeError on current
# releases. "minority" resamples only the least-represented class.
# NOTE(review): the resampled X, y replace the full data set; the train/test
# splits are not touched by this cell -- confirm that is intended.
smote = SMOTE(sampling_strategy="minority")
X, y = smote.fit_resample(X, y)

In [None]:
print(X.shape, y.shape, Counter(y))

In [None]:
# To balance another minority class: run SMOTE once more on the current X, y so
# that the class which is now least represented gets oversampled.
# sampling_strategy must be given by keyword; imbalanced-learn samplers reject
# it as a positional argument on current releases.
smote = SMOTE(sampling_strategy="minority")
X, y = smote.fit_resample(X, y)


In [None]:
# Re-summarize class distribution: report the current shapes and class counts
counts_now = Counter(y)
print(X.shape, y.shape, counts_now)

In [None]:
# Feature Scaling - the scaler is fitted on the training split only and the
# same fitted scaler is then applied to both splits
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# get the models to evaluate, keyed by their max_features setting
def get_models(max_features_values=range(1, 9), random_state=None):
    """Build the random forests to compare, one per ``max_features`` setting.

    Args:
        max_features_values: iterable of integers; each value becomes the
            ``max_features`` of one ``RandomForestClassifier``. Defaults to
            1 through 8, the settings this notebook compares.
        random_state: forwarded unchanged to every classifier. ``None`` (the
            default) leaves the forests unseeded; pass an integer to make the
            comparison repeatable.

    Returns:
        dict mapping ``str(max_features)`` to an unfitted
        ``RandomForestClassifier``, in the order the values were given.
    """
    # NOTE(review): presumably a max_features above the number of feature
    # columns is not meaningful -- confirm 7 and 8 against the data set's width.
    return {
        str(n_features): RandomForestClassifier(
            max_features=n_features, random_state=random_state
        )
        for n_features in max_features_values
    }

In [None]:
# evaluate a given model using cross-validation
from sklearn.model_selection import cross_val_score
def evaluate_model(model):
    """Score ``model`` with repeated stratified k-fold cross-validation.

    The data comes from the notebook-level ``X_train`` / ``y_train``. The
    splitter uses 10 splits, 3 repeats and a fixed ``random_state`` of 1, and
    scoring is accuracy. ``error_score='raise'`` makes a failing fit raise
    instead of being recorded as a score.

    Returns:
        whatever ``cross_val_score`` returns for these settings.
    """
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(
        model,
        X_train,
        y_train,
        scoring='accuracy',
        cv=folds,
        n_jobs=-1,
        error_score='raise',
    )

In [None]:
# get the models to evaluate: a dict of unfitted classifiers keyed by name
models = get_models()

In [None]:
# cross-validate every candidate; `results` and `names` are filled in the same
# order so that position i of one matches position i of the other
results = []
names = []
for name, model in models.items():
    scores = evaluate_model(model)
    names.append(name)
    results.append(scores)
    # name, mean score, and standard deviation of the scores in parentheses
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

In [None]:
# plot model performance for comparison: one box per model, labelled with the
# model name, with the mean marked on each box
boxplot_options = {'labels': names, 'showmeans': True}
pyplot.boxplot(results, **boxplot_options)
# tilt the x tick labels by 45 degrees
pyplot.xticks(rotation=45)
pyplot.show()

In [None]:
# Fit the candidate with the highest mean cross-validated score.
# Previously this cell fitted whatever `model` the evaluation loop left behind,
# i.e. always the last candidate in `models`, regardless of how it scored.
# `names` and `results` are parallel lists, so they can be paired with zip.
best_name = max(
    zip(names, results),
    key=lambda pair: sum(pair[1]) / len(pair[1]),
)[0]
model = models[best_name]
print('Selected model: %s' % best_name)
model.fit(X_train, y_train)

In [None]:
# Predicting the Test set results: y_pred holds model's predictions for X_test
y_pred = model.predict(X_test)

In [None]:
# Making the Confusion Matrix from the true test labels and the predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
# show the matrix; the assignment alone produces no output
print(cm)