In [1]:
import json # will be needed for saving preprocessing details
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE 
from collections import Counter
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm
import joblib # for saving algorithm and preprocessing objects
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from numpy import mean
from numpy import std
from matplotlib import pyplot

In [2]:
# load dataset
df = pd.read_csv('Data.csv')
array = df.values
X = array[:, :-1]
y = array[:, 8]

In [3]:
# taking care of the missing data
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean', verbose = 0)
imputer = imputer.fit(X[:, 1:]) #upper bound is not included, but lower bound
X[:, 1:8] = imputer.transform(X[:, 1:8])

In [4]:
# rescale data (between 0 and 1)
scaler = MinMaxScaler(feature_range=(0,1))
X = scaler.fit_transform(X)

In [5]:
# data split train/test
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=1234)

In [6]:
# encoding the dependent variable
labelencoder_Y = LabelEncoder()
y = labelencoder_Y.fit_transform(y)

In [7]:
# Implementing SMOTE for the Imbalanced data in Multi-class classification
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)



In [8]:
# To balance another minority class
smote=SMOTE("minority")
X,y=smote.fit_resample(X,y)




In [9]:
# Feature Scaling - 
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [10]:
# get a list of models to evaluate
def get_models():
    models = dict()
    models['1'] = RandomForestClassifier(max_features=1)
    models['2'] = RandomForestClassifier(max_features=2)
    models['3'] = RandomForestClassifier(max_features=3)
    models['4'] = RandomForestClassifier(max_features=4)
    models['5'] = RandomForestClassifier(max_features=5)
    models['6'] = RandomForestClassifier(max_features=6)
    models['7'] = RandomForestClassifier(max_features=7)
    models['8'] = RandomForestClassifier(max_features=8)
    return models  

In [11]:
# evaluate a give model using cross-validation
from sklearn.model_selection import cross_val_score
def evaluate_model(model):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    return scores

In [12]:
# get the models to evaluate
models = get_models()

In [13]:
# evaluate the models and store results
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model)
    results.append(scores)
    names.append(name)
    print('%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

1 0.474 (0.121)
2 0.510 (0.114)
3 0.490 (0.118)
4 0.488 (0.117)
5 0.523 (0.126)
6 0.502 (0.121)
7 0.486 (0.120)
8 0.488 (0.142)


In [14]:
model.fit(X_train, y_train)

RandomForestClassifier(max_features=8)

In [15]:
# Predicting the Test set results
predictions = model.predict(X_test)

In [16]:
# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)

In [17]:
# Evaluate Predictions
print(accuracy_score(y_test, predictions))

0.3409090909090909


In [18]:
# Pickle Model
pd.to_pickle(model, r'.new_randomforest.pickle')

In [19]:
# Unpickle the model
model = pd.read_pickle(r'.new_randomforest.pickle')

In [20]:
#Take inputs from the user
Floor_Num = float(input("Enter Number of Floors: "))
Total_Floor_Area = float(input("Enter Total Floor Area: "))
Column_Area = float(input("Enter Column Area: "))
Concrete_Wall_AreaNS = float(input("Enter Concrete Wall AreaNS: "))
Concrete_Wall_AreaEW = float(input("Enter Concrete_Wall_AreaEW: "))
Masonry_Wall_AreaNS = float(input("Enter Masonry_Wall_AreaNS: "))
Masonry_Wall_AreaEW = float(input("Enter Masonry_Wall_AreaEW: "))
Captive_Columns = bool(input("Enter Captive_Columns: "))

In [21]:
results = model.predict([[Floor_Num, Total_Floor_Area, Column_Area, Concrete_Wall_AreaNS, Concrete_Wall_AreaEW, Masonry_Wall_AreaNS, Masonry_Wall_AreaEW, Captive_Columns]])
print(results)

[1.]
