# Import Datasets and Libiraries 

In [1]:
# Importing needed Libiraries
import numpy as np
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
# Importing dataset (train & validation)
train = pd.read_csv('training.csv', sep =";", decimal=',')
validation = pd.read_csv('validation.csv' , sep =";", decimal=',')

# Data Assessing
#### Assess datasets visually and programmatically looking for quality and tidiness issues

<img src="https://media.giphy.com/media/AXorq76Tg3Vte/giphy.gif">

In [3]:
train.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,a,17.92,5.4e-05,u,g,c,v,1.75,f,t,1,t,g,80.0,5,800000.0,t,0,no.
1,b,16.92,3.4e-05,y,p,k,v,0.29,f,f,0,f,s,200.0,0,2000000.0,,0,no.
2,b,31.25,0.000112,u,g,ff,ff,0.0,f,t,1,f,g,96.0,19,960000.0,t,0,no.
3,a,48.17,0.000133,u,g,i,o,0.335,f,f,0,f,g,0.0,120,0.0,,0,no.
4,b,32.33,0.00035,u,g,k,v,0.5,f,f,0,t,g,232.0,0,2320000.0,f,0,no.


In [4]:
validation.head()

Unnamed: 0,variable1,variable2,variable3,variable4,variable5,variable6,variable7,variable8,variable9,variable10,variable11,variable12,variable13,variable14,variable15,variable17,variable18,variable19,classLabel
0,b,32.33,0.00075,u,g,e,bb,1.585,t,f,0,t,s,420.0,0,4200000.0,,1,no.
1,b,23.58,0.000179,u,g,c,v,0.54,f,f,0,t,g,136.0,1,1360000.0,,0,no.
2,b,36.42,7.5e-05,y,p,d,v,0.585,f,f,0,f,g,240.0,3,2400000.0,,1,no.
3,b,18.42,0.001041,y,p,aa,v,0.125,t,f,0,f,g,120.0,375,1200000.0,,0,no.
4,b,24.5,0.001334,y,p,aa,v,0.04,f,f,0,t,g,120.0,475,1200000.0,f,1,no.


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3700 entries, 0 to 3699
Data columns (total 19 columns):
variable1     3661 non-null object
variable2     3661 non-null float64
variable3     3700 non-null float64
variable4     3636 non-null object
variable5     3636 non-null object
variable6     3634 non-null object
variable7     3634 non-null object
variable8     3700 non-null float64
variable9     3700 non-null object
variable10    3700 non-null object
variable11    3700 non-null int64
variable12    3700 non-null object
variable13    3700 non-null object
variable14    3600 non-null float64
variable15    3700 non-null int64
variable17    3600 non-null float64
variable18    1555 non-null object
variable19    3700 non-null int64
classLabel    3700 non-null object
dtypes: float64(5), int64(3), object(11)
memory usage: 549.3+ KB


In [6]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 19 columns):
variable1     197 non-null object
variable2     197 non-null float64
variable3     200 non-null float64
variable4     198 non-null object
variable5     198 non-null object
variable6     197 non-null object
variable7     197 non-null object
variable8     200 non-null float64
variable9     200 non-null object
variable10    200 non-null object
variable11    200 non-null int64
variable12    200 non-null object
variable13    200 non-null object
variable14    197 non-null float64
variable15    200 non-null int64
variable17    197 non-null float64
variable18    89 non-null object
variable19    200 non-null int64
classLabel    200 non-null object
dtypes: float64(5), int64(3), object(11)
memory usage: 29.8+ KB


#### How many duplicated rows in both datasets?

In [7]:
print(train.duplicated().sum(), 'Duplicated rows in train dataset')
print(validation.duplicated().sum(), 'Duplicated rows in validation dataset')

3210 Duplicated rows in train dataset
0 Duplicated rows in validation dataset


#### What is the distribution of classlable in train dataset?<br>Is train data balanced?

In [8]:
print('Percentage of yes/no values of classlabel in train dataset :')
print(train['classLabel'].value_counts(normalize=True) * 100)

Percentage of yes/no values of classlabel in train dataset :
yes.    92.540541
no.      7.459459
Name: classLabel, dtype: float64


#### Assessing results: <br><br>1- 'variable18' has 58% missing values in train and 55% missing values in validation dataset.<br><br>2- 'variable5' is redundant of 'variable4'; when it's 'u' in variable4, it's 'g' in variable5 and so on.<br><br>3- 'variable17' equals 'variable14' * 10000.<br><br>4- 'variable19' is redundant of 'classlabel'; when it's 0 in variable19, it's 'no.' in classlabel and so on.<br> <br>5- There are missing values in variables 1, 2, 4, 5, 6, 7, 14, 17.<br><br>6- There are 3210 duplicated rows in train dataset.<br><br>7- Variables 2, 3 and 8 are object datatype and have numerical values. <br><br>8- 92.5% out of train dataset classlabels are 'yes.' and only 7.5% are 'no.' ( Imbalanced data problem)

<img src="https://media.giphy.com/media/kXBVtKjLxINji/giphy.gif">

# Data Cleaning 
#### Let's fix these issues that we've listed in data assessing phase:

#### First, we will drop 'variable18' as it has more than 50% missing values in both datasets and variables 5,17 and 19 as they have no need and can cause overfitting to our model.

In [9]:
# Removing variable5, variable17, variable18 and variable19 from train and validation datasets.
train.drop(['variable5', 'variable17', 'variable18', 'variable19'], axis=1, inplace=True)
validation.drop(['variable5', 'variable17', 'variable18', 'variable19'], axis=1, inplace=True)

#### Then, we will drop duplicated rows in train dataset to keep only unique rows. As keeping duplicated rows may cause overfitting in our classification model. ( It will also solve the problem of imbalanced data )

In [10]:
# Droping duplicated rows in train dataset.
train.drop_duplicates(keep = 'first', inplace = True)
train.shape

(490, 15)

#### Now, the problem of imbalanced data is solved.

In [11]:
print('Percentage of yes/no values of classlabel in train dataset :')
print(train['classLabel'].value_counts(normalize=True) * 100)

Percentage of yes/no values of classlabel in train dataset :
no.     56.326531
yes.    43.673469
Name: classLabel, dtype: float64


#### We still have missing values in both datasets, so we are going to handle them by imputing them with the most used value for categorical data and median for numerical data.

In [12]:
print(train.isnull().any(axis = 1).sum(), 'Rows have null values in train dataset')
print(validation.isnull().any(axis = 1).sum(), 'Rows have null values in validation dataset')

28 Rows have null values in train dataset
9 Rows have null values in validation dataset


In [13]:
columns = train.columns
for col in columns:
    if(train[col].dtype == np.dtype('O')):
        train[col].fillna(train[col].value_counts().idxmax(), inplace=True)
        validation[col].fillna(validation[col].value_counts().idxmax(), inplace=True)
    else:
        train[col].fillna(train[col].median(), inplace=True)
        validation[col].fillna(validation[col].median(), inplace=True)

In [14]:
print(train.isnull().any(axis = 1).sum(), 'Rows have null values in train dataset')
print(validation.isnull().any(axis = 1).sum(), 'Rows have null values in validation dataset')

0 Rows have null values in train dataset
0 Rows have null values in validation dataset


#### Finally, we convert variables 2, 3 and 8 datatypes as they are in string datatypes and hold numerical values.

In [15]:
train[['variable2', 'variable3', 'variable8']] = train[['variable2', 'variable3', 'variable8']].astype(float)
validation[['variable2', 'variable3', 'variable8']] = validation[['variable2', 'variable3', 'variable8']].astype(float)

#### Now, we have clean datasets without missing values or duplicated rows and with the right datatype for every variable. Train dataset has 490 rows and validation dataset has 200 rows.

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 490 entries, 0 to 489
Data columns (total 15 columns):
variable1     490 non-null object
variable2     490 non-null float64
variable3     490 non-null float64
variable4     490 non-null object
variable6     490 non-null object
variable7     490 non-null object
variable8     490 non-null float64
variable9     490 non-null object
variable10    490 non-null object
variable11    490 non-null int64
variable12    490 non-null object
variable13    490 non-null object
variable14    490 non-null float64
variable15    490 non-null int64
classLabel    490 non-null object
dtypes: float64(4), int64(2), object(9)
memory usage: 61.2+ KB


In [17]:
validation.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
variable1     200 non-null object
variable2     200 non-null float64
variable3     200 non-null float64
variable4     200 non-null object
variable6     200 non-null object
variable7     200 non-null object
variable8     200 non-null float64
variable9     200 non-null object
variable10    200 non-null object
variable11    200 non-null int64
variable12    200 non-null object
variable13    200 non-null object
variable14    200 non-null float64
variable15    200 non-null int64
classLabel    200 non-null object
dtypes: float64(4), int64(2), object(9)
memory usage: 23.6+ KB


# Data Splitting

#### Now, it's time to split our datasets to dependent and independent variables (X,Y) and prepare them to our model.

In [18]:
X_train = train.iloc[:, :-1]
Y_train = train.iloc[:, -1]
X_valid = validation.iloc[: , :-1]
Y_valid = validation.iloc[: , -1]

# Data Encoding
#### First step in incoding our data is to encode classlabels to be 0 or 1 instead of 'yes.' or 'no.' values.

In [19]:
# Encoding Classlabel data to be 0 or 1.
labelencoder_Y = LabelEncoder()
Y_train = labelencoder_Y.fit_transform(Y_train)
Y_valid = labelencoder_Y.fit_transform(Y_valid)

#### Encoding categorical variables in both train and validation datasets using get_dummies() function and drop the original columns.

In [20]:
columns = X_train.columns
# For every column in both train and validation datasets:
for col in columns:
    # If the coulmn has categorical values 'Object datatype':
    if(X_train[col].dtype == np.dtype('O')):
        # Get the dummy varaibles of this coulmn in train and validation datasets and add the name of the column as prefix.
        t_dummies = pd.get_dummies(X_train[col], prefix=col, prefix_sep='_')
        v_dummies = pd.get_dummies(X_valid[col], prefix=col, prefix_sep='_')
        # Add 'concatinate' these varaibles to the datasets
        X_valid = pd.concat([X_valid , v_dummies] , axis = 'columns')
        X_train = pd.concat([X_train , t_dummies] , axis = 'columns')
        # Drop the original column in both datasets
        X_train.drop(col, axis=1, inplace=True)
        X_valid.drop(col, axis=1, inplace=True)

#### Now, we have a problem that there are some dummy vairables exist in trian dataset and don't in validation dataset.

In [21]:
try:
    X_train.columns == X_valid.columns
except:
    print('Lengths of two datasets are not the same')

Lengths of two datasets are not the same


#### To solve this problem, we are going to add every column in train dataset that doesn't exist in validation dataset with 0 values and in the same index in validation dataset.

In [22]:
Ncol = len (X_train.columns)
for i in range (Ncol):
    if (X_train.columns[i] in X_valid.columns) == 0:
        X_valid.insert(i, X_train.columns[i], 0)
        
X_train.sort_index(axis=1, inplace=True)
X_valid.sort_index(axis=1, inplace=True)        
variables = X_train.columns        

#### Solved.

In [23]:
X_train.columns == X_valid.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True])

# Data Scaling
#### We are going now to re-scale the data from 0 to 1 to be more fast and efficient in models.

In [24]:
from sklearn.preprocessing import MinMaxScaler
sc_X = MinMaxScaler()
X_train = sc_X.fit_transform(X_train)
X_valid = sc_X.transform(X_valid) 

# Data Modeling
#### That is the last step, the data is ready now for modeling. We will try deffrent machine learning models and choose the best of them.

<img src="https://media.giphy.com/media/FA77mwaxV74SA/giphy.gif">

In [25]:
random.seed( 5 )

#### Let's start with simple logistic regression model: 

In [26]:
# Train and predict data with Logistic Regression
from sklearn.linear_model import LogisticRegression
LR_classifier = LogisticRegression(solver = 'lbfgs' , random_state = 0)
LR_classifier.fit(X_train, Y_train)
y_pred  = LR_classifier.predict(X_valid)
acc_log = accuracy_score(Y_valid , y_pred) * 100
print("Accuracy with logistic classifier is " , acc_log , '%')

Accuracy with logistic classifier is  87.0 %


#### K-Nearst Neighbors model:

In [27]:
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier(n_neighbors = 16)
knn_classifier.fit(X_train, Y_train)
y_pred = knn_classifier.predict(X_valid)
acc_KNN = accuracy_score(Y_valid , y_pred) * 100
print("Accuracy with K-Nearest Neighbors Classifier (KNN) is " , acc_KNN , '%')

Accuracy with K-Nearest Neighbors Classifier (KNN) is  87.5 %


#### Support Vector Classifier model:

In [28]:
# Train and predict data with Support Vector Classifier (SVC)
from sklearn.svm import SVC
svc_classifier = SVC(kernel = 'rbf', random_state = 0, gamma= 0.66)
svc_classifier.fit(X_train, Y_train)
y_pred = svc_classifier.predict(X_valid)
acc_SVC = accuracy_score(Y_valid , y_pred) * 100
print("Accuracy with Support Vector Classifier (SVC) is " , acc_SVC , '%')

Accuracy with Support Vector Classifier (SVC) is  86.0 %


#### RandomForest Classifier model:

In [29]:
# Train and predict data with RandomForest Classifier 
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators = 10 , criterion = 'entropy' , random_state = 0)
rf_classifier.fit(X_train , Y_train) 
y_pred = rf_classifier.predict(X_valid)
acc_RF = accuracy_score(Y_valid , y_pred) * 100
print("Accuracy with RandomForest Classifier is " , acc_RF , '%')

Accuracy with RandomForest Classifier is  87.5 %


#### Simple neural network using MLPClassifier:

In [30]:
from sklearn.neural_network import MLPClassifier
mlp_classifier = MLPClassifier(solver='lbfgs', alpha=0.01, hidden_layer_sizes=(10, 7, 5, 5), random_state=1)
mlp_classifier.fit(X_train, Y_train)
train_predictions = mlp_classifier.predict(X_train)
test_predictions = mlp_classifier.predict(X_valid)
train_acc = round(accuracy_score(Y_train, train_predictions) , 2) * 100
test_acc = round(accuracy_score(Y_valid, test_predictions) , 2) * 100
print("Train accuracy with MLP is: ", train_acc , '%')
print("Test accuracy with MLP is: ", test_acc , '%')

Train accuracy with MLP is:  87.0 %
Test accuracy with MLP is:  86.0 %


#### What if we tried to design neural network using keras?!

In [31]:
# Train and predict data with Classification Neural Network
from keras import Sequential
from keras.regularizers import l2
from keras.layers import Dense

nn_classifier = Sequential()
nn_classifier.add(Dense(5, activation='relu', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01), input_dim=43))
nn_classifier.add(Dense(5, activation='relu', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01) ))
nn_classifier.add(Dense(2, activation='relu', kernel_regularizer=l2(0.01), bias_regularizer=l2(0.01)))
nn_classifier.add(Dense(1, activation='sigmoid'))
nn_classifier.compile(optimizer ='adam',loss='binary_crossentropy', metrics =['accuracy'])
nn_classifier.fit(X_train,Y_train, batch_size=20, epochs=100)

train_acc = round(nn_classifier.evaluate(X_train, Y_train)[1] *100 , 2)
test_acc = round(nn_classifier.evaluate(X_valid, Y_valid)[1] *100 , 2)

print("Train accuracy with Nueral Network Classifier is "  , train_acc, '%')
print("Test accuracy with Nueral Network Classifier is "  , test_acc, '%')

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch

Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train accuracy with Nueral Network Classifier is  89.18 %
Test accuracy with Nueral Network Classifier is  86.5 %


### Modeling results:
#### We have tried 6 diffrent models ( Logistic Regression, K-NN, SVC, RandomForest, MLP and cusrtom nueral netwok )<br><br>Accuracy varies between 86% and 88%. Most frequent accuracy among these models is 87% for validation dataset. 

# Microservice to serve the predictions
#### let's design microservice in form of simple function that takes the data as an input and returns the predictions of our model as an output.

In [32]:
def predict(df):
    # Check for unneeded column that we dropped above and drop them, if found.
    unneeded_cols = ['variable5', 'variable17', 'variable18', 'variable19','classLabel']
    for col in unneeded_cols:
        if col in df.columns:
            df.drop(col, axis=1, inplace=True)
    
    # Encode categorical data.
    columns = df.columns        
    for col in columns:
        # If the coulmn has categorical values 'Object datatype':
        if(df[col].dtype == np.dtype('O')):
            # Get the dummy varaibles of this coulmn and add the name of the column as prefix
            dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_')
            # Add 'concatinate' these varaibles to the dataset
            df = pd.concat([df , dummies] , axis = 'columns')
            # Drop the original column in both datasets
            df.drop(col, axis=1, inplace=True)      
            
    # Add variables that don't exist in dataset by value 0        
    Ncol = len (variables)
    for i in range (Ncol):
        if (variables[i] not in df.columns):
            df.insert(len(df.columns), variables[i], 0)   
    # Sort the coulmns to be in the form of train data.        
    df.sort_index(axis=1, inplace=True)        
    # Scale the dataset with the same scaler that we used in train data          
    df = sc_X.transform(df)
    
    # Now let's predict from each model:
    logisic_pred = LR_classifier.predict(df)
    knn_pred = knn_classifier.predict(df)
    svc_pred = svc_classifier.predict(df)
    rf_pred = rf_classifier.predict(df)
    mlp_pred = mlp_classifier.predict(df)
    nn_pred = nn_classifier.predict_classes(df)[0]
    preds = {'Logisic Regression Predictions' : logisic_pred,
             'K-NN Predictions' : knn_pred,
             'SVC Predictions' : svc_pred,
             'RandomForest Predictions' : rf_pred,
             'Nueral Network Predictions' : mlp_pred}
    
    predictions = pd.DataFrame(preds)
    return predictions

#### Let's test this microservice on sample of validation data: 

In [33]:
df = pd.read_csv('validation.csv', sep =";", decimal=',').iloc[1:11 , :].reset_index(drop=True)
classlabels = df['classLabel']
predictions = predict(df)
predictions['Real ClassLabel'] = classlabels
predictions

Unnamed: 0,Logisic Regression Predictions,K-NN Predictions,SVC Predictions,RandomForest Predictions,Nueral Network Predictions,Real ClassLabel
0,0,0,0,0,0,no.
1,0,0,0,0,0,no.
2,0,0,0,1,0,no.
3,0,0,0,0,0,no.
4,0,0,0,0,0,no.
5,0,0,0,0,0,no.
6,0,0,0,0,0,no.
7,0,0,0,0,0,no.
8,0,0,0,0,0,no.
9,0,0,0,0,0,no.
