In [1]:
import pandas as pd

In [2]:
# Read the dataset from disk; a comma is already pandas' default field separator.
data = pd.read_csv("Wall robot navigation.csv")

In [3]:
data.head()

Unnamed: 0,V1,V2,V3,V4,Class
0,1.687,0.445,2.332,0.429,4
1,1.687,0.449,2.332,0.429,4
2,1.687,0.449,2.334,0.429,4
3,1.687,0.449,2.334,0.429,4
4,1.687,0.449,2.334,0.429,4


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5456 entries, 0 to 5455
Data columns (total 5 columns):
V1       5456 non-null float64
V2       5456 non-null float64
V3       5456 non-null float64
V4       5456 non-null float64
Class    5456 non-null int64
dtypes: float64(4), int64(1)
memory usage: 213.2 KB


##### V1, V2, V3 and V4 are numerical variables. Class is a categorical variable, but it is encoded as an integer here.

In [5]:
# Summary statistics restricted to the four numerical columns (the label column is left out).
data.loc[:, ['V1', 'V2', 'V3', 'V4']].describe()

Unnamed: 0,V1,V2,V3,V4
count,5456.0,5456.0,5456.0,5456.0
mean,1.290307,0.681269,1.881819,1.273689
std,0.6267,0.342594,0.562533,0.82175
min,0.495,0.34,0.836,0.367
25%,0.826,0.495,1.472,0.788
50%,1.0895,0.612,1.753,1.0665
75%,1.5195,0.753,2.139,1.4005
max,5.0,5.0,5.0,5.0


### Normalization of numerical variables

Extract only the numerical columns so that they can be normalized to the range 0 to 1.

In [6]:
# Put the four numerical columns in their own frame; these are the values that get normalized.
df = data.loc[:, ['V1', 'V2', 'V3', 'V4']]

In [7]:
import sklearn

In [8]:
from sklearn import preprocessing

In [9]:
df.columns

Index(['V1', 'V2', 'V3', 'V4'], dtype='object')

In [10]:
x=df.values

In [11]:
x

array([[1.687, 0.445, 2.332, 0.429],
       [1.687, 0.449, 2.332, 0.429],
       [1.687, 0.449, 2.334, 0.429],
       ...,
       [0.873, 0.642, 1.053, 1.105],
       [0.967, 0.635, 1.034, 1.118],
       [0.854, 0.628, 1.016, 1.168]])

In [12]:
min_max_scaler = preprocessing.MinMaxScaler()

In [13]:
# Learn each column's min/max and rescale the values in a single step.
# NOTE(review): `x` holds every row of the dataset and the train/test split only
# happens later in the notebook, so the scaling statistics are computed from rows
# that end up in the test set as well. Consider fitting the scaler on the
# training rows only and applying `transform` to the test rows.
x_scaled = min_max_scaler.fit_transform(x)

In [14]:
# Wrap the scaled array back into a DataFrame, using lowercase feature names.
df_normalized = pd.DataFrame(x_scaled, columns=['v1', 'v2', 'v3', 'v4'])

In [15]:
df_normalized['class']=data['Class']

In [16]:
df_normalized.head()

Unnamed: 0,v1,v2,v3,v4,class
0,0.264595,0.022532,0.35927,0.013382,4
1,0.264595,0.023391,0.35927,0.013382,4
2,0.264595,0.023391,0.35975,0.013382,4
3,0.264595,0.023391,0.35975,0.013382,4
4,0.264595,0.023391,0.35975,0.013382,4


#### Since there are no categorical variables among the features, one-hot encoding is not required

# Performing logistic regression to classify

## Process the dataset

In [17]:
# Share of each class label, expressed as a percentage of all rows.
df_normalized['class'].value_counts(normalize=True).mul(100)

1    40.414223
2    38.434751
4    15.139296
3     6.011730
Name: class, dtype: float64

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 20% of the rows for testing. The split is seeded for repeatability and
# stratified on the label so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    df_normalized[['v1', 'v2', 'v3', 'v4']],
    df_normalized['class'],
    test_size=0.2,
    random_state=42,
    stratify=df_normalized['class'],
)

In [20]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4364, 4) (4364,)
(1092, 4) (1092,)


### Using logistic regression

In [21]:
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

In [22]:
# Logistic regression classifier built with the library defaults.
# NOTE(review): the repr printed after fitting shows solver='warn' and
# multi_class='warn', i.e. neither is pinned here, so results may differ between
# scikit-learn versions -- confirm the intended solver and set it explicitly.
model = LogisticRegression()

In [23]:
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
predicted_class=model.predict(X_test)

In [25]:
accuracy = accuracy_score(y_test,predicted_class)*100

In [26]:
accuracy

71.52014652014653

In [27]:
from sklearn.metrics import classification_report

In [28]:
print(classification_report(y_test,predicted_class))

              precision    recall  f1-score   support

           1       0.64      0.79      0.71       441
           2       0.79      0.98      0.87       420
           3       0.74      0.30      0.43        66
           4       0.00      0.00      0.00       165

   micro avg       0.72      0.72      0.72      1092
   macro avg       0.54      0.52      0.50      1092
weighted avg       0.61      0.72      0.65      1092



  'precision', 'predicted', average, warn_for)


In [29]:
from sklearn.metrics import confusion_matrix

In [30]:
cm=confusion_matrix(y_test, predicted_class)

In [31]:
cm

array([[348,  93,   0,   0],
       [  0, 413,   7,   0],
       [ 34,  12,  20,   0],
       [158,   7,   0,   0]], dtype=int64)

## Classification using neural network model

In [32]:
from sklearn.neural_network import MLPClassifier

In [44]:
nnmodel = MLPClassifier(hidden_layer_sizes=(100,50),random_state=45)

In [45]:
nnmodel.fit(X_train,y_train)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=45, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [46]:
nnpredicted_class=nnmodel.predict(X_test)

In [47]:
accuracy_score(y_test,nnpredicted_class)

0.9880952380952381

In [48]:
print(classification_report(y_test,nnpredicted_class))

              precision    recall  f1-score   support

           1       0.99      0.99      0.99       441
           2       0.99      0.99      0.99       420
           3       0.97      0.97      0.97        66
           4       0.99      0.99      0.99       165

   micro avg       0.99      0.99      0.99      1092
   macro avg       0.99      0.98      0.98      1092
weighted avg       0.99      0.99      0.99      1092



In [49]:
nn_cm=confusion_matrix(y_test, nnpredicted_class)

In [50]:
nn_cm

array([[435,   4,   1,   1],
       [  2, 417,   1,   0],
       [  2,   0,  64,   0],
       [  2,   0,   0, 163]], dtype=int64)

### Finding the false positive rate and false negative rate for both methods: logistic regression and the multi-layer neural network

In [40]:
import numpy as np

In [41]:
def false_perc(matrix, labels=None):
    """Print per-class false positive and false negative rates of a confusion matrix.

    Every class is scored one-vs-rest. Column totals minus the diagonal are
    counted as false positives and row totals minus the diagonal as false
    negatives, i.e. rows are read as true classes and columns as predicted
    classes.

    Parameters
    ----------
    matrix : array-like of shape (n_classes, n_classes)
        Square confusion matrix of counts.
    labels : sequence, optional
        Column labels for the printed table, one per class. Defaults to
        ``1..n_classes``, which reproduces the original ``[1, 2, 3, 4]``
        header for a four-class matrix.

    Returns
    -------
    None
        The table (rows ``FPR(%)`` and ``FNR(%)``) is printed, not returned.

    Raises
    ------
    ValueError
        If ``matrix`` is not a square 2-D array, or ``labels`` does not have
        one entry per class.
    """
    # Accept nested lists as well as ndarrays.
    matrix = np.asarray(matrix)
    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError("matrix must be a square 2-D confusion matrix")

    n_classes = matrix.shape[0]
    if labels is None:
        labels = list(range(1, n_classes + 1))
    else:
        labels = list(labels)
        if len(labels) != n_classes:
            raise ValueError("labels must contain one entry per class")

    TP = np.diag(matrix).astype(float)
    FP = matrix.sum(axis=0) - TP
    FN = matrix.sum(axis=1) - TP
    TN = matrix.sum() - (FP + FN + TP)

    # A class with a zero denominator has an undefined rate; report NaN
    # without emitting a NumPy runtime warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        FPR = FP / (FP + TN)  # FPR - False Positive Rate
        FNR = FN / (TP + FN)  # FNR - False Negative Rate

    fr = np.vstack((FPR, FNR)) * 100
    false_rate = pd.DataFrame(fr, columns=labels, index=['FPR(%)', 'FNR(%)'])
    print(false_rate)

In [42]:
false_perc(cm) # for logistic regression

                1          2          3      4
FPR(%)  29.493088  16.666667   0.682261    0.0
FNR(%)  21.088435   1.666667  69.696970  100.0


In [43]:
# NOTE(review): this cell's execution counter (43) is lower than that of the cell
# that builds nn_cm (49), and the rates recorded below do not match the nn_cm
# printed earlier -- re-run this cell after nn_cm has been recomputed.
false_perc(nn_cm) # for neural network

               1         2         3         4
FPR(%)  0.460829  0.297619  0.097466  0.755124
FNR(%)  2.040816  0.476190  3.030303  0.000000


### Experimenting with different numbers of hidden layers and nodes in the neural network model

In [55]:
# Baseline architecture: two hidden layers with 100 and 50 units.
nnmodel = MLPClassifier(random_state=45, hidden_layer_sizes=(100, 50)).fit(X_train, y_train)
nnpredicted_class = nnmodel.predict(X_test)
accuracy_score(y_test, nnpredicted_class)



0.9880952380952381

In [61]:
# Three hidden layers: 100, 50 and 30 units. Accuracy is reported in percent.
nnmodel = MLPClassifier(random_state=45, hidden_layer_sizes=(100, 50, 30)).fit(X_train, y_train)
nnpredicted_class = nnmodel.predict(X_test)
accuracy_score(y_test, nnpredicted_class) * 100

98.992673992674

In [62]:
# Four hidden layers: 100, 50, 30 and 20 units. Accuracy is reported in percent.
nnmodel = MLPClassifier(random_state=45, hidden_layer_sizes=(100, 50, 30, 20)).fit(X_train, y_train)
nnpredicted_class = nnmodel.predict(X_test)
accuracy_score(y_test, nnpredicted_class) * 100

98.9010989010989

In [63]:
# Four hidden layers with a narrower last layer: 100, 50, 30 and 15 units. Accuracy in percent.
nnmodel = MLPClassifier(random_state=45, hidden_layer_sizes=(100, 50, 30, 15)).fit(X_train, y_train)
nnpredicted_class = nnmodel.predict(X_test)
accuracy_score(y_test, nnpredicted_class) * 100

97.8021978021978

Accuracy decreases when a fourth hidden layer is added, so we stay with three hidden layers.

In [67]:
# Three hidden layers with a wider last layer: 100, 50 and 35 units. Accuracy in percent.
nnmodel = MLPClassifier(random_state=45, hidden_layer_sizes=(100, 50, 35)).fit(X_train, y_train)
nnpredicted_class = nnmodel.predict(X_test)
accuracy_score(y_test, nnpredicted_class) * 100



98.9010989010989

The best accuracy, 98.99%, is obtained with three hidden layers of 100, 50 and 30 nodes.