In [63]:
import numpy as np
import pandas as pd
import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [93]:
# Read the task CSV and keep every column except the first one
# (presumably a saved row index -- confirm against the file).
data = pd.read_csv('task_b.csv').iloc[:, 1:]

In [94]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [95]:
# Pairwise correlation matrix of all columns; only the column describing
# correlation with the target 'y' is displayed.
corr_matrix = data.corr()
corr_matrix['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [96]:
# Per-column sample standard deviation (ddof=1 is the pandas default,
# spelled out here for clarity).
data.std(ddof=1)

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [98]:
# Split the frame into a feature matrix (f1, f2, f3) and a target vector (y),
# both as NumPy arrays.
feature_names = ['f1', 'f2', 'f3']
X = data[feature_names].values
Y = data['y'].values

# Print both shapes as a quick sanity check.
for arr in (X, Y):
    print(arr.shape)

(200, 3)
(200,)


# What if our features have different variances?

<pre>
* <b>As part of this task you will observe how linear models behave when the data has features with different variances</b>
* <b>From the output of the above cells you can observe that var(f2) >> var(f1) >> var(f3)</b>

> <b>Task1</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' and check the feature importance

> <b>Task2</b>:
    1. Apply Logistic regression(SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM(SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise): (column-mean(column))/std(column) and check the feature importance

</pre>

<h3><font color='blue'> Make sure you write the observations for each task, explaining why a particular feature got more importance than the others</font></h3>

# **Task 1**



> 
# 1. Logistic Regression(SGDClassifier with logloss)


In [100]:
from sklearn.linear_model import SGDClassifier

# Logistic regression trained with SGD on the raw (unscaled) features X, Y.
# random_state is pinned so the stochastic fit (SGD shuffles the training data)
# gives the same coefficients on every run.
# NOTE: newer scikit-learn releases name this loss 'log_loss'; 'log' is kept
# here to match the environment the notebook was written for.
clf = SGDClassifier(loss='log', random_state=42)
clf.fit(X, Y)

# Pair every column of `data` except the last one (the target) with its
# learned coefficient.
cols = data.columns[:-1]
feat_imp = dict(zip(cols, clf.coef_.ravel()))

res = pd.Series(feat_imp)
print("Feature importance of each features for Logistic Regression: \n")
print(res)

Feature importance of each features for Logistic Regression: 

f1     7191.801388
f2     -620.956861
f3    10174.854074
dtype: float64


Observations for the Logistic Regression implementation

1.   Feature f3 is the most important and feature f2 is the least important (by coefficient magnitude).
 
> This behaviour was expected: f3 is highly correlated with the target variable, followed by f1 and lastly f2.


2.   The coefficient values are very large because we have not standardised the data; with features on such different scales, raw coefficient magnitudes should be compared with caution.





# 2.   SVM (SGDClassifier with HingeLoss)



In [84]:
from sklearn.linear_model import SGDClassifier

# Linear SVM (hinge loss) trained with SGD on the raw (unscaled) features X, Y.
# random_state is pinned so the stochastic fit (SGD shuffles the training data)
# gives the same coefficients on every run.
clf = SGDClassifier(loss='hinge', random_state=42)
clf.fit(X, Y)

# Pair every column of `data` except the last one (the target) with its
# learned coefficient.
cols = data.columns[:-1]
feat_imp = dict(zip(cols, clf.coef_.ravel()))

res = pd.Series(feat_imp)
print("Feature importance of each features for SVM: \n")
print(res)

Feature importance of each features for SVM: 

f1     8482.954465
f2     7827.143407
f3    10435.412566
dtype: float64


Observations

1.   The results are similar to those observed for Logistic Regression.
2.   f3 is the most important feature, as it is highly correlated with the target variable, followed by f1 and then f2.

3.   The coefficient values of the independent variables are quite large, reflecting the effect of the non-standardised data.





# Task 2 

## 1. Logistic Regression(SGDClassifier with logloss) with standardized data

In [85]:
# Column-wise statistics used for standardization. np.std is called with its
# default ddof=0, i.e. the population standard deviation.
f1_mean, f2_mean, f3_mean = (
    np.mean(data[name].values) for name in ('f1', 'f2', 'f3')
)

f1_std, f2_std, f3_std = (
    np.std(data[name].values) for name in ('f1', 'f2', 'f3')
)

In [86]:
# Standardize each feature column-wise: (column - mean(column)) / std(column),
# using the means and standard deviations computed beforehand (f1_mean, f1_std, ...).
# Whole-column arithmetic replaces the per-element apply(lambda ...) calls and
# produces the same values.
# NOTE: this overwrites the raw columns of `data`; re-running this cell without
# reloading `data` would apply the transformation to already-standardized values.
data['f1'] = (data['f1'] - f1_mean) / f1_std
data['f2'] = (data['f2'] - f2_mean) / f2_std
data['f3'] = (data['f3'] - f3_mean) / f3_std

data.head()

Unnamed: 0,f1,f2,f3,y
0,-0.423126,-1.555602,0.181651,1.0
1,-2.520394,-0.51729,-0.200648,1.0
2,-0.002139,0.30002,-1.567659,0.0
3,0.726209,1.36593,-1.338565,0.0
4,-1.599662,-0.892703,-1.072608,0.0


In [89]:
from sklearn.linear_model import SGDClassifier

# Features and target are re-read from `data`, whose f1/f2/f3 columns are
# expected to hold the standardized values at this point.
X = data[['f1', 'f2', 'f3']]
Y = data['y']

# Logistic regression trained with SGD.
# random_state is pinned so the stochastic fit (SGD shuffles the training data)
# gives the same coefficients on every run.
# NOTE: newer scikit-learn releases name this loss 'log_loss'; 'log' is kept
# here to match the environment the notebook was written for.
clf = SGDClassifier(loss='log', random_state=42)
clf.fit(X, Y)

# Pair every column of `data` except the last one (the target) with its
# learned coefficient.
cols = data.columns[:-1]
feat_imp = dict(zip(cols, clf.coef_.ravel()))

res = pd.Series(feat_imp)
print("Feature importance of each features for Logistic Regression: \n")
print(res)

Feature importance of each features for Logistic Regression: 

f1    -4.581749
f2    -1.261510
f3    13.802107
dtype: float64


Observations for the Logistic Regression implementation with standardized data

1.   Feature f3 is the most important and feature f2 is the least important. (Note: compare the magnitudes of the feature coefficients, ignoring their signs.)
> This behaviour was expected: f3 is highly correlated with the target variable, followed by f1 and lastly f2.


2.   The coefficient values are reasonable and much smaller than in the logistic regression implementation without standardization, because the data has now been standardised.



In [91]:
from sklearn.linear_model import SGDClassifier

# Features and target are re-read from `data`, whose f1/f2/f3 columns are
# expected to hold the standardized values at this point.
X = data[['f1', 'f2', 'f3']]
Y = data['y']

# Linear SVM (hinge loss) trained with SGD.
# random_state is pinned so the stochastic fit (SGD shuffles the training data)
# gives the same coefficients on every run.
clf = SGDClassifier(loss='hinge', random_state=42)
clf.fit(X, Y)

# Pair every column of `data` except the last one (the target) with its
# learned coefficient.
cols = data.columns[:-1]
feat_imp = dict(zip(cols, clf.coef_.ravel()))

res = pd.Series(feat_imp)
print("Feature importance of each features for SVM: \n")
print(res)

Feature importance of each features for SVM: 

f1     3.669889
f2    -0.508139
f3    22.528136
dtype: float64


Observations

1.   The results are similar to those observed for Logistic Regression.
2.   f3 is the most important feature, as it is highly correlated with the target variable, followed by f1 and then f2.
3.   The coefficient values of the independent variables are small compared with the SVM implementation without standardization.



