In [1]:
import numpy as np
import pandas as pd

import plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go
# https://www.tutorialspoint.com/plotly/plotly_plotting_inline_with_jupyter_notebook.htm
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
data = pd.read_csv('task_b.csv')
data = data.iloc[:,1:]

In [3]:
data.head()

Unnamed: 0,f1,f2,f3,y
0,-195.871045,-14843.084171,5.53214,1.0
1,-1217.183964,-4068.124621,4.416082,1.0
2,9.138451,4413.412028,0.425317,0.0
3,363.824242,15474.760647,1.094119,0.0
4,-768.812047,-7963.932192,1.870536,0.0


In [4]:
data.corr()['y']

f1    0.067172
f2   -0.017944
f3    0.839060
y     1.000000
Name: y, dtype: float64

In [5]:
data.std()

f1      488.195035
f2    10403.417325
f3        2.926662
y         0.501255
dtype: float64

In [6]:
X = data[['f1','f2','f3']].values
Y = data['y'].values
print(X.shape)
print(Y.shape)

(200, 3)
(200,)


# What if the features are with different variance 

As part of this task you will observe how linear models work in case of data having features with different variance from the output of the above cells you can observe that var(F2) >> var(F1) >> var(F3)

- <b>Task1</b>:
    1. Apply Logistic regression (SGDClassifier with logloss) on 'data' and check the feature importance
    2. Apply SVM (SGDClassifier with hinge) on 'data' and check the feature importance


- <b>Task2</b>:
    1. Apply Logistic regression (SGDClassifier with logloss) on 'data' after standardization 
       i.e standardization(data, column wise) : (column-mean(column))/std(column) and check the feature importance
    2. Apply SVM (SGDClassifier with hinge) on 'data' after standardization 
       i.e standardization(data, column wise) : (column-mean(column))/std(column) and check the feature importance

## Task 1

In [7]:
features = ['f1','f2','f3']

clf_lr = SGDClassifier(loss = 'log')
clf_svm = SGDClassifier(loss = 'hinge')

clf_lr.fit(X, Y)
clf_svm.fit(X, Y)

lr_coef = clf_lr.coef_[0]
svm_coef = clf_svm.coef_[0]

lr_important_features = zip(lr_coef, features)
svm_important_features = zip(svm_coef, features)

print('Important features of Logistic Regression', sorted(list(lr_important_features), reverse = True))
print('Important features of SVM', sorted(list(svm_important_features), reverse = True))

Important features of Logistic Regression [(10189.669879633142, 'f3'), (7878.773036472703, 'f1'), (-4945.734782348944, 'f2')]
Important features of SVM [(11027.993538822657, 'f3'), (1802.1097429890663, 'f1'), (-4054.4129056170264, 'f2')]


#### Observations

- The importance of features is related to correlation between them and target variable. Correlation between feature `f3` and the target variable is higher. Hence, `f3` has greater importance in both SVM & Logistic Regression.

- When we are using SGDClassifier and since it is probabilistic in nature, the behavior changes over multiple runs giving different feature importances each time.

- f3 > f1 > f2 is the order of feature importances. 

- Since the variance of `f2` is higher, it has lower correlation to target. So, `f2` gets the least feature importance.

## Task 2 

In [8]:
standardize = StandardScaler()
x_std = standardize.fit_transform(X, Y)

features = ['f1','f2','f3']

clf_lr = SGDClassifier(loss = 'log')
clf_svm = SGDClassifier(loss = 'hinge')

clf_lr.fit(x_std, Y)
clf_svm.fit(x_std, Y)

lr_coef = clf_lr.coef_[0]
svm_coef = clf_svm.coef_[0]

lr_important_features = zip(lr_coef, features)
svm_important_features = zip(svm_coef, features)

print('Important features of Logistic Regression', sorted(list(lr_important_features), reverse = True))
print('Important features of SVM', sorted(list(svm_important_features), reverse = True))

Important features of Logistic Regression [(8.724431271363455, 'f3'), (0.10183976899392214, 'f2'), (-1.2769677830803743, 'f1')]
Important features of SVM [(22.77132186214848, 'f3'), (4.049457563006254, 'f2'), (-1.1922997268697, 'f1')]


#### Observations

- After standardization, the effect of variance is nullified.

- The importance of features is related to correlation between them and target variable. Correlation between feature `f3` and the target variable is higher. Hence, `f3` has greater importance in both SVM & Logistic Regression.

- f3 > f2 > f1 is the new order of feature importances.

- After standardization, the most important feature is `f3`.