In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# importing data
data=pd.read_csv('DATASET-2.csv')
data.head()

Unnamed: 0,Blue,Yellow,Red,Pink,Gray,Green,White,Angle,Pain
0,1,0,1,0.011019,0,0,-0.046341,0,0
1,1,0,1,1.005759,0,0,-0.04324,90,0
2,1,0,1,2.00784,0,0,-0.049941,90,0
3,1,0,1,3.007802,0,0,-0.037382,90,0
4,1,0,1,4.007293,0,0,-0.032407,90,0


In [3]:
# creating X and Y variables
X=data.drop('Pain',axis=1)
cols=X.columns
Y=data['Pain']

# scaling the X Variables so that all the variables are on the same scale
scaler=StandardScaler()
X=scaler.fit_transform(X)
X=pd.DataFrame(X)
X.columns=cols
X

Unnamed: 0,Blue,Yellow,Red,Pink,Gray,Green,White,Angle
0,0.868956,-0.523444,3.546876,-1.677924,-1.408450,-0.013808,-0.711169,-23.878237
1,0.868956,-0.523444,3.546876,-1.666502,-1.408450,-0.013808,-0.670109,0.133592
2,0.868956,-0.523444,3.546876,-1.654995,-1.408450,-0.013808,-0.758837,0.133592
3,0.868956,-0.523444,3.546876,-1.643514,-1.408450,-0.013808,-0.592543,0.133592
4,0.868956,-0.523444,3.546876,-1.632037,-1.408450,-0.013808,-0.526669,0.133592
...,...,...,...,...,...,...,...,...
4137,0.868956,0.825719,-0.281938,-0.679031,0.544113,0.816702,0.775641,0.133592
4138,0.868956,-0.523444,-0.281938,-0.449350,0.585656,0.484498,-0.530509,0.133592
4139,0.868956,2.174883,-0.281938,1.617489,0.980323,0.761335,-0.574257,0.133592
4140,-1.150806,-0.523444,-0.281938,1.720940,0.751832,0.650600,1.000010,0.133592


In [4]:
# Doing a training and test split where test size is 30%
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2) 


In [5]:
X_train

Unnamed: 0,Blue,Yellow,Red,Pink,Gray,Green,White,Angle
2468,-1.150806,-0.523444,-0.281938,1.043180,-1.366906,-0.955054,0.660761,0.133592
3115,0.868956,-0.523444,-0.281938,-0.724954,-0.349081,0.872070,-0.148333,0.133592
3497,0.868956,2.174883,-0.281938,-1.390811,-1.263047,0.263028,0.612048,0.133592
2578,0.868956,0.825719,-0.281938,0.423238,0.336393,0.816702,0.672692,0.133592
69,0.868956,-0.523444,3.546876,-0.885609,-0.452941,0.872070,-0.563386,0.133592
...,...,...,...,...,...,...,...,...
3335,0.868956,-0.523444,-0.281938,-1.012010,-0.515257,-2.283871,0.381746,0.133592
1099,0.868956,0.825719,-0.281938,-1.230203,1.229587,0.484498,-0.188731,0.133592
2514,0.868956,0.825719,-0.281938,0.147635,1.104955,0.650600,1.537504,0.133592
3606,0.868956,-0.523444,-0.281938,1.341981,1.188043,0.373763,1.365900,0.133592


In [6]:
# fitting logistic regression
mod=LogisticRegression(random_state=1)
mod.fit(X_train,Y_train)
# predicting on test set
y_prediction=mod.predict(X_test)
# classification report
print(classification_report(y_true=Y_test,y_pred=y_prediction))

              precision    recall  f1-score   support

           0       0.75      0.71      0.73       621
           1       0.72      0.77      0.75       622

    accuracy                           0.74      1243
   macro avg       0.74      0.74      0.74      1243
weighted avg       0.74      0.74      0.74      1243





In [7]:
# the logistic regression coefficients
coefs=dict(zip(list(cols),mod.coef_[0]))
coefs['intercept']=mod.intercept_
coefs

{'Angle': 0.48447688966276214,
 'Blue': 1.0560265493621857,
 'Gray': -0.12908767170878332,
 'Green': 0.02051864358629906,
 'Pink': 0.8428037935071683,
 'Red': -0.9807139594141167,
 'White': 0.011229957367944587,
 'Yellow': -0.008834135527071126,
 'intercept': array([-0.04337965])}

Since logistic regression calculates the probabilities of a class being 1 by taking the sigmoid of product of coefficients with predictors, therefore, the highest positive contributor will be strongest in determining the class label 1 and the lowest negative contrbutor will be the strongest in determining the class label as 0. 

In [8]:
# calculating the contributions to the sigmoid.
data['Angle_contri']=data['Angle']*coefs['Angle']
data['Blue_contri']=data['Blue']*coefs['Blue']
data['Gray_contri']=data['Gray']*coefs['Gray']
data['Green_contri']=data['Green']*coefs['Green']
data['Pink_contri']=data['Pink']*coefs['Pink']
data['Red_contri']=data['Red']*coefs['Red']
data['White_contri']=data['White']*coefs['White']
data['Yellow_contri']=data['Yellow']*coefs['Yellow']


In [9]:
new_cols=['Angle','Blue','Gray','Green','Pink','Red','White','Yellow']
# calculating the maximum contributor if Pain =1 or the minumym contributor if Pain=0
importance=[]
# loop through the entire data
for j in range(data.shape[0]):
    # take the calculated values
    vals=data.loc[j,['Angle_contri','Blue_contri','Gray_contri','Green_contri','Pink_contri','Red_contri','White_contri','Yellow_contri']].values
    max_index=vals.argmax()# index of maxima in the calculated values
    min_index=vals.argmin()# index of minima in the calculated values
    # if pain is 0 , then take the variable giving the minimum value else take the variable giving maximum value
    if data.loc[j,'Pain']==0:
        importance.append(new_cols[min_index])
    else:
        importance.append(new_cols[max_index])
    

In [10]:
# setting the importance in data
data['importance']=importance

In [11]:
data

Unnamed: 0,Blue,Yellow,Red,Pink,Gray,Green,White,Angle,Pain,Angle_contri,Blue_contri,Gray_contri,Green_contri,Pink_contri,Red_contri,White_contri,Yellow_contri,importance
0,1,0,1,0.011019,0,0,-0.046341,0,0,0.00000,1.056027,-0.000000,0.000000,0.009287,-0.980714,-0.000520,-0.000000,Red
1,1,0,1,1.005759,0,0,-0.043240,90,0,43.60292,1.056027,-0.000000,0.000000,0.847658,-0.980714,-0.000486,-0.000000,Red
2,1,0,1,2.007840,0,0,-0.049941,90,0,43.60292,1.056027,-0.000000,0.000000,1.692215,-0.980714,-0.000561,-0.000000,Red
3,1,0,1,3.007802,0,0,-0.037382,90,0,43.60292,1.056027,-0.000000,0.000000,2.534987,-0.980714,-0.000420,-0.000000,Red
4,1,0,1,4.007293,0,0,-0.032407,90,0,43.60292,1.056027,-0.000000,0.000000,3.377362,-0.980714,-0.000364,-0.000000,Red
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4137,1,1,0,87.005350,94,15,0.065947,90,1,43.60292,1.056027,-12.134241,0.307780,73.328439,-0.000000,0.000741,-0.008834,Pink
4138,1,0,0,107.008400,96,9,-0.032697,90,1,43.60292,1.056027,-12.392416,0.184668,90.187085,-0.000000,-0.000367,-0.000000,Pink
4139,1,2,0,287.010900,115,14,-0.036001,90,1,43.60292,1.056027,-14.845082,0.287261,241.893875,-0.000000,-0.000404,-0.017668,Pink
4140,0,0,0,296.020600,104,12,0.082892,90,1,43.60292,0.000000,-13.425118,0.246224,249.487285,-0.000000,0.000931,-0.000000,Pink


In [12]:
data.to_csv('importance.csv')