# Logistic Regression Using Scikit-Learn

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Read the wine data file into a DataFrame and print it as a quick look at
# the columns and values.
df = pd.read_csv(filepath_or_buffer="datasets/Winedata.txt")
print(df)

     class  Alcohol  Malic acid   Ash  Alcalinity of ash  Magnesium  \
0        1    14.23        1.71  2.43               15.6        127   
1        1    13.20        1.78  2.14               11.2        100   
2        1    13.16        2.36  2.67               18.6        101   
3        1    14.37        1.95  2.50               16.8        113   
4        1    13.24        2.59  2.87               21.0        118   
5        1    14.20        1.76  2.45               15.2        112   
6        1    14.39        1.87  2.45               14.6         96   
7        1    14.06        2.15  2.61               17.6        121   
8        1    14.83        1.64  2.17               14.0         97   
9        1    13.86        1.35  2.27               16.0         98   
10       1    14.10        2.16  2.30               18.0        105   
11       1    14.12        1.48  2.32               16.8         95   
12       1    13.75        1.73  2.41               16.0         89   
13    

In [5]:
# Target labels: the wine class of every row.
y = df['class']
# Feature matrix: every column except the label.
# * `axis` is passed by keyword because pandas >= 2.0 no longer accepts it
#   positionally (`df.drop(['class'], 1)` raises TypeError there).
# * drop() returns a new frame instead of mutating `df` in place, so this
#   cell can be re-run safely: `df['class']` above still exists on a second
#   execution.
X = np.array(df.drop(['class'], axis=1))
print(X.shape)

(178, 13)


In [6]:
# Preprocessing: standardize the feature matrix with StandardScaler.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training split only.
newX = scaler.fit_transform(X)

In [7]:
# Splitting data: hold out 30% of the samples as a test set; the fixed
# random_state makes the split reproducible.
# `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `train_test_split` lives in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    newX, y, test_size=0.30, random_state=33
)



Now let's import the Logistic Regression model from scikit-learn's `linear_model` module.

In [8]:
from sklearn.linear_model import LogisticRegression
# Create the classifier; no arguments are passed, so the library's default
# hyperparameters are used.
clf = LogisticRegression()

In [9]:
# Train on the training split, then measure accuracy on the held-out test
# split. fit() returns the estimator itself, so the two calls can be chained.
accuracy = clf.fit(X_train, y_train).score(X_test, y_test)
print(accuracy)

0.9814814814814815


Now we will build a simple function to evaluate classifiers. This function uses K-fold cross-validation and the resulting cross-validation scores to show how well the classifier performs on the data.

In [10]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; its contents
# live in `sklearn.model_selection` (available since 0.18).
from sklearn.model_selection import cross_val_score, KFold
from scipy.stats import sem

def evaluate_cross_validation(clf, X, y, K):
    """Print the K-fold cross-validation scores of ``clf`` on ``X``/``y``.

    Parameters
    ----------
    clf : estimator
        Classifier passed to ``cross_val_score``.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels, one per row of ``X``.
    K : int
        Number of cross-validation folds.

    Prints the array of per-fold scores followed by their mean and the
    standard error of the mean. Returns ``None``.
    """
    # Create a k-fold cross-validation iterator. The splitter now takes only
    # the number of folds (the data length is inferred when splitting); the
    # old ``KFold(len(y), K, ...)`` call fails against the current signature.
    # Shuffling with a fixed seed keeps the folds reproducible.
    cv = KFold(n_splits=K, shuffle=True, random_state=0)
    # By default the score used is the one returned by the estimator's own
    # score method (accuracy for classifiers).
    scores = cross_val_score(clf, X, y, cv=cv)
    print (scores)
    print (("Mean score: {0:.3f} (+/-{1:.3f})").format(
        np.mean(scores), sem(scores)))

In [11]:
# 5-fold cross-validation of the classifier on the training split.
evaluate_cross_validation(clf, X=X_train, y=y_train, K=5)

[1.         0.96       0.96       1.         0.95833333]
Mean score: 0.976 (+/-0.010)


Now we will build another function that uses `sklearn.metrics` to show how a classifier performs on the data by printing the confusion matrix and the classification report. These two tools are really powerful for assessing the performance of a classifier.

In [12]:
from sklearn import metrics

def train_and_evaluate(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print an evaluation summary.

    The classifier is fitted in place on ``X_train``/``y_train``. The function
    then prints, in this order: the score on the training split, the score on
    the test split, the classification report and the confusion matrix for
    the predictions made on ``X_test``. Nothing is returned.
    """
    clf.fit(X_train, y_train)

    # Score on each split, heading first and value second.
    for heading, features, labels in (
        ("Accuracy on training set:", X_train, y_train),
        ("Accuracy on testing set:", X_test, y_test),
    ):
        print(heading)
        print(clf.score(features, labels))

    predicted = clf.predict(X_test)

    # Both summaries compare the true test labels with the predictions.
    for heading, summarize in (
        ("Classification Report:", metrics.classification_report),
        ("Confusion Matrix:", metrics.confusion_matrix),
    ):
        print(heading)
        print(summarize(y_test, predicted))

In [13]:
# Refit the classifier on the training split and print the full evaluation
# summary for both splits.
train_and_evaluate(
    clf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Accuracy on training set:
1.0
Accuracy on testing set:
0.9814814814814815
Classification Report:
             precision    recall  f1-score   support

          1       0.94      1.00      0.97        15
          2       1.00      0.95      0.98        21
          3       1.00      1.00      1.00        18

avg / total       0.98      0.98      0.98        54

Confusion Matrix:
[[15  0  0]
 [ 1 20  0]
 [ 0  0 18]]


As we can see, Logistic Regression does really well at classifying the wines into three different groups based on their features.