In [1]:
import pandas as pd
import numpy as np
import operator

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [2]:
def sigmoid(Z):
    return 1.0/(1.0 + np.exp(-Z))

In [3]:
class LogisticRegression:
    
    theta = None
    threshold = 0.72
    
    def set_threshold(self,t):
        self.threshold = t
    
    def convert_to_class(self, target_list):
        return map((lambda a : 0 if a<=self.threshold else 1), target_list)
        #data['COA_logistic'] = map((lambda a : 0 if a<=0.72 else 1), (list(data['Chance of Admit'].values)))
    
    def compute_precision_recall_f1score(self, y_actual, y_predict):
        y_actual = self.convert_to_class(y_actual)
        tp = 0
        tn = 0
        fp = 0
        fn = 0
        for i in range(0,len(y_actual)):
            if y_actual[i]==0 and y_predict[i]==0:
                #true negative
                tn += 1
            if y_actual[i]==0 and y_predict[i]==1:
                #false positive
                fp += 1
            if y_actual[i]==1 and y_predict[i]==0:
                #false negative
                fn += 1
            if y_actual[i]==1 and y_predict[i]==1:
                #true positive
                tp += 1
        precision = float(tp)/(float(tp)+float(fp))
        recall = float(tp)/(float(tp)+float(fn))
        f1score = 2.0/((1.0/float(precision)) + (1.0/float(recall)))
        return (precision,recall,f1score)
    
    def compute_accuracy(self,y_actual, y_predict):
        y_actual = self.convert_to_class(list(y_actual))
        y_predict = list(y_predict)
        hits = 0
        for i in range(0,len(y_actual)):
            if y_actual[i] == y_predict[i]:
                hits+=1
        return float(hits)/float(len(y_actual))
    
    def predict(self, X):
        Y_pred = list(sigmoid(np.dot(X.values,self.theta.T)))
        Y_pred = self.convert_to_class(sigmoid(np.dot(X,self.theta.T)))
        return Y_pred
    
    def compute_error(self, y_pred, y_actual):
        m = len(y_pred)
        return (-1.0/float(m))*np.sum((y_actual*np.log(y_pred)) + ((1.0-y_actual)*np.log(1.0-y_pred)))
    
    def compute_gradient(self, X, h, Y):
        return np.sum(X*(h-Y), axis=0)
    
    def train(self, X_train, y_train_df, alpha, max_epochs):
        self.theta = None
        self.theta = np.random.rand(1,X_train.shape[1])
        y_train_np = y_train_df.values
        y_train_shape = y_train_np.shape
        Y = np.array(self.convert_to_class(list(y_train_np))).reshape((y_train_shape[0], 1))
        m = len(X_train)
        for i in range(0,max_epochs):
            X = X_train.values
            h = sigmoid(np.dot(X,self.theta.T))
            self.theta = self.theta - alpha*self.compute_gradient(X,h,Y)

In [4]:
data = pd.read_csv("wine_data.csv",delimiter=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,9.2,0.25,0.34,1.2,0.026,31.0,93.0,0.9916,2.93,0.37,11.3,7
1,6.6,0.2,0.27,10.9,0.038,29.0,130.0,0.99496,3.11,0.44,10.5,7
2,5.7,0.22,0.22,16.65,0.044,39.0,110.0,0.99855,3.24,0.48,9.0,6
3,7.2,0.23,0.39,14.2,0.058,49.0,192.0,0.9979,2.98,0.48,9.0,7
4,7.6,0.35,0.47,13.3,0.037,42.0,116.0,0.99822,3.04,0.5,9.2,5


In [5]:
y_vals = list(data['quality'].unique())
for i in range(0,len(y_vals)):
    for j in range((i+1),len(y_vals)):
        data_copy = data.copy()
        for index,row in data_copy.iterrows():
            if row['quality'] == y_vals[i]:
                data_copy.at[index,'quality'] = 0
            elif row['quality'] == y_vals[j]:
                data_copy.at[index,'quality'] = 1
            else:
                data_copy = data_copy.drop([index])
        X_train, X_test, y_train, y_test = train_test_split(
            data_copy[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide','density','pH','sulphates','alcohol']],
            data_copy[['quality']],
            test_size=0.2,
            random_state=0)
        for col in X_train:
            mean = X_train[col].mean()
            std = X_train[col].std()
            X_train[col] = (X_train[col] - mean)/std
            X_test[col] = (X_test[col]-mean)/std
        X_train['Ones'] = [1]*len(X_train)
        X_test['Ones'] = [1]*len(X_test)
        lg = LogisticRegression()
        lg.train(X_train,y_train,0.05,5000)
        y_pred_train = lg.predict(X_train)
        train_acc = lg.compute_accuracy(list(y_train['quality']),y_pred_train)
        #train_precision,train_recall,train_f1score = lg.compute_precision_recall_f1score(list(y_train['quality']),y_pred_train)
        y_pred_test = lg.predict(X_test)
        test_acc = lg.compute_accuracy(list(y_test['quality']),y_pred_test)
        #test_precision,test_recall,test_f1score = lg.compute_precision_recall_f1score(list(y_test['quality']),y_pred_test)

        print str(y_vals[i])+' = 0 and '+str(y_vals[j])+' = 1'
        print
        print '********************TRAINING SET*********************'
        print 'ACCURACY : '+str(train_acc)
        #print 'PRECISION : '+str(train_precision)
        #print 'RECALL : '+str(train_f1score)
        print '*****************************************************'
        print
        print '********************TEST SET*********************'
        print 'ACCURACY : '+str(test_acc)
        #print 'PRECISION : '+str(test_precision)
        #print 'RECALL : '+str(test_f1score)
        print '*****************************************************'
        print
        print

7 = 0 and 6 = 1

********************TRAINING SET*********************
ACCURACY : 0.67298150654
*****************************************************

********************TEST SET*********************
ACCURACY : 0.67027027027
*****************************************************


7 = 0 and 5 = 1

********************TRAINING SET*********************
ACCURACY : 0.775656324582
*****************************************************

********************TEST SET*********************
ACCURACY : 0.783333333333
*****************************************************


7 = 0 and 4 = 1

********************TRAINING SET*********************
ACCURACY : 0.923796791444
*****************************************************

********************TEST SET*********************
ACCURACY : 0.893048128342
*****************************************************


7 = 0 and 8 = 1

********************TRAINING SET*********************
ACCURACY : 0.825333333333
*****************************************************

In [6]:
y_vals = list(data['quality'].unique())
for i in range(0,len(y_vals)):
    data_copy = data.copy()
    for index,row in data_copy.iterrows():
        if row['quality'] == y_vals[i]:
            data_copy.at[index,'quality'] = 0
        else:
            data_copy.at[index,'quality'] = 1
    X_train, X_test, y_train, y_test = train_test_split(
        data_copy[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide','density','pH','sulphates','alcohol']],
        data_copy[['quality']],
        test_size=0.2,
        random_state=0)
    for col in X_train:
        mean = X_train[col].mean()
        std = X_train[col].std()
        X_train[col] = (X_train[col] - mean)/std
        X_test[col] = (X_test[col]-mean)/std
    X_train['Ones'] = [1]*len(X_train)
    X_test['Ones'] = [1]*len(X_test)
    lg = LogisticRegression()
    lg.train(X_train,y_train,0.05,5000)
    y_pred_train = lg.predict(X_train)
    train_acc = lg.compute_accuracy(list(y_train['quality']),y_pred_train)
    #train_precision,train_recall,train_f1score = lg.compute_precision_recall_f1score(list(y_train['quality']),y_pred_train)
    y_pred_test = lg.predict(X_test)
    test_acc = lg.compute_accuracy(list(y_test['quality']),y_pred_test)
    #test_precision,test_recall,test_f1score = lg.compute_precision_recall_f1score(list(y_test['quality']),y_pred_test)

    print str(y_vals[i])+' = 0 and rest = 1'
    print
    print '********************TRAINING SET*********************'
    print 'ACCURACY : '+str(train_acc)
    #print 'PRECISION : '+str(train_precision)
    #print 'RECALL : '+str(train_f1score)
    print '*****************************************************'
    print
    print '********************TEST SET*********************'
    print 'ACCURACY : '+str(test_acc)
    #print 'PRECISION : '+str(test_precision)
    #print 'RECALL : '+str(test_f1score)
    print '*****************************************************'
    print
    print        

7 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.691435053885
*****************************************************

********************TEST SET*********************
ACCURACY : 0.687074829932
*****************************************************


6 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.511627906977
*****************************************************

********************TEST SET*********************
ACCURACY : 0.526077097506
*****************************************************




  


5 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.569200226886
*****************************************************

********************TEST SET*********************
ACCURACY : 0.548752834467
*****************************************************


4 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.937606352808
*****************************************************

********************TEST SET*********************
ACCURACY : 0.942176870748
*****************************************************


8 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.966534316506
*****************************************************

********************TEST SET*********************
ACCURACY : 0.956916099773
*****************************************************


3 = 0 and rest = 1

********************TRAINING SET*********************
ACCURACY : 0.995745887691
***************************************