In [1]:
import math
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

from warnings import filterwarnings
filterwarnings('ignore') 

In [2]:
MAXITER = 100

compare our logisitic regression to `SGDClassifier`

In [3]:
toyX = pd.read_csv('data/simple_data.csv') 
toyY = toyX.pop("stroke") #remove target column

alpha = .1 #learning rate

sgd = SGDClassifier(loss='log', max_iter=MAXITER,shuffle=False, tol=None, penalty='none', learning_rate='constant', eta0 = alpha)
sgd.fit(toyX,toyY)

SGDClassifier(eta0=0.1, learning_rate='constant', loss='log', max_iter=100,
              penalty='none', shuffle=False, tol=None)

In [4]:
def printWeights(features, weights):
    """
    print model weights
    """
    if "bias" not in features:
        features = list(features)+["bias"]
    print("\t%30s %10s" % ("Feature", "Weight"))
    for i in range(len(features)):
        print("\t%30s %10.3f" % (features[i], weights[i]))

def normalize(X):
    """
    normalize X to range 0-1
    """
    X = X - X.min() # start at 0
    normalizedX = X/X.max() # set max to 1
    return normalizedX

In [5]:
class LogisticRegression(object):

    def __init__(self, features, labels, alpha, lmbda = 0):
        self.alpha = alpha #learning rate
        self.labels = labels
        self.features = features
        self.lmbda = lmbda#regularization term
        self.labelToNumberDict = {}
        for i in range(len(labels)):#are unique
            self.labelToNumberDict[labels[i]] = i 
        

In [6]:
lr = LogisticRegression(toyX.columns, toyY.unique(),alpha) # create a model similar to the instance SGDClassifier

In [7]:
class LogisticRegression(LogisticRegression):
    
    def loss(self, xi):
        npWeights = self.weights#.to_numpy()
        npXi = xi.to_numpy()
        return 1/(1+np.exp(-1*np.dot(npWeights, npXi)))
    
    def fit(self, Xinput, Y):
        """
        train model weights via gradient descent
        """
        X = Xinput.copy()
        X["bias"] = 1#add a bias
        self.weights = np.zeros(len(X.columns))
        self.w = self.weights
        for z in range(MAXITER):
            gradient = np.zeros(len(X.columns))
            for index, xi in X.iterrows(): #might be problem
                p = self.loss(xi)
                yi = self.labelToNumberDict[Y[index]]
                error = p - yi

                for j in range(len(X.columns)):
                  gradient[j] = error*xi.iloc[j]
                self.weights -= alpha*gradient


In [8]:
lr =  LogisticRegression(toyX.columns, toyY.unique(),alpha) # create our model
lr.fit(toyX,toyY) #fit the same data as the SGDClassifier

print("Coefficients and intercept for SKlearn SGDClassifier: ")
weights = list(sgd.coef_[0]) + list(sgd.intercept_) 
printWeights(toyX.columns,  weights)

print("Coefficients and intercept for Logistic Regression: ")
printWeights(toyX.columns,  lr.w)

Coefficients and intercept for SKlearn SGDClassifier: 
	                       Feature     Weight
	                           age      1.967
	                    cholestrol     -1.888
	                          bias     -4.422
Coefficients and intercept for Logistic Regression: 
	                       Feature     Weight
	                           age      1.967
	                    cholestrol     -1.888
	                          bias     -4.422


In [9]:
class LogisticRegression(LogisticRegression):
    
    def predict(self, Xinput):
        """
        returns array of predicted labels for each exmple in Xinput
        """
        X = Xinput.copy()
        X["bias"] = 1#add a bias
        predictions = []
        #print(f"X type: {type(X)}, X: {X}")
        for index, xi in X.iterrows():
            predictions.append(self.labels[int(np.round(self.loss(xi)))])
        return predictions

    def predict_prob(self, Xinput):
        """
        returns a 2D array of size [# examples, # classes] containing the probability 
        of each prediction.  
        """
        X = Xinput.copy()
        X["bias"] = 1#add a bias
        predictions = []
        for index, xi in X.iterrows():
            loss = self.loss(xi)
            temp = []
            temp.append(loss)
            temp.append(abs(1-loss))
            predictions.append(temp)
        return predictions
        
    def get_weights(self):
        return self.weights
    
    def print_errors(self, X_test, y_test, predictions, num_errors):
        print(str(num_errors) + " Errors: ")
        for i in range(len(predictions)):
            if num_errors == 0:
                break
            correct = y_test.iloc[i]
            if predictions[i] != correct:
                print("X_test: ", X_test.iloc[i], "\n prediction: ", predictions[i], "actual: ", y_test.iloc[i])
                print('\n')
                num_errors -= 1

In [10]:
print("Accuracy and Predictions for SKlearn SGDClassifier: ")
print("Accuracy: %0.2f%%" % (sgd.score(toyX,toyY)*100))
predictions = sgd.predict(toyX)
print("%12s %12s" % ("Prediction", "Truth"))
print("-"*25)
for i in range(len(predictions)):
    print("%12s %12s" % (predictions[i], toyY[i]))
print()

lr = LogisticRegression(toyX.columns, toyY.unique(),alpha) # create a model similar to the SGDClassifier above
lr.fit(toyX,toyY) #fit the same data as the SGDClassifier
print("Accuracy and Predictions for Logistic Regression: ")
print("Accuracy: %0.2f%%" % (len(toyY[toyY == lr.predict(toyX)])/len(toyY)*100))
predictions = lr.predict(toyX)
print("%12s %12s" % ("Prediction", "Truth"))
print("-"*25)
for i in range(len(predictions)):
    print("%12s %12s" % (predictions[i], toyY[i]))

lr.predict_prob(toyX)

Accuracy and Predictions for SKlearn SGDClassifier: 
Accuracy: 100.00%
  Prediction        Truth
-------------------------
         Neg          Neg
         Pos          Pos
         Neg          Neg
         Pos          Pos
         Neg          Neg
         Pos          Pos
         Neg          Neg

Accuracy and Predictions for Logistic Regression: 
Accuracy: 100.00%
  Prediction        Truth
-------------------------
         Neg          Neg
         Pos          Pos
         Neg          Neg
         Pos          Pos
         Neg          Neg
         Pos          Pos
         Neg          Neg


[[0.0850105415875708, 0.9149894584124292],
 [0.6487723414548504, 0.3512276585451496],
 [0.39909478000660176, 0.6009052199933982],
 [0.8202769035004075, 0.1797230964995925],
 [0.19276798260874362, 0.8072320173912564],
 [0.9890997470995166, 0.010900252900483354],
 [0.012830250993056852, 0.9871697490069431]]

## Test with mammal_train.csv and mammal_test.csv

In [11]:
class LogisticRegression(LogisticRegression):
    def score(self, X_test, y_test):#score by accuracy: # correct / total predictions
        predictions = self.predict(X_test)
        numCorrect = 0
        for i in range(len(predictions)):
            correct = y_test.iloc[i]
            if predictions[i] == correct:
                numCorrect += 1
        return float(numCorrect)/float(len(y_test))

In [12]:
mammal_train = pd.read_csv('data/mammal_train.csv')
mammal_test = pd.read_csv('data/mammal_test.csv')

# shuffle the examples 
mammal_train = mammal_train.sample(frac=1).reset_index(drop=True)
mammal_test = mammal_test.sample(frac=1).reset_index(drop=True)
mammal_train["legs"] /= 8 # normalize legs so between 0 and 1
mammal_test["legs"] /= 8 # normalize legs so between 0 and 1

ytrain = mammal_train.pop("animalType")
ytest = mammal_test.pop("animalType")

mammalLR = LogisticRegression(mammal_train.columns, ytrain.unique(), alpha)
mammalLR.fit(mammal_train, ytrain)
mammalPred = mammalLR.predict(mammal_test)

print("Accuracy and Predictions for Logistic Regression: ")
print("Accuracy: %0.2f%%" % (mammalLR.score(mammal_test, ytest)*100))

print("Coefficients and intercept for Logistic Regression: ")
printWeights(mammal_train.columns,  mammalLR.get_weights())

Accuracy and Predictions for Logistic Regression: 
Accuracy: 100.00%
Coefficients and intercept for Logistic Regression: 
	                       Feature     Weight
	                          hair      2.926
	                      feathers     -1.355
	                          eggs     -3.682
	                          milk      4.330
	                      airborne     -1.426
	                       aquatic     -0.601
	                      predator     -0.649
	                       toothed      0.361
	                      backbone      0.441
	                      breathes      0.484
	                      venomous     -1.511
	                          fins     -0.574
	                          legs     -0.523
	                          tail     -0.216
	                      domestic     -0.149
	                       catsize      1.999
	                          bias     -1.804


## Test with titantic.csv

In [13]:
titanicX  = pd.read_csv('data/titanic.csv')
titanicY = titanicX.pop("Survived")
titanicX = normalize(titanicX)

for i in range(2):
    alpha = i
    titanicLR = LogisticRegression(titanicX.columns, titanicY.unique(), alpha)
    titanicLR.fit(titanicX, titanicY)
    titanicPredictions = titanicLR.predict(titanicX)

    print("==== Results for alpha =  " + str(alpha) + "====")
    print("Accuracy and Predictions for Logistic Regression: ")
    print("Accuracy: %0.2f%%" % (titanicLR.score(titanicX, titanicY)*100))
    titanicLR.print_errors(titanicX, titanicY, titanicPredictions, 10)

    print("Coefficients and intercept for Logistic Regression: ")
    printWeights(titanicX.columns,  titanicLR.get_weights())

==== Results for alpha =  0====
Accuracy and Predictions for Logistic Regression: 
Accuracy: 61.62%
10 Errors: 
X_test:  Pclass    0.000000
Sex       1.000000
SibSp     0.125000
Parch     0.000000
Fare      0.139136
Name: 1, dtype: float64 
 prediction:  0 actual:  1


X_test:  Pclass    1.000000
Sex       1.000000
SibSp     0.000000
Parch     0.000000
Fare      0.015469
Name: 2, dtype: float64 
 prediction:  0 actual:  1


X_test:  Pclass    0.000000
Sex       1.000000
SibSp     0.125000
Parch     0.000000
Fare      0.103644
Name: 3, dtype: float64 
 prediction:  0 actual:  1


X_test:  Pclass    1.000000
Sex       1.000000
SibSp     0.000000
Parch     0.333333
Fare      0.021731
Name: 8, dtype: float64 
 prediction:  0 actual:  1


X_test:  Pclass    0.500000
Sex       1.000000
SibSp     0.125000
Parch     0.000000
Fare      0.058694
Name: 9, dtype: float64 
 prediction:  0 actual:  1


X_test:  Pclass    1.000000
Sex       1.000000
SibSp     0.125000
Parch     0.166667
Fare      0.0

In [14]:
# Try to predict for multiple classes

class LogisticRegression(LogisticRegression):
    
    def binary_fit(self, Xinput, Y):#binary version of fit, copied from previous predict function
        """
        train model weights via gradient descent
        """
        X = Xinput.copy()
        X["bias"] = 1#add a bias
        self.weights = np.zeros(len(X.columns))
        self.w = self.weights
        for z in range(MAXITER):
            gradient = np.zeros(len(X.columns))
            for index, xi in X.iterrows(): #might be problem
                p = self.loss(xi)
                yi = self.labelToNumberDict[Y[index]]
                error = p - yi

                for j in range(len(X.columns)):
                  gradient[j] = error*xi.iloc[j]
                self.weights -= alpha*gradient
    
    def binary_predict(self, Xinput):#binary version of predict, copied from previous predict function
        """
        classifies the examples in X.  returns the predicted label for each example in X.
        """
        X = Xinput.copy()
        X["bias"] = 1#add a bias
        predictions = []
        for index, xi in X.iterrows():
            predictions.append(self.labels[int(np.round(self.loss(xi)))])
        return predictions
    
    def fit(self, Xinput, Y):
        """
        multiclass version of the fit function that uses binary fit. 
        train a model for every class that can determine the probability that a given example belongs to a given class.
        """
        self.classToModel = {}
        for k in self.labels:
            y_k = Y.copy()
            for i in range(len(y_k)):
                if y_k.iloc[i] == k:
                    y_k.iloc[i] = 'Pos' #set to a positive 1
                else:
                    y_k.iloc[i] = 'Neg'
            model_k = LogisticRegression(Xinput.columns, y_k.unique(), alpha)
            model_k.binary_fit(Xinput, y_k)
            self.classToModel[k] = model_k
    
    def predict(self, Xinput):#multiclass version of predict
        """
        to classify an example, ask each model for the probability that the example belongs to its class,
        and return the class with the highest probability
        """
        final_predictions = []
        for i in range(len(Xinput)):#examples in probabilities
            exampleXi = Xinput.iloc[[i]] # example row as a dataframe
            positive_prob_k = []#for each class, 
            for k in self.labels:#for each example, predict probability of it being a given model. 
                model_k = self.classToModel[k]
                prob_k = model_k.predict_prob(exampleXi)#assuming that prediction and probability will coincide. 
                prediction = model_k.binary_predict(exampleXi)
                if prediction[0] == 'Pos':#append chance that it is positive for a given class k
                    positive_prob_k.append(max(prob_k[0]))# 'Pos' probability will be the largest one
                else:
                    positive_prob_k.append(min(prob_k[0]))# 'Neg' probability will be the largest one. 
            #use maximum probability to predict the example. 
            maxProbK = 0#find the class with the maximum probability of being positive. 
            maxK = self.labels[0] #must be at least 1 label
            for kIndex in range(len(self.labels)):
                currentProbK = positive_prob_k[kIndex]#probability that our one example is positive for this class k. 
                if currentProbK > maxProbK:
                    maxProbK = currentProbK
                    maxK = self.labels[kIndex]
            final_predictions.append(maxK)
        return final_predictions
    
    def score(self, X_test, y_test):#score by accuracy: # correct / total predictions
        predictions = self.predict(X_test)
        numCorrect = 0
        for i in range(len(predictions)):
            correct = y_test.iloc[i]
            if predictions[i] == correct:
                numCorrect += 1
        return float(numCorrect)/float(len(y_test))

In [15]:
def toyTest():
    print("\ntoy test")
    mc = LogisticRegression(toyX.columns, toyY.unique(),alpha) 
    print(f"type(toyX): {type(toyX)}")
    mc.fit(toyX,toyY)
    print(f"type(toyX): {type(toyX)}")
    predictions = mc.predict(toyX)
    print(f"multi logictic predicitons:\n {predictions}")
    print(f"toy labels: {toyY.to_markdown()}")

def digitsTest():
    print("digits test")
    digits = datasets.load_digits(as_frame=True)
    print(len(digits))
    print(f"digits type: {type(digits)}")
    X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.4, random_state=2)
    digitsX = pd.DataFrame(digits.data, columns=digits.feature_names)
    digitsX['target'] = pd.Series(digits.target)
    digitsY = digitsX.pop("target")
    print("train-test split finished")
    
    mc = LogisticRegression(digitsX.columns, digitsY.unique(),alpha)
    print(f"type(X_train): {type(X_train)}")
    print(f"digitsY: {digitsY}")
    print(f"y_train: {y_train}")
    print("training...")
    mc.fit(digitsX, digitsY)
    print("done training")
    print(f"score: {mc.score(digitsX, digitsY)}")

def zooTest():
    #this part works
    print("\nzoo test")
    X = pd.read_csv('data/zoo.csv')
    Y = X.pop("animalType")
    print("load data")
    mc = LogisticRegression(X.columns, Y.unique(),alpha)
    print("fitting")
    mc.fit(X, Y)#use whole training set
    print("done training")
    print(f"accuracy should be 1.0: {mc.score(X, Y)}")
    
    #train-test split
    X_train, X_test, y_train, y_test = train_test_split(X.copy(), Y.copy(), test_size=0.4, random_state=2)
    mc.fit(X_train, y_train)
    print("done training")
    print(f"accuracy for train-test split: {mc.score(X_test, y_test)}")

toyTest()
zooTest()


toy test
type(toyX): <class 'pandas.core.frame.DataFrame'>
type(toyX): <class 'pandas.core.frame.DataFrame'>
multi logictic predicitons:
 ['Neg', 'Neg', 'Neg', 'Pos', 'Neg', 'Pos', 'Neg']
toy labels: |    | stroke   |
|---:|:---------|
|  0 | Neg      |
|  1 | Pos      |
|  2 | Neg      |
|  3 | Pos      |
|  4 | Neg      |
|  5 | Pos      |
|  6 | Neg      |

zoo test
load data
fitting
done training
accuracy should be 1.0: 1.0
done training
accuracy for train-test split: 0.975609756097561


In [16]:
digitsTest()#takes about 10 minutes to finish running

digits test
7
digits type: <class 'sklearn.utils.Bunch'>
train-test split finished
type(X_train): <class 'pandas.core.frame.DataFrame'>
digitsY: 0       0
1       1
2       2
3       3
4       4
       ..
1792    9
1793    0
1794    8
1795    9
1796    8
Name: target, Length: 1797, dtype: int64
y_train: 399     3
28      8
641     9
821     3
584     4
       ..
1558    3
1608    6
493     1
527     1
1192    5
Name: target, Length: 1078, dtype: int64
training...
done training
score: 0.9404563160823595
