# Lab Three - Extending Logistic Regression

Use Sex, Age, country, and marital-status to predict education

## Data Preprocessing
The data below is being processed identically to how it was in Lab One

In [573]:
import pandas as pd
import numpy as np
import missingno as mn

initData = pd.read_csv('/home/tommy/Downloads/abalone.data')
initData.groupby(['Rings']).size()
duplicates = initData.duplicated().loc[initData.duplicated() == True].count()
print("Number of duplicate values: ", duplicates)
initData.info()

Number of duplicate values:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Sex             4177 non-null   object 
 1   Length          4177 non-null   float64
 2   Diameter        4177 non-null   float64
 3   Height          4177 non-null   float64
 4   Whole-Weight    4177 non-null   float64
 5   Shucked-Weight  4177 non-null   float64
 6   Viscera-Weight  4177 non-null   float64
 7   Shell-Weight    4177 non-null   float64
 8   Rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


In [574]:
print(initData.groupby(['Rings']).size())
initData.head()

Rings
1       1
2       1
3      15
4      57
5     115
6     259
7     391
8     568
9     689
10    634
11    487
12    267
13    203
14    126
15    103
16     67
17     58
18     42
19     32
20     26
21     14
22      6
23      9
24      2
25      1
26      1
27      2
29      1
dtype: int64


Unnamed: 0,Sex,Length,Diameter,Height,Whole-Weight,Shucked-Weight,Viscera-Weight,Shell-Weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [575]:
# One Hot Encoding Sex
temp = pd.get_dummies(initData['Sex'])
initData = pd.concat([initData, temp], axis=1).drop('Sex', axis=1)
initData.rename(columns={'F': 'Female', 'M': 'Male','I': 'Infant'}, inplace = True)

#Discretizing Ring Size
initData['Ring-Range'] = pd.cut(initData['Rings'],[0,8,10,30],
                                 labels=['1-8','9-10','11-30']) 
del initData['Rings']
initData.head()

Unnamed: 0,Length,Diameter,Height,Whole-Weight,Shucked-Weight,Viscera-Weight,Shell-Weight,Female,Infant,Male,Ring-Range
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,0,0,1,11-30
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,0,0,1,1-8
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,1,0,0,9-10
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,0,0,1,9-10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,0,1,0,1-8


I realize that 1-8, 9-10, and 11-30, are odd choices for ring ranges, but I wanted to ensure the number of entries per range was as similar as possible.

In [576]:
print(initData.groupby(['Ring-Range']).size())

Ring-Range
1-8      1407
9-10     1323
11-30    1447
dtype: int64


In [577]:
print(initData.isnull().values.any())
initData.info()

False
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   Length          4177 non-null   float64 
 1   Diameter        4177 non-null   float64 
 2   Height          4177 non-null   float64 
 3   Whole-Weight    4177 non-null   float64 
 4   Shucked-Weight  4177 non-null   float64 
 5   Viscera-Weight  4177 non-null   float64 
 6   Shell-Weight    4177 non-null   float64 
 7   Female          4177 non-null   uint8   
 8   Infant          4177 non-null   uint8   
 9   Male            4177 non-null   uint8   
 10  Ring-Range      4177 non-null   category
dtypes: category(1), float64(7), uint8(3)
memory usage: 245.0 KB


Next, I'll scale all the continuous variables, so that they rest between 0 and 1.

In [578]:
continuous_vars = ['Length', 'Diameter', 'Height', 'Whole-Weight', 'Shucked-Weight', 'Viscera-Weight', 'Shell-Weight']
for v in continuous_vars:
    # Taken from: https://www.geeksforgeeks.org/normalize-a-column-in-pandas/
    initData[v] = (initData[v]-initData[v].min())/(initData[v].max()-initData[v].min())

initData.head()

Unnamed: 0,Length,Diameter,Height,Whole-Weight,Shucked-Weight,Viscera-Weight,Shell-Weight,Female,Infant,Male,Ring-Range
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0,0,1,11-30
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0,0,1,1-8
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,1,0,0,9-10
3,0.493243,0.521008,0.110619,0.182044,0.14425,0.14944,0.152965,0,0,1,9-10
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.05135,0.053313,0,1,0,1-8


In [579]:
from sklearn import model_selection as ms

y = initData['Ring-Range'].values
X = initData.drop('Ring-Range', axis=1).to_numpy()


X_train, X_test, y_train, y_test = ms.train_test_split(X, y, test_size=0.2)

# temp2 = y_train.to_numpy()
# count1 = (temp2 == '1-8').sum()
# count2 = (temp2 == '9-10').sum()
# count3 = (temp2 == '11-30').sum()
# print(count1, '|', count2, '|', count3)
# temp2 = y_test.to_numpy()
# count1 = (temp2 == '1-8').sum()
# count2 = (temp2 == '9-10').sum()
# count3 = (temp2 == '11-30').sum()
# print(count1, '|', count2, '|', count3)

In [580]:
# np.set_printoptions(threshold=np.inf)
# print(y_train.to_numpy())
#y_train.loc[y_train.isna()]
initData.isnull().values.any()

False

In [581]:
import numpy as np
from scipy.special import expit
class BinaryLogisticRegressionBase:
    # private:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        return 'Base Binary Logistic Regression Object, Not Trainable'
    
    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        return 1/(1+np.exp(-theta)) 
    
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term
    
    # public:
    def predict_proba(self, X, add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction
    

# inherit from base class
class BinaryLogisticRegression(BinaryLogisticRegressionBase):
    #private:
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
        
    def _get_gradient(self,X,y):
        # programming \sum_i (yi-g(xi))xi
        gradient = np.zeros(self.w_.shape) # set gradient to zero
        for (xi,yi) in zip(X,y):
            # the actual update inside of sum
            gradi = (yi - self.predict_proba(xi,add_bias=False))*xi 
            # reshape to be column vector and add to gradient
            gradient += gradi.reshape(self.w_.shape) 
        
        return gradient/float(len(y))
       
    # public:
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 


class VectorBinaryLogisticRegression(BinaryLogisticRegression):
    # inherit from our previous class to get same functionality
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    # but overwrite the gradient calculation
    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        return gradient.reshape(self.w_.shape)

class LogisticRegression:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique value
            y_binary = (y==yval) # create a binary problem
            # train the binary classifier for this class
            blr = VectorBinaryLogisticRegression(self.eta,
                                                 self.iters)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for blr in self.classifiers_:
            probs.append(blr.predict_proba(X)) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return self.unique_[np.argmax(self.predict_proba(X),axis=1)] # take argmax along row

In [582]:
# lr = LogisticRegression(0.1,500)
# lr.fit(X_train,y_train)
# lr

In [583]:
# from sklearn.metrics import accuracy_score

# yhat = lr.predict(X_test)
# print('Accuracy of: ',accuracy_score(y_test,yhat))

In [584]:
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
# from sklearn.datasets import load_iris
# ds = load_iris()
# X = ds.data
# y = ds.target
lr_sk = SKLogisticRegression(solver='liblinear', max_iter=5000) # all params default

lr_sk.fit(X_train,y_train)
yhat = lr_sk.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

Accuracy of:  0.6196172248803827


In [585]:

np.set_printoptions(threshold=np.inf)
initData.drop('Ring-Range', axis=1)

Unnamed: 0,Length,Diameter,Height,Whole-Weight,Shucked-Weight,Viscera-Weight,Shell-Weight,Female,Infant,Male
0,0.513514,0.521008,0.084071,0.181335,0.150303,0.132324,0.147982,0,0,1
1,0.371622,0.352941,0.079646,0.079157,0.066241,0.063199,0.068261,0,0,1
2,0.614865,0.613445,0.119469,0.239065,0.171822,0.185648,0.207773,1,0,0
3,0.493243,0.521008,0.110619,0.182044,0.144250,0.149440,0.152965,0,0,1
4,0.344595,0.336134,0.070796,0.071897,0.059516,0.051350,0.053313,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4172,0.662162,0.663866,0.146018,0.313441,0.248151,0.314022,0.246637,1,0,0
4173,0.695946,0.647059,0.119469,0.341420,0.294553,0.281764,0.258097,0,0,1
4174,0.709459,0.705882,0.181416,0.415796,0.352724,0.377880,0.305431,0,0,1
4175,0.743243,0.722689,0.132743,0.386931,0.356422,0.342989,0.293473,1,0,0


In [586]:
print(X)

[[0.51351351 0.5210084  0.0840708  0.18133522 0.15030262 0.1323239
  0.14798206 0.         0.         1.        ]
 [0.37162162 0.35294118 0.07964602 0.07915707 0.06624075 0.06319947
  0.06826109 0.         0.         1.        ]
 [0.61486486 0.61344538 0.11946903 0.23906499 0.17182246 0.18564845
  0.2077728  1.         0.         0.        ]
 [0.49324324 0.5210084  0.11061947 0.18204356 0.14425017 0.14944042
  0.15296462 0.         0.         1.        ]
 [0.34459459 0.33613445 0.07079646 0.07189658 0.0595158  0.05134957
  0.0533134  0.         1.         0.        ]
 [0.47297297 0.41176471 0.0840708  0.12378254 0.09414929 0.10138249
  0.1180867  0.         1.         0.        ]
 [0.61486486 0.60504202 0.13274336 0.27465911 0.15870881 0.18564845
  0.32735426 1.         0.         0.        ]
 [0.63513514 0.62184874 0.11061947 0.27129449 0.19704102 0.1961817
  0.25759841 1.         0.         0.        ]
 [0.54054054 0.52941176 0.11061947 0.17974146 0.14492266 0.14746544
  0.16292975 0

In [587]:
# # # relationship
# # temp = pd.get_dummies(sacData['relationship'])
# # temp
# # sacData = pd.concat([sacData, temp], axis=1).drop('relationship', axis=1)

# # native-country
# temp = pd.get_dummies(sacData['native-country'])
# temp
# sacData = pd.concat([sacData, temp], axis=1).drop('native-country', axis=1)

# # # occupation
# # temp = pd.get_dummies(sacData['occupation'])
# # temp
# # sacData = pd.concat([sacData, temp], axis=1).drop('occupation', axis=1)

# # #above-50k
# # temp = pd.get_dummies(sacData['above-50k'], drop_first=True)
# # del sacData['above-50k']
# # temp.rename(columns={True: 'above-50k'}, inplace = True)
# # sacData = pd.concat([sacData, temp], axis=1)
# # # initData['above-50k'] = initData['above-50k'].astype('uint8')

# # # race
# # temp = pd.get_dummies(sacData['race'])
# # temp
# # sacData = pd.concat([sacData, temp], axis=1).drop('race', axis=1)

# # # workclass
# # temp = pd.get_dummies(sacData['workclass'])
# # temp
# # sacData = pd.concat([sacData, temp], axis=1).drop('workclass', axis=1)

# # marital-status
# temp = pd.get_dummies(sacData['marital-status'])
# temp
# sacData = pd.concat([sacData, temp], axis=1).drop('marital-status', axis=1)

# # sex
# temp = pd.get_dummies(sacData['sex'], drop_first=True)
# temp
# sacData = pd.concat([sacData, temp], axis=1).drop('sex', axis=1)

# # age (normalization)
# sacData['age'] = (sacData['age']-sacData['age'].min())/(sacData['age'].max()-sacData['age'].min())

# del sacData['workclass']
# del sacData['fnlwgt']
# del sacData['occupation']
# del sacData['relationship']
# del sacData['capital-gain']
# del sacData['capital-loss']
# del sacData['hours-per-week']
# del sacData['above-50k']
# del sacData['race']
# sacData.info()