Import all libraries and read in the dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from random import randint

# Column names are supplied explicitly through `names`, so pandas does not
# take them from the first line of the file. index_col=False stops pandas
# from using the first column as the row index.
df = pd.read_csv(
    "../Dataset/adult.data",
    index_col=False,
    names=[
        'age', 'workclass', 'fnlwgt', 'education', 'education-num',
        'marital_status', 'occupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
        'income',
    ],
)
df


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,-1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,-1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,-1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,-1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,-1
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,-1
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,-1


Data Pre-processing

In [2]:
# Binarise categorical data: every non-numeric column is replaced by one
# indicator column per distinct value; numeric columns are left as they are.
df = pd.get_dummies(df)

# Split dataset into a training and validation set.
# NOTE(review): test_size=0.6 sends 60% of the rows to `validate` and keeps
# only 40% for `train` -- confirm this ratio is intended.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same trained model inputs) on every run.
train, validate = train_test_split(df, test_size=0.6, random_state=42)
train, validate

(       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
 28781   24   86745             13             0             0              16   
 11166   37  174503             13             0             0              50   
 22434   45  168262             14             0             0              40   
 5022    42  108506              9             0             0              60   
 11789   26   57600             13             0             0              40   
 ...    ...     ...            ...           ...           ...             ...   
 26253   39  195253             15             0             0              45   
 7722    19  374262              8             0             0              20   
 22072   36  166606              7             0             0              40   
 399     52  132178             13             0             0              50   
 7836    26   48718              6          2907             0              40   
 
        income

In [3]:
# Split the training and validation frames into a feature matrix (every
# column except 'income') and a label vector (the 'income' column).
#
# dtype=float forces a homogeneous numeric feature matrix. Recent pandas
# versions emit boolean indicator columns from get_dummies, and converting a
# frame that mixes integer and boolean columns without an explicit dtype
# yields an object array, which makes every later dot product run
# element-by-element in Python.
train_x = train.drop(columns='income').to_numpy(dtype=float)
train_y = train['income'].to_numpy()
validate_x = validate.drop(columns='income').to_numpy(dtype=float)
validate_y = validate['income'].to_numpy()

Implementing the prediction functions

In [4]:
def predict_(model, data):
    """Return the raw (continuous) linear score of a single sample.

    The score is the dot product of the weights and the features, i.e.
    ``sum(model[i] * data[i] for i in range(len(data)))``.

    Args:
        model: 1-D sequence of weights, one per feature.
        data: 1-D sequence of feature values, same length as ``model``.

    Returns:
        The scalar weighted sum of the features (0.0 for empty inputs).

    Raises:
        ValueError: if ``model`` and ``data`` differ in length.
    """
    # np.dot performs the multiply-accumulate in compiled code instead of a
    # Python-level loop, and rejects mismatched lengths instead of silently
    # using only a prefix of the weights.
    return np.dot(model, data)

def predict(model,data):
    """Turn the continuous output of ``predict_`` into a class label.

    Returns the sign of the raw score: +1 for a positive score, -1 for a
    negative one (and 0 when the score is exactly zero).
    """
    raw_score = predict_(model, data)
    return np.sign(raw_score)

Implementing the scoring and error functions of the model

In [5]:
def score(model,X):
    """Return the raw linear score(s) of ``X`` under the weights ``model``.

    ``X`` may be a single sample (1-D) or a matrix of samples (2-D); the
    result is ``X.dot(model)``.
    """
    raw_score = X.dot(model)
    return raw_score

def error(model, X, y):
    """Return the perceptron error of a single sample.

    Args:
        model: 1-D array of weights.
        X: 1-D array holding the features of one sample.
        y: the sample's true label (+1 or -1).

    Returns:
        0 when the sign of the raw score equals ``y``; otherwise the
        absolute value of the raw score.
    """
    # Compute the raw score once and derive both the predicted sign and the
    # error magnitude from it, so the two can never disagree and the dot
    # product is not evaluated twice.
    raw_score = X.dot(model)
    if np.sign(raw_score) == y:
        return 0
    return np.abs(raw_score)
    
def total_error(model, X, y):
    """Return the summed perceptron error over a whole dataset.

    Each misclassified row contributes the absolute value of its raw score;
    correctly classified rows contribute nothing.

    Args:
        model: 1-D array of weights, one per feature.
        X: 2-D array-like of samples, one row per sample.
        y: 1-D array-like of true labels (+1 or -1), one per row of ``X``.

    Returns:
        The total error as a float (0 for an empty dataset).
    """
    if len(X) == 0:
        return 0

    # One matrix-vector product scores every row at once instead of looping
    # over the rows in Python.
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y)
    raw_scores = features.dot(model)

    misclassified = np.sign(raw_scores) != labels
    return float(np.abs(raw_scores[misclassified]).sum())
        

Implementing the training algorithm

In [12]:
# This is one iteration/epoch of the model

def iterate_model(model, X, y, learing_rate = 0.01):
    """Apply one perceptron update for a single sample, in place.

    Args:
        model: 1-D array of weights; it is modified in place.
        X: 1-D array holding the features of one sample.
        y: the sample's true label (+1 or -1).
        learing_rate: step size of the update. The misspelled name is kept
            because callers pass it by keyword.

    Returns:
        The same ``model`` object, after the update.
    """
    pred = predict(model, X)

    # A correctly classified sample gives y - pred == 0, so there is nothing
    # to add to the weights.
    if pred == y:
        return model

    # Update every weight in one vectorised step. Slice assignment keeps the
    # update in place, so the caller's array is modified as before.
    model[:] = model + (y - pred) * np.asarray(X, dtype=float) * learing_rate
    return model

def train_model(X,y, learning_rate=0.01, epochs = 200, patience=5):
    """Train a perceptron with single-sample stochastic updates.

    The weights start at 1.0. On every iteration the total error over the
    whole dataset is recorded, then one randomly chosen sample is used to
    update the weights.

    Args:
        X: 2-D array of training samples, one row per sample.
        y: 1-D array of labels (+1 or -1), one per row of ``X``.
        learning_rate: step size passed to ``iterate_model``.
        epochs: maximum number of iterations (one sample update each).
        patience: training stops once the current total error is no lower
            than it was ``patience`` iterations earlier.

    Returns:
        A ``(model, errors)`` tuple: the weight vector and the list of total
        errors recorded at the start of each iteration.

    Raises:
        ValueError: if ``X`` is empty or ``patience`` is smaller than 1.
    """
    if len(X) == 0:
        raise ValueError("train_model needs at least one training sample")
    if patience < 1:
        raise ValueError("patience must be at least 1")

    model = np.ones(len(X[0]), dtype=float)
    errors = []

    for epoch in range(epochs):
        errors.append(total_error(model, X, y))

        # Only look back once `patience` earlier measurements exist. A
        # negative index would wrap around to the end of the list; with a
        # look-back of 5, iteration 4 would compare the current error with
        # itself and always stop.
        if epoch >= patience and errors[epoch] >= errors[epoch - patience]:
            break

        j = randint(0, len(X) - 1)
        model = iterate_model(model, X[j], y[j], learing_rate=learning_rate)
    return model, errors


In [13]:
# Fit the perceptron on the training split, then display the learned weights
# together with the recorded error history.
model, errors = train_model(X=train_x, y=train_y, epochs=200)
(model, errors)

(array([ 4.2000e-01, -5.8622e+02,  8.2000e-01,  1.0000e+00,  1.0000e+00,
         2.4000e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         9.8000e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  9.8000e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         9.8000e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  9.8000e-01,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  9.8000e-01,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0000e+00,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  9.8000e-01,  9.8000e-01,
         1.0000e+00,  1.0000e+00,  1.0000e+00,  1.0

In [9]:
def validation_error(X,y, model):
    """Return the fraction of samples the model classifies correctly.

    Despite the name, this is an accuracy in [0, 1] where higher is better;
    it is not an error rate.

    Args:
        X: 2-D array-like of samples, one row per sample.
        y: 1-D array-like of true labels (+1 or -1), one per row of ``X``.
        model: 1-D array of weights, one per feature.

    Returns:
        The share of rows whose predicted sign equals the label, as a float.

    Raises:
        ValueError: if ``X`` is empty or ``X`` and ``y`` differ in length.
    """
    features = np.asarray(X, dtype=float)
    labels = np.asarray(y)

    if len(features) == 0:
        raise ValueError("validation_error needs at least one sample")
    if len(features) != len(labels):
        raise ValueError("X and y must contain the same number of samples")

    # Score every row with a single matrix-vector product rather than one
    # Python-level prediction per row.
    predictions = np.sign(features.dot(model))
    return float(np.mean(predictions == labels))

In [14]:
# Fraction of validation rows whose predicted sign matches the label.
validation_error(X=validate_x, y=validate_y, model=model)

0.7609151865690741