In [1]:
import numpy as np
import pandas as pd

# Step 1 prepare datasets

## Titanic

In [2]:
Titanic=pd.read_csv("Titanic/tested.csv")

In [3]:
Titanic=Titanic.drop(columns=["PassengerId","Ticket","Name"])

In [4]:
Titanic.loc[pd.isna(Titanic.Cabin)==False,"Cabin"]=1

In [5]:
Titanic.loc[pd.isna(Titanic.Cabin)==True,"Cabin"]=0

In [6]:
Titanic.loc[:,"Sex"]=(Titanic.Sex=="male").astype(int)

In [7]:
Titanic.loc[pd.isna(Titanic.Age)==True,"Age"]=Titanic.Age.median()

In [8]:
Titanic=pd.get_dummies(Titanic,columns=["Embarked"])

## Diabetes

In [9]:
Diabetes=pd.read_csv("diabetes/dataset_37_diabetes.csv")

In [10]:
Diabetes.loc[:,"class"]=(Diabetes["class"]=="tested_positive").astype(int)

## Pokemon

In [11]:
Pokemon=pd.read_csv("Pokemon/pokemon.csv")

In [12]:
Pokemon=Pokemon.filter(regex='^(?!against).*')

In [13]:
Pokemon=Pokemon.drop(columns=["abilities","classfication","pokedex_number","japanese_name","name"])

In [14]:
Pokemon.loc[pd.isna(Pokemon.height_m),"height_m"]=0

In [15]:
Pokemon.loc[pd.isna(Pokemon.percentage_male),"percentage_male"]=0

In [16]:
Pokemon.loc[pd.isna(Pokemon.weight_kg),"weight_kg"]=0

In [17]:
Pokemon.capture_rate=pd.to_numeric(Pokemon.capture_rate,errors="coerce")

In [18]:
Pokemon.loc[pd.isna(Pokemon.capture_rate),"capture_rate"]=255

In [19]:
types1=pd.get_dummies(Pokemon["type1"],columns=["type1"])

In [20]:
types2=pd.get_dummies(Pokemon["type2"],columns=["type2"])

In [21]:
Pokemon=Pokemon.drop(columns=["type1","type2"])

In [22]:
Pokemon=pd.concat([Pokemon,types2+types1], axis=1)

## cancer

In [23]:
Cancer=pd.read_csv("cancer/wdbc.data",header=None)

In [24]:
Cancer.iloc[:,1]=(Cancer[1]=="M").astype(int)

In [25]:
Cancer=Cancer.drop(columns=[0])

## Bank

In [26]:
Bank=pd.read_csv("bank/bank.csv",sep=";")

In [27]:
Bank.loc[:,"default"]=(Bank.default=="yes").astype(int)

In [28]:
Bank.loc[:,"housing"]=(Bank.housing=="yes").astype(int)

In [29]:
Bank.loc[:,"loan"]=(Bank.loan=="yes").astype(int)

In [30]:
Bank.loc[:,"y"]=(Bank.y=="yes").astype(int)

In [31]:
Bank=pd.get_dummies(Bank)

# Measures

In [32]:
import numpy.random as rand

In [64]:
def accuracy(Y,Y_hat):
    temp=Y==Y_hat
    return np.mean(temp)

In [65]:
def precision(Y,Y_hat):
    tp=np.logical_and(Y,Y_hat)
    fp=np.logical_and(np.logical_not(Y),Y_hat)
    return np.sum(tp)/(np.sum(tp)+np.sum(fp))

In [66]:
def recall(Y,Y_hat):
    tp=np.logical_and(Y,Y_hat)
    fn=np.logical_and(Y,np.logical_not(Y_hat))
    return np.sum(tp)/(np.sum(tp)+np.sum(fn))

In [67]:
def F_measure(Y,Y_hat):
    pre=precision(Y,Y_hat)
    re=recall(Y,Y_hat)
    return 2*pre*re/(pre+re)

# IWLS

In [315]:
Y=Pokemon["is_legendary"].to_numpy()

In [316]:
X=Pokemon.drop(columns=["is_legendary"]).to_numpy()

In [317]:
X = np.insert(X, 0, values=1, axis=1)

In [318]:
Beta=np.zeros(X.shape[1])

In [319]:
W=np.ones(X.shape[0])*0.5

In [320]:
eps=10**-10

In [321]:
def prob(X,Beta,bias=0):
    return np.exp(np.clip(X @ Beta + bias,-100,100))/(1+np.exp(np.clip(X @ Beta + bias,-100,100)))

In [322]:
EPOCH=200

In [323]:
for i in range(EPOCH):
    Beta+=np.linalg.inv((X.T * W[:,None].T) @ X) @ X.T @ (Y-prob(X , Beta))

    W=(1-prob(X , Beta)) * prob(X , Beta) + eps

In [324]:
Y_hat=prob(X , Beta)

In [325]:
prob(X , Beta)

array([3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
       3.72007598e-44, 3.72007598e-44, 3.72007598e-44, 3.72007598e-44,
      

In [326]:
Pokemon[Y_hat>0.5]

Unnamed: 0,attack,base_egg_steps,base_happiness,base_total,capture_rate,defense,experience_growth,height_m,hp,percentage_male,...,ghost,grass,ground,ice,normal,poison,psychic,rock,steel,water
143,85,20480,35,580,3.0,100,1250000,1.7,90,0.0,...,0,0,0,1,0,0,0,0,0,0
144,90,20480,35,580,3.0,85,1250000,1.6,90,0.0,...,0,0,0,0,0,0,0,0,0,0
145,100,20480,35,580,3.0,90,1250000,2.0,90,0.0,...,0,0,0,0,0,0,0,0,0,0
149,150,30720,0,780,3.0,70,1250000,2.0,106,0.0,...,0,0,0,0,0,0,1,0,0,0
150,100,30720,100,600,45.0,100,1059860,0.4,100,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,101,30720,0,570,25.0,103,1250000,9.2,97,0.0,...,0,0,0,0,0,0,0,0,1,0
797,181,30720,0,570,255.0,131,1250000,0.3,59,0.0,...,0,1,0,0,0,0,0,0,1,0
798,101,30720,0,570,15.0,53,1250000,5.5,223,0.0,...,0,0,0,0,0,0,0,0,0,0
799,107,30720,0,600,3.0,101,1250000,2.4,97,0.0,...,0,0,0,0,0,0,1,0,0,0


In [327]:
precision(Y,Y_hat>0.5)

1.0

In [328]:
recall(Y,Y_hat>0.5)

1.0

In [329]:
from sklearn.linear_model import LogisticRegression

In [330]:
reg = LogisticRegression(max_iter=400).fit(X, Y)

In [331]:
Y_hat=reg.predict(X)

In [332]:
precision(Y,Y_hat)

0.9545454545454546

In [333]:
recall(Y,Y_hat)

0.9

## GD

In [334]:
def MSE(Y,Y_hat):
    return np.sum((Y-Y_hat)**2)

def MSE_grad(Y,Y_hat):
     return Y_hat-Y

def sigmoid(x):
    return(1/(1 + np.exp(np.clip(-x,-100,100))))

def sigmoid_grad(x):
    return(sigmoid(x)*(1-sigmoid(x)))

def cross_entropy(targets,predictions,  epsilon=1e-12):
    predictions = np.clip(predictions, epsilon, 1. - epsilon)
    ce = abs(np.sum(targets*np.log(predictions)))
    return ce

def cross_entropy_grad(Y,Y_hat,eps=10**-10):
    Y_hat=Y_hat+eps
    return (Y_hat-eps-Y)/Y_hat/(1-Y_hat)

In [335]:
Y=Pokemon["is_legendary"].to_numpy()

In [336]:
X=Pokemon.drop(columns=["is_legendary"]).to_numpy()

In [337]:
X=(X-X.mean(0))/X.std(0)

In [338]:
#Beta=np.random.sample(X.shape[1])

In [339]:
Beta=np.zeros(X.shape[1])

In [340]:
bias=0

In [341]:
EPOCH=500
alpha=0.1

In [342]:
for epoch in range(EPOCH):
    pred=sigmoid(X @ Beta + bias)
    error=cross_entropy(Y,pred)
    print(error)
    Beta=Beta - alpha * (X*(cross_entropy_grad(Y,pred) * sigmoid_grad(X @ Beta+bias)).reshape(-1,1)).mean(axis=0)
    bias=bias - alpha * ((cross_entropy_grad(Y,pred) * sigmoid_grad(X @ Beta+bias)).reshape(-1,1)).mean(axis=0)

48.52030263919617
41.880665341086036
36.884940579063795
33.07962564613778
30.135500101646265
27.820557621172544
25.972226279235564
24.475774366375873
23.249144472946156
22.23270599650584
21.382402094020925
20.665150726982283
20.055726501292217
19.534617804262624
19.086532256838424
18.699338531366756
18.363305984091838
18.07055047242902
17.814624958321907
17.590213191400718
17.39289775368107
17.218982426043656
17.06535471210117
16.92937838421502
16.808808714179843
16.701725017217846
16.60647653554704
16.521638692204327
16.445977475187796
16.378420246911855
16.318031669996024
16.263993736243783
16.21558910855289
16.17218715479732
16.13323218231954
16.09823348162005
16.066756865461972
16.03841745031619
16.012873474857066
15.989820988065361
15.968989269650377
15.95013686966869
15.93304817368781
15.917530415614644
15.90341107314735
15.89053559131186
15.878765388174235
15.86797610394364
15.858056060584637
15.848904903971423
15.840432404720671
15.832557397281601
15.825206839757513
15.81831497

In [347]:
Y_hat=prob(X , Beta, bias)

In [348]:
prob(X , Beta, bias)

array([0.0068239 , 0.00993908, 0.02732099, 0.00901791, 0.01347986,
       0.05260881, 0.00718433, 0.0106178 , 0.03158682, 0.00332569,
       0.00388763, 0.01543841, 0.00326603, 0.00383859, 0.02021072,
       0.00368325, 0.00673396, 0.02504802, 0.00198308, 0.00411125,
       0.0036725 , 0.01058696, 0.00647557, 0.016714  , 0.00903485,
       0.03173984, 0.00501304, 0.01221078, 0.0186813 , 0.03263671,
       0.05882557, 0.00260026, 0.00461601, 0.00873481, 0.02666105,
       0.06462633, 0.01973085, 0.05794864, 0.00679838, 0.01697398,
       0.00546763, 0.01803878, 0.01053127, 0.0173471 , 0.02909793,
       0.00882343, 0.01626214, 0.00692412, 0.01775524, 0.00644595,
       0.01716515, 0.00322896, 0.00807805, 0.01232038, 0.03288303,
       0.01230301, 0.02755104, 0.014787  , 0.04727075, 0.01283567,
       0.02039063, 0.04388232, 0.00923956, 0.01639789, 0.0470516 ,
       0.00634885, 0.01155219, 0.02064554, 0.00993907, 0.01786989,
       0.02992953, 0.02059843, 0.05119585, 0.00472683, 0.00865

In [349]:
Pokemon[Y_hat>0.5]

Unnamed: 0,attack,base_egg_steps,base_happiness,base_total,capture_rate,defense,experience_growth,height_m,hp,percentage_male,...,ghost,grass,ground,ice,normal,poison,psychic,rock,steel,water
143,85,20480,35,580,3.0,100,1250000,1.7,90,0.0,...,0,0,0,1,0,0,0,0,0,0
144,90,20480,35,580,3.0,85,1250000,1.6,90,0.0,...,0,0,0,0,0,0,0,0,0,0
145,100,20480,35,580,3.0,90,1250000,2.0,90,0.0,...,0,0,0,0,0,0,0,0,0,0
149,150,30720,0,780,3.0,70,1250000,2.0,106,0.0,...,0,0,0,0,0,0,1,0,0,0
150,100,30720,100,600,45.0,100,1059860,0.4,100,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
796,101,30720,0,570,25.0,103,1250000,9.2,97,0.0,...,0,0,0,0,0,0,0,0,1,0
797,181,30720,0,570,255.0,131,1250000,0.3,59,0.0,...,0,1,0,0,0,0,0,0,1,0
798,101,30720,0,570,15.0,53,1250000,5.5,223,0.0,...,0,0,0,0,0,0,0,0,0,0
799,107,30720,0,600,3.0,101,1250000,2.4,97,0.0,...,0,0,0,0,0,0,1,0,0,0


In [350]:
precision(Y,Y_hat>0.5)

0.9552238805970149

In [351]:
recall(Y,Y_hat>0.5)

0.9142857142857143

# SGD

In [409]:
Y=Pokemon["is_legendary"].to_numpy()

In [410]:
X=Pokemon.drop(columns=["is_legendary"]).to_numpy()

In [411]:
X=(X-X.mean(0))/X.std(0)

In [412]:
#Beta=np.random.sample(X.shape[1])

In [413]:
Beta=np.zeros(X.shape[1])

In [414]:
bias=0

In [415]:
EPOCH=500
alpha=0.1
batch_size=150

In [416]:
batches=np.linspace(0,X.shape[0],int(X.shape[0]/batch_size)+1).astype(int)
for epoch in range(EPOCH):
    s = np.arange(0, X.shape[0], 1)
    np.random.shuffle(s)
    X = X[s,:]
    Y = Y[s]
    for b in range(len(batches)-1):
        X_b=X[batches[b]:batches[b+1],:]
        Y_b=Y[batches[b]:batches[b+1]]
        pred=sigmoid(X_b @ Beta + bias)
        error=cross_entropy(Y_b,pred)
        Beta=Beta - alpha * (X_b*(cross_entropy_grad(Y_b,pred) * sigmoid_grad(X_b @ Beta+bias)).reshape(-1,1)).mean(axis=0)
        bias=bias - alpha * ((cross_entropy_grad(Y_b,pred) * sigmoid_grad(X_b @ Beta+bias)).reshape(-1,1)).mean(axis=0)
    
    pred=sigmoid(X @ Beta + bias)
    error=cross_entropy(Y,pred)
    print(error)

27.816271579658217
21.12325084402702
18.717813389626027
17.320728405923244
16.74975385300942
16.25709956968026
16.11320306117619
15.999766802290202
15.89198347572497
15.883029966630971
15.818365240108095
15.792362804965922
15.75349022825959
15.728181842806798
15.718283504566848
15.696373726594413
15.654920782638433
15.582635730011317
15.529963660830216
15.523588102954257
15.500486244233274
15.462877653726096
15.446802028562807
15.397472719243275
15.358571233541548
15.313522887474269
15.253367592994298
15.200580635568262
15.120477085379296
15.091466279545848
15.04782563677367
14.98461944615616
14.938282281449952
14.876882846815874
14.817857258091571
14.761849264090166
14.719453292419018
14.681709718926212
14.6275507016563
14.589814072838628
14.55222898577659
14.501212449520786
14.466252543394393
14.402880273105264
14.360249621283371
14.319591241906146
14.284164974996827
14.2548132201561
14.200622890566166
14.162300753564022
14.121427329361577
14.07837102077018
14.02438839102162
13.98418

In [417]:
Y_hat=prob(X , Beta, bias)

In [418]:
prob(X , Beta)

array([0.46073209, 0.33122858, 0.32706696, 0.95881509, 0.15690236,
       0.99971221, 0.07914283, 0.00343709, 0.09163227, 0.33123749,
       0.95591491, 0.72585602, 0.99999687, 0.32498324, 0.70540126,
       0.59853348, 0.16044443, 0.02225026, 0.38261677, 0.13598224,
       0.53055452, 0.00712162, 0.78269308, 0.5947421 , 0.57364612,
       0.31976727, 0.5212497 , 0.00740175, 0.92800654, 0.11323848,
       0.29573933, 0.19656469, 0.49319253, 0.99996677, 0.99990363,
       0.01160281, 0.12621489, 0.0101515 , 0.19398083, 0.91786481,
       0.56579725, 0.06278064, 0.04444121, 0.60031012, 0.11194875,
       0.00321733, 0.15445073, 0.50032772, 0.19522853, 0.81541972,
       0.52908145, 0.34253316, 0.45501008, 0.09511426, 0.10599665,
       0.20619058, 0.8119265 , 0.68377162, 0.3433456 , 0.10095276,
       0.99917356, 0.25758641, 0.52980606, 0.01105972, 0.86040129,
       0.99989467, 0.02117491, 0.00230706, 0.07063067, 0.41371957,
       0.51529308, 0.15837714, 0.15772891, 0.01481433, 0.35492

In [419]:
Pokemon[Y_hat>0.5]

Unnamed: 0,attack,base_egg_steps,base_happiness,base_total,capture_rate,defense,experience_growth,height_m,hp,percentage_male,...,ghost,grass,ground,ice,normal,poison,psychic,rock,steel,water
5,104,5120,70,634,45.0,78,1059860,1.7,78,88.1,...,0,0,0,0,0,0,0,0,0,0
12,35,3840,70,195,255.0,30,1000000,0.3,40,50.0,...,0,0,0,0,0,1,0,0,0,0
33,102,5120,70,505,45.0,77,1059860,1.4,81,100.0,...,0,0,1,0,0,1,0,0,0,0
34,45,2560,140,323,150.0,48,800000,0.6,70,24.6,...,0,0,0,0,0,0,0,0,0,0
60,65,5120,70,385,120.0,65,1059860,1.0,65,50.0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
772,95,30720,0,570,3.0,95,1250000,2.3,95,0.0,...,0,0,0,0,1,0,0,0,0,0
778,105,3840,70,475,80.0,70,1000000,0.9,68,50.0,...,0,0,0,0,0,0,1,0,0,1
779,60,5120,70,485,70.0,85,1000000,3.0,78,50.0,...,0,0,0,0,1,0,0,0,0,0
780,131,6400,70,517,25.0,100,1000000,3.9,70,0.0,...,1,1,0,0,0,0,0,0,0,0


In [420]:
precision(Y,Y_hat>0.5)

0.9577464788732394

In [421]:
recall(Y,Y_hat>0.5)

0.9714285714285714