In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import recall_score, f1_score, accuracy_score, classification_report

In [2]:
def sigmoid(x):
    t = x / np.max(x)
    return 1. / (1. + np.exp(-t))

In [3]:
train_df = pd.read_csv("../Dodge/data/defect/camel-1.2.csv")
test_df = pd.read_csv("../Dodge/data/defect/camel-1.4.csv")

In [4]:
x_train = train_df.iloc[:,3:-1]
y_train = train_df["bug"]
y_train = y_train.map(lambda p: 0 if p == 0 else 1)
x_test = test_df.iloc[:,3:-1]
y_test = test_df["bug"]
y_test = y_test.map(lambda p: 0 if p == 0 else 1)

In [5]:
x_train.shape

(608, 20)

In [6]:
x_train = np.concatenate((x_train, np.ones((x_train.shape[0], 1))), axis=1)

In [7]:
x_train.shape

(608, 21)

In [8]:
x_test = np.concatenate((x_test, np.ones((x_test.shape[0], 1))), axis=1)

In [9]:
def hypothesis(theta, x):
    return sigmoid(np.dot(x, theta))

In [15]:
theta = np.random.randn(x_train.shape[1])

In [16]:
idx = np.where(y_train == 1)
w = np.ones((x_train.shape[0],))
w[idx] = len(y_train) * .5 / sum(y_train)

In [17]:
def cost(y, yhat):
    return -1./len(y) * (sum(w * y * np.log(yhat) + (1.- w * y) * (1. - np.log(yhat))))

In [18]:
yhat = hypothesis(theta, x_train)

In [19]:
cost(y_train, yhat)

-0.505357120570933

In [171]:
def step():
    global theta, x_train, yhat, w, y_train
    theta -= 0.2 / x_train.shape[0] * sum(np.dot(x_train.T, (yhat -  y_train)))

In [172]:
step()

In [173]:
cost(y_train, yhat)

-0.44337301914425425

In [174]:
for i in range(100):
    yhat = hypothesis(theta, x_train)
    print("it", i, "|", "cost", cost(y_train, yhat))
    step()

it 0 | cost -0.5052669982138362
it 1 | cost -0.5050428678917611
it 2 | cost nan
it 3 | cost -0.505376414072418
it 4 | cost -0.5053636541232791
it 5 | cost -0.5053429546110455
it 6 | cost -0.5053035986399644
it 7 | cost -0.5051999850065314
it 8 | cost -0.5042073242090025
it 9 | cost -0.5054730708184334
it 10 | cost -0.5055350937675329
it 11 | cost -0.5055414510525221
it 12 | cost -0.5055437845824049
it 13 | cost -0.5055449956556664
it 14 | cost -0.5055457370574965
it 15 | cost -0.5055462376143469
it 16 | cost -0.5055465982803843
it 17 | cost -0.505546870506684
it 18 | cost -0.5055470832693174
it 19 | cost -0.5055472541328158
it 20 | cost -0.5055473943642422
it 21 | cost -0.5055475115232918
it 22 | cost -0.505547610871229
it 23 | cost -0.505547696182684
it 24 | cost -0.5055477702360871
it 25 | cost -0.5055478351219651
it 26 | cost -0.5055478924434322
it 27 | cost -0.5055479434504089
it 28 | cost -0.5055479891318184
it 29 | cost -0.5055480302803276
it 30 | cost -0.5055480675387299
it 31 |

  This is separate from the ipykernel package so we can avoid doing imports until
  


In [175]:
h = hypothesis(theta, x_test)
h[h >= .5] = 1
h[h < .5] = 0
recall_score(y_test, h)

1.0

In [176]:
f1_score(y_test, h)

0.28515240904621436

In [177]:
print(classification_report(y_test, h))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       727
           1       0.17      1.00      0.29       145

    accuracy                           0.17       872
   macro avg       0.08      0.50      0.14       872
weighted avg       0.03      0.17      0.05       872



## sklearn impl

In [20]:
from sklearn.linear_model import LogisticRegression

In [24]:
clf = LogisticRegression(class_weight="balanced", penalty="l1", solver="liblinear")

In [25]:
clf.fit(x_train, y_train)

LogisticRegression(class_weight='balanced', penalty='l1', solver='liblinear')

In [26]:
preds = clf.predict(x_test)

In [27]:
f1_score(y_test, preds)

0.3767441860465116

In [28]:
clf = LogisticRegression(penalty="l1", solver="liblinear")

In [29]:
clf.fit(x_train, y_train)
preds = clf.predict(x_test)

In [30]:
f1_score(y_test, preds)

0.3134328358208956