In [46]:
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

In [47]:
# The CSV is read without a header row, so columns are labelled 0, 1, 2, ...
# Column 0 is used as the target vector; every remaining column is a feature.
df = pd.read_csv('data-logistic.csv', header=None)
y = df[0]
X = df.drop(columns=0)

In [51]:
def euclid_distance(w1: list, w2: list) -> float:
    """
    Compute the Euclidean distance between two vectors of equal length.

    w1, w2 - sequences of numbers with the same number of components
             (any dimensionality, not only two).

    Returns the square root of the sum of squared component differences.
    Raises ValueError when the vectors have different lengths, instead of
    silently ignoring the extra components.
    """
    if len(w1) != len(w2):
        raise ValueError(
            "vectors must have the same length, got %d and %d" % (len(w1), len(w2)))
    squared_sum = sum((a - b) ** 2 for a, b in zip(w1, w2))
    return squared_sum ** 0.5


def logistic_regression(X_data: pd.DataFrame,
                        y: pd.Series,
                        k: float = 0.1,
                        w: Optional[list] = None,
                        max_iter: int = 10000,
                        stop_euqlid: float = 10**(-5)) -> list:
    """
    Fit logistic-regression weights (no intercept) by batch gradient descent.

    X_data - feature matrix, one row per object (any number of columns)
    y - vector of answers; the update formula assumes labels in {-1, +1}
    k - gradient step size
    w - initial weight vector, one entry per feature; None means all zeros
    max_iter - maximum number of iterations
    stop_euqlid - convergence criterion: stop once the Euclidean distance
                  between consecutive weight vectors is <= this value

    Returns the final weights as a list of floats. When the convergence
    criterion is met, the iteration index is printed first.
    Raises ValueError if X_data contains no rows.
    """
    # Convert to plain arrays so rows are addressed by position; indexing the
    # Series with y[j] is label-based and fails for a non-default index.
    features = np.asarray(X_data, dtype=float)
    labels = np.asarray(y, dtype=float).ravel()
    n_samples = features.shape[0]
    if n_samples == 0:
        raise ValueError("X_data must contain at least one row")

    if w is None:
        weights = np.zeros(features.shape[1])
    else:
        weights = np.asarray(w, dtype=float)

    for i in range(max_iter):
        margins = labels * (features @ weights)
        # Per-object factor y_j * (1 - 1 / (1 + exp(-y_j * <w, x_j>))).
        factors = labels * (1 - 1 / (1 + np.exp(-margins)))
        new_weights = weights + k * (features.T @ factors) / n_samples
        if np.linalg.norm(new_weights - weights) <= stop_euqlid:
            print("Final iteration:", i)
            return new_weights.tolist()
        weights = new_weights
    return weights.tolist()


def logistic_regression_regularization(X_data: pd.DataFrame,
                                       y: pd.Series,
                                       k: float = 0.1,
                                       w: Optional[list] = None,
                                       C=10,
                                       max_iter: int = 10000,
                                       stop_euqlid: float = 10**(-5)) -> list:
    """
    Fit L2-regularized logistic-regression weights (no intercept) by batch
    gradient descent.

    X_data - feature matrix, one row per object (any number of columns)
    y - vector of answers; the update formula assumes labels in {-1, +1}
    k - gradient step size
    w - initial weight vector, one entry per feature; None means all zeros
    C - regularization constant; each step subtracts k * C * w
    max_iter - maximum number of iterations
    stop_euqlid - convergence criterion: stop once the Euclidean distance
                  between consecutive weight vectors is <= this value

    Returns the final weights as a list of floats. When the convergence
    criterion is met, the iteration index is printed first.
    Raises ValueError if X_data contains no rows.
    """
    # Convert to plain arrays so rows are addressed by position; indexing the
    # Series with y[j] is label-based and fails for a non-default index.
    features = np.asarray(X_data, dtype=float)
    labels = np.asarray(y, dtype=float).ravel()
    n_samples = features.shape[0]
    if n_samples == 0:
        raise ValueError("X_data must contain at least one row")

    if w is None:
        weights = np.zeros(features.shape[1])
    else:
        weights = np.asarray(w, dtype=float)

    for i in range(max_iter):
        margins = labels * (features @ weights)
        # Per-object factor y_j * (1 - 1 / (1 + exp(-y_j * <w, x_j>))).
        factors = labels * (1 - 1 / (1 + np.exp(-margins)))
        # Data-driven step followed by the shrinkage term of the L2 penalty.
        new_weights = (weights + k * (features.T @ factors) / n_samples
                       - k * C * weights)
        if np.linalg.norm(new_weights - weights) <= stop_euqlid:
            print("Final iteration:", i)
            return new_weights.tolist()
        weights = new_weights
    return weights.tolist()


def sigmoid(x1, x2, w1, w2):
    """
    Logistic sigmoid of the weighted sum w1 * x1 + w2 * x2.

    x1, x2 - feature values
    w1, w2 - the corresponding weights
    """
    exponent = -w1 * x1 - w2 * x2
    denominator = 1 + np.exp(exponent)
    return 1 / denominator


def classifier(X_data: pd.DataFrame, w: list) -> pd.DataFrame:
    """
    Score every object with the logistic sigmoid of its weighted feature sum.

    X_data - feature matrix, one row per object (any number of columns)
    w - weight vector, one entry per feature column

    Returns a single-column DataFrame (column 0, fresh 0..n-1 index) holding
    1 / (1 + exp(-<w, x>)) for each row, in row order.
    """
    # Work on plain arrays so rows are taken by position; looking values up
    # with X1[i] is label-based and fails for a non-default index.
    features = np.asarray(X_data, dtype=float)
    weights = np.asarray(w, dtype=float)
    scores = 1 / (1 + np.exp(-(features @ weights)))
    return pd.DataFrame(scores)

In [55]:
# Fit the unregularized model from a small non-zero starting point,
# score the training objects, and report ROC AUC on the same data.
w_ans = logistic_regression(X, y, w=[1 / 100, 1 / 100], k=0.001)
ans = classifier(X_data=X, w=w_ans)
print("Non regularization:", roc_auc_score(y, ans))

Final iteration: 5114
Non regularization: 0.9325714285714286


In [56]:
# Fit the L2-regularized model (C is left at the function's default of 10),
# score the training objects, and report ROC AUC on the same data.
# The label previously read "Non regularization:", which mislabelled this result.
w_ans_reg = logistic_regression_regularization(X, y, k=0.001, w=[1/100, 1/100])
ans_reg = classifier(X, w_ans_reg)
print("Regularization:", roc_auc_score(y, ans_reg))

Final iteration: 268
Non regularization: 0.936190476190476


In [54]:
# Write both ROC AUC values, rounded to three decimals and separated by a
# single space: the unregularized score first, then the regularized one.
with open('w3s3.txt', 'w') as fhand:
    rounded_scores = [
        str(np.round(roc_auc_score(y, predictions), decimals=3))
        for predictions in (ans, ans_reg)
    ]
    fhand.write(" ".join(rounded_scores))