## 1. KNN

In [1]:
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from warnings import simplefilter


simplefilter(action='ignore', category=FutureWarning)


In [3]:
data = load_breast_cancer()
X, y = data['data'], data['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=True)

Манхетанское расстояние

In [6]:
manhattan = KNeighborsClassifier(
    n_neighbors=3,
    metric='manhattan',
)

manhattan.fit(X_train, y_train)
manhattan_preds = manhattan.predict(X_test)
metrics.f1_score(y_test, manhattan_preds)

0.9595375722543353

Косинусное расстояние

In [7]:
cos = KNeighborsClassifier(
    n_neighbors=3,
    metric='cosine',
)

cos.fit(X_train, y_train)
cos_preds = cos.predict(X_test)
metrics.f1_score(y_test, cos_preds)

0.9418604651162792

## Таска 1

In [8]:
# https://github.com/YakubovSlava/Lections_DS/tree/main/Seminars/03_machine_learning_introduction

class MyKNN:
    def __init__(self, k = 3):
        self.k = k
    
    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
    
    def calculate_euc_distance(self, x, y):
        return np.sqrt(((x - y)**2).sum())
    
    def calculate_matrix(self, X_test, distance):
        distances = np.zeros((X_test.shape[0], self.X_train.shape[0]))
        for i in range(X_test.shape[0]):
            for j in range(self.X_train.shape[0]):
                distances[i, j] = distance(X_test[i], self.X_train[j])
        return distances
    
    def predict(self, X_test, distance):
        matrix = self.calculate_matrix(X_test, distance)
        matrix_idx = np.argsort(matrix, axis=1,)[:, :self.k]
        res_matrix = np.array([self.y_train[x] for x in matrix_idx]).mean(axis=1)
        return res_matrix
    

    # ------------------------------------------------ 

    def calculate_manhattan_distance(self, x, y):
        return np.sum(np.abs(x - y))
    
    def calculate_cos_distance(self, x, y):
        dot = np.dot(x, y)
        norm_x = np.linalg.norm(x)
        norm_y = np.linalg.norm(y)
        distance = dot / (norm_x * norm_y)
        return 1 - distance
    

Реализация по манхеттанскому

In [9]:
my_manhattan = MyKNN(3)

my_manhattan.fit(X_train, y_train)
my_manhattan_preds = my_manhattan.predict(X_test, my_manhattan.calculate_manhattan_distance)
metrics.f1_score(y_test, my_manhattan_preds.astype(int))

0.9454545454545454

Реализация с косинусным расстоянием

In [10]:
my_cos = MyKNN(3)

my_cos.fit(X_train, y_train)
my_cos_preds = my_cos.predict(X_test, my_cos.calculate_cos_distance)
metrics.f1_score(y_test, my_cos_preds.astype(int))

0.9240506329113923

In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [12]:
data = pd.read_csv("automobile.data", header=None, na_values="?")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [13]:
data.loc[:, data.dtypes == "float64"] = data.loc[:, data.dtypes == "float64"].fillna(data.loc[:, data.dtypes == "float64"].mean(axis=0))
data.loc[:, data.dtypes == "object"] = data.loc[:, data.dtypes == "object"].fillna("")

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,122.0,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,122.0,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [14]:
data_encoded = pd.get_dummies(data)
data_encoded

Unnamed: 0,0,1,9,10,11,12,13,16,18,19,...,15_twelve,15_two,17_1bbl,17_2bbl,17_4bbl,17_idi,17_mfi,17_mpfi,17_spdi,17_spfi
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,-1,95.0,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,...,0,0,0,0,0,0,0,1,0,0
201,-1,95.0,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,...,0,0,0,0,0,0,0,1,0,0
202,-1,95.0,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,...,0,0,0,0,0,0,0,1,0,0
203,-1,95.0,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,...,0,0,0,0,0,1,0,0,0,0


In [15]:
data_encoded.columns = data_encoded.columns.astype(str)

X = data_encoded.drop(columns=["25"])
y = data_encoded["25"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
    test_size=0.1,
    random_state=3,
)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((184, 76), (184,), (21, 76), (21,))

In [17]:
scaler = RobustScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)

X_test = scaler.transform(X_test)


Посмотрим как работает линейная регрессия из sklearn

In [18]:
model = LinearRegression().fit(X_train, y_train)
preds = model.predict(X_test)


print(
    f"""
        MSE_test = {metrics.mean_squared_error(preds, y_test)},
        MAE_test = {metrics.mean_absolute_error(preds, y_test)}
        r2_test  = {metrics.r2_score(y_test, preds)}
    """
)


    MSE_test = 4532429.049374135,
    MAE_test = 1604.145364013902
    r2_test = 0.9428782016234765



## Таска 2

In [19]:
class MyLinearRegression:
    def __init__(self, lr=0.001, num_iter=10000, loss='mse', delta=1.0):
        self.lr = lr
        self.num_iter = num_iter
        self.loss = loss
        self.delta = delta

    def calculate_loss(self, y, y_pred):
        error = y - y_pred
        if self.loss == 'mse':
            return (error ** 2).mean()
        elif self.loss == 'huber':
            is_small_error = np.abs(error) <= self.delta
            squared_loss = 0.5 * error**2
            linear_loss = self.delta * (np.abs(error) - 0.5 * self.delta)
            return np.where(is_small_error, squared_loss, linear_loss).mean()
        else:
            raise ValueError("Unsupported loss function. Use 'mse' or 'huber'.")

    def calculate_dldw(self, y, y_pred, x):
        error = y - y_pred
        if self.loss == 'mse':
            grad_w = (-2 * (x.T * error).T).mean(axis=0)
        elif self.loss == 'huber':
            is_small_error = np.abs(error) <= self.delta
            grad_w = (-x.T * np.where(is_small_error, error, self.delta * np.sign(error))).mean(axis=1)
        else:
            raise ValueError("Unsupported loss function. Use 'mse' or 'huber'.")
        return grad_w
    
    def calculate_dldb(self, y, y_pred):
        error = y - y_pred
        if self.loss == 'mse':
            grad_b = (-2 * error).mean()
        elif self.loss == 'huber':
            is_small_error = np.abs(error) <= self.delta
            grad_b = (-np.where(is_small_error, error, self.delta * np.sign(error))).mean()
        else:
            raise ValueError("Unsupported loss function. Use 'mse' or 'huber'.")
        return grad_b

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        self.weights = np.random.rand(X_train.shape[1])
        self.b = 0
        for i in range(self.num_iter):
            pred = (X_train @ self.weights) + self.b
            self.weights -= self.lr * self.calculate_dldw(y_train, pred, X_train)
            self.b -= self.lr * self.calculate_dldb(y_train, pred)

    def predict(self, X_test):
        return X_test @ self.weights + self.b

In [20]:
my_model = MyLinearRegression(lr=0.001, num_iter=10000, loss='huber', delta=1.0)
my_model.fit(X_train, y_train)
preds = my_model.predict(X_test)
print(
    f"""
        MSE_test = {metrics.mean_squared_error(preds, y_test)},
        MAE_test = {metrics.mean_absolute_error(preds, y_test)}
        r2_test = {metrics.r2_score(y_test, preds)}
    """
)


    MSE_test = 234038903.890219,
    MAE_test = 12457.627844477842
    r2_test = -1.9495713964073391



In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [22]:
data = pd.read_csv("heart.csv")

data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [23]:
data_encoded = pd.get_dummies(data, drop_first=True).astype(float)

data_encoded.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40.0,140.0,289.0,0.0,172.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,49.0,160.0,180.0,0.0,156.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
2,37.0,130.0,283.0,0.0,98.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,48.0,138.0,214.0,0.0,108.0,1.5,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
4,54.0,150.0,195.0,0.0,122.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [24]:
X = data_encoded.drop(columns=["HeartDisease"])
y = data_encoded["HeartDisease"]


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [25]:
pipe = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=10000),
)

pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=10000))])

In [26]:
y_preds = pipe.predict(X_test)
y_preds_probs = pipe.predict_proba(X_test)

print(
    f"""
        Acc     = {accuracy_score(y_test, y_preds)}
        F1      = {f1_score(y_test, y_preds)}
        ROC-AUC = {roc_auc_score(y_test, y_preds_probs[:, 1])}
    """
)


Acc     = 0.8478260869565217
F1      = 0.8571428571428571
ROC-AUC = 0.9073286052009456



## Таска 3

In [27]:
class MyLogReg:
    def __init__(self, lr=0.001, num_iter=1000, penalty=None, C=1.0):
        self.lr = lr
        self.num_iter = num_iter
        self.penalty = penalty
        self.C = C
        self.scaler = StandardScaler()

    def calculate_loss(self, y, y_pred):
        loss = (-y * np.log(y_pred) - (1 - y) * np.log(1 - y_pred)).mean()
        regularization = 0
        if self.penalty == 'l2':
            regularization = (1 / (2 * self.C)) * np.sum(self.weights ** 2)
        elif self.penalty == 'l1':
            regularization = (1 / self.C) * np.sum(np.abs(self.weights))
        return loss + regularization

    def grad_w(self, y, y_pred, x):
        grad_w = (-y + y_pred) @ x / x.shape[0]
        return grad_w

    def grad_w(self, y, y_pred, x):
        grad_w = (-y + y_pred) @ x / x.shape[0]
        if self.penalty == 'l2':
            grad_w += (1 / self.C) * self.weights
        elif self.penalty == 'l1':
            grad_w += (1 / self.C) * np.sign(self.weights)
        return grad_w

    def grad_b(self, y, y_pred):
        grad_b = (-y + y_pred)
        return grad_b.mean()

    def fit(self, X_train, y_train):
        X_train = np.array(X_train)
        self.scaler.fit(X_train)
        X_train = self.scaler.transform(X_train)
        y_train = np.array(y_train)
        self.weights = np.random.rand(X_train.shape[1])
        self.b = 1
        for i in range(self.num_iter):
            pred = self.predict(X_train)
            self.weights -= self.lr * self.grad_w(y_train, pred, X_train)
            self.b -= self.lr * self.grad_b(y_train, pred)

    def predict(self, X_test):
        X_test = np.array(X_test)
        X_test = self.scaler.transform(X_test)
        x = X_test @ self.weights + self.b
        return 1 / (1 + np.exp(-x))

In [28]:
my_logreg = MyLogReg(lr=0.01, num_iter=1000, penalty='l1', C=0.5)
my_logreg.fit(X_train, y_train)

y_preds = (my_logreg.predict(X_test) >= 0.5).astype(int)
y_preds_probs = my_logreg.predict(X_test)

print(f"""
        Acc     = {accuracy_score(y_test, y_preds)}
        F1      = {f1_score(y_test, y_preds)}
        ROC-AUC = {roc_auc_score(y_test, y_preds_probs)}
    """
)


Acc     = 0.5108695652173914
F1      = 0.6762589928057554
ROC-AUC = 0.7385342789598109



In [29]:
my_logreg = MyLogReg(lr=0.01, num_iter=1000, penalty='l2', C=1.1)
my_logreg.fit(X_train, y_train)

y_preds = (my_logreg.predict(X_test) >= 0.5).astype(int)
y_preds_probs = my_logreg.predict(X_test)

print(f"""
        Acc     = {accuracy_score(y_test, y_preds)}
        F1      = {f1_score(y_test, y_preds)}
        ROC-AUC = {roc_auc_score(y_test, y_preds_probs)}
    """
)


Acc     = 0.5869565217391305
F1      = 0.7121212121212122
ROC-AUC = 0.8983451536643026

