## Лабораторная работа №2: Проведение исследований с логистической и линейной регрессией

### 1. Выбор начальных условий

In [1]:
!pip install scikit-learn numpy pandas matplotlib seaborn



In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
date_fruit_path = "/kaggle/input/date-fruit-datasets"

print("Path to Date Fruit dataset files:", date_fruit_path)

# Проверка содержимого папки
if os.path.exists(date_fruit_path):
    files = os.listdir(date_fruit_path)
    print("Files:", files)
else:
    print("Not found")

Path to Date Fruit dataset files: /kaggle/input/date-fruit-datasets
Files: ['Date_Fruit_Datasets']


In [4]:
# Чтение Excel-файла
date_fruit_data = pd.read_excel(f"{date_fruit_path}/Date_Fruit_Datasets/Date_Fruit_Datasets.xlsx")

# Проверка данных
date_fruit_data.head()

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,...,KurtosisRR,KurtosisRG,KurtosisRB,EntropyRR,EntropyRG,EntropyRB,ALLdaub4RR,ALLdaub4RG,ALLdaub4RB,Class
0,422163,2378.908,837.8484,645.6693,0.6373,733.1539,0.9947,424428,0.7831,1.2976,...,3.237,2.9574,4.2287,-59191263232,-50714214400,-39922372608,58.7255,54.9554,47.84,BERHI
1,338136,2085.144,723.8198,595.2073,0.569,656.1464,0.9974,339014,0.7795,1.2161,...,2.6228,2.635,3.1704,-34233065472,-37462601728,-31477794816,50.0259,52.8168,47.8315,BERHI
2,526843,2647.394,940.7379,715.3638,0.6494,819.0222,0.9962,528876,0.7657,1.315,...,3.7516,3.8611,4.7192,-93948354560,-74738221056,-60311207936,65.4772,59.286,51.9378,BERHI
3,416063,2351.21,827.9804,645.2988,0.6266,727.8378,0.9948,418255,0.7759,1.2831,...,5.0401,8.6136,8.2618,-32074307584,-32060925952,-29575010304,43.39,44.1259,41.1882,BERHI
4,347562,2160.354,763.9877,582.8359,0.6465,665.2291,0.9908,350797,0.7569,1.3108,...,2.7016,2.9761,4.4146,-39980974080,-35980042240,-25593278464,52.7743,50.908,42.6666,BERHI


In [5]:
concrete_strength_path = "/kaggle/input/concrete-compressive-strength"

print("Путь до датасета Concrete Compressive Strength:", concrete_strength_path)

# Проверка содержимого папки
if os.path.exists(concrete_strength_path):
    files = os.listdir(concrete_strength_path)
    print("Содержание:", files)
else:
    print("Не найден")

Путь до датасета Concrete Compressive Strength: /kaggle/input/concrete-compressive-strength
Содержание: ['Concrete Compressive Strength.csv']


In [6]:
# Чтение CSV-файла
concrete_data = pd.read_csv(f"{concrete_strength_path}/Concrete Compressive Strength.csv")

# Проверка данных
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age (day),Concrete compressive strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.986111
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.887366
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.269535
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05278
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.296075


### 2. Создание бейзлайна и оценка качества

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score, make_scorer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline

Разделим датасет для классификации на обучающую и тестовую выборки

In [8]:
# Разделение на признаки и целевую переменную
X_class = date_fruit_data.drop(columns=['Class'])
y_class = date_fruit_data['Class']

# Разделение на обучающую и тестовую выборки
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)

# Преобразование целевой переменной
label_encoder = LabelEncoder()
y_train_class = label_encoder.fit_transform(y_train_class)
y_test_class = label_encoder.transform(y_test_class)
     

Аналогично разделим датасет для регрессии

In [9]:
# Разделение на признаки и целевую переменную
X_reg = concrete_data.drop(columns=['Concrete compressive strength '])
y_reg = concrete_data['Concrete compressive strength ']

# Разделение на обучающую и тестовую выборки
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg, y_reg, test_size=0.2, random_state=42
)
    

Обучим модели для классификации и регрессии из Sklearn и оценим их. Для логистической регрессии нам требуется обязательный препроцессинг данных с помощью StandardScaler.

In [10]:
scaler = StandardScaler()
X_train_class = scaler.fit_transform(X_train_class)
X_test_class = scaler.transform(X_test_class)

logistic = LogisticRegression(max_iter=10000)
logistic.fit(X_train_class, y_train_class)

y_pred_class = logistic.predict(X_test_class)

accuracy = accuracy_score(y_test_class, y_pred_class)
f1 = f1_score(y_test_class, y_pred_class, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.9278
F1-Score: 0.9263


In [11]:
linear_regressor = LinearRegression()
linear_regressor.fit(X_train_reg, y_train_reg)

y_pred_reg = linear_regressor.predict(X_test_reg)

rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")
     

RMSE: 9.7967
R²: 0.6275


Итак, точность для встроенной в Sklearn модели логистической регрессии получилась отличной (92.7% точности). Для модели линейной регрессии среднеквадратичная ошибка составила 9.79, что несколько хуже бейзлайна для KNN-регрессора. Попробуем улучшить бейзлайн.

### 3. Улучшение бейзлайна

Для улучшения бейзлайна для логистичекой регрессии будем подбирать гиперпараметр C с помощью GridSearchCV. Для линейной регрессии добавим регуляризацию через Ridge (L2).

In [12]:


# Пайплайн с нормализацией и логистической регрессией
pipeline_logistic = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('logistic', LogisticRegression(max_iter=10000, random_state=42))
])

# Параметры для подбора
param_grid_logistic = {
    'logistic__C': [0.01, 0.1, 1, 10, 75, 1000],
    'logistic__penalty': ['l2'],
    'logistic__solver': ['lbfgs']
}

# Подбор гиперпараметров
grid_search_logistic = GridSearchCV(
    pipeline_logistic,
    param_grid_logistic,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

grid_search_logistic.fit(X_train_class, y_train_class)

best_params_logistic = grid_search_logistic.best_params_
best_score_logistic = grid_search_logistic.best_score_

print("\n" + "="*50)
print(f"Лучшие параметры для Logistic Regression: {best_params_logistic}")
print(f"Лучший Accuracy на кросс-валидации: {best_score_logistic:.4f}")

# Оценка на тестовой выборке
y_pred_class = grid_search_logistic.best_estimator_.predict(X_test_class)
accuracy = accuracy_score(y_test_class, y_pred_class)
f1 = f1_score(y_test_class, y_pred_class, average='weighted')

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test F1-Score: {f1:.4f}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits

Лучшие параметры для Logistic Regression: {'logistic__C': 1, 'logistic__penalty': 'l2', 'logistic__solver': 'lbfgs'}
Лучший Accuracy на кросс-валидации: 0.9150
Test Accuracy: 0.9278
Test F1-Score: 0.9263


In [13]:
param_grid_ridge = {
    'alpha': [0.001, 0.01, 0.1, 1, 5, 10, 50, 100, 500, 1000, 5000]  
}

grid_search_ridge = GridSearchCV(
    Ridge(random_state=42), 
    param_grid_ridge,
    cv=5,
    scoring=['neg_root_mean_squared_error', 'r2'],  
    refit='neg_root_mean_squared_error',  
    verbose=1,
    n_jobs=-1 
)

grid_search_ridge.fit(X_train_reg, y_train_reg)

best_params_ridge = grid_search_ridge.best_params_
best_rmse_score = -grid_search_ridge.best_score_ 

cv_results = grid_search_ridge.cv_results_

print(f"Лучшие параметры для Ridge Regression: {best_params_ridge}")
print(f"Лучший RMSE на кросс-валидации: {best_rmse_score:.4f}")

best_idx = grid_search_ridge.best_index_
print(f"Лучшее alpha: {best_params_ridge['alpha']}")
print(f"Соответствующий R² на кросс-валидации: {cv_results['mean_test_r2'][best_idx]:.4f}")

# Оценка на тестовой выборке
ridge_regressor = grid_search_ridge.best_estimator_
y_pred_reg = ridge_regressor.predict(X_test_reg)

rmse = mean_squared_error(y_test_reg, y_pred_reg, squared=False)
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"\nРезультаты на тестовой выборке:")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²:   {r2:.4f}")

Fitting 5 folds for each of 11 candidates, totalling 55 fits
Лучшие параметры для Ridge Regression: {'alpha': 5000}
Лучший RMSE на кросс-валидации: 10.6490
Лучшее alpha: 5000
Соответствующий R² на кросс-валидации: 0.5957

Результаты на тестовой выборке:
Test RMSE: 9.7956
Test R²:   0.6276


Итак, результаты лишь немного улучшились по сравнению с бейзлайном.

### 4. Имплементация алгоритма машинного обучения
Напишем собственную реализацию Logistic and Linear regression, затем обучим модели на тестовых данных и сравним по качеству с реализациями из Sklearn.

In [14]:
sigmoid_function = lambda z: np.clip(1 / (1 + np.exp(-np.clip(z, -500, 500))), 1e-10, 1 - 1e-10)

def optimize_gradients(gradient_fn, start_point, learning_rate, max_iter, tolerance=1e-6):
    current_point = start_point
    for i in range(max_iter):
        grad = gradient_fn(current_point)
        current_point -= learning_rate * grad
        
        if np.linalg.norm(grad) < tolerance:
            print(f"Градиент сошелся на итерации {i}")
            break
    return current_point


class CustomLogisticRegression:
    def __init__(self, *, lr=0.01, max_epochs=1000, add_intercept=True):
        self._learning_rate = lr
        self._max_epochs = max_epochs
        self._add_intercept = add_intercept
        self._parameters = None
        self._X_train = None
        self._y_train = None
        self._X_train_mean = None
        self._X_train_std = None

    def _add_intercept_to_X(self, X):
        if self._add_intercept:
            return np.hstack([np.ones((X.shape[0], 1)), X])
        return X

    def _compute_gradient(self, params):
        assert self._X_train is not None
        assert self._y_train is not None

        samples = self._X_train.shape[0]
        predictions = sigmoid_function(np.dot(self._X_train, params))
        
        grad = np.dot(self._X_train.T, (predictions - self._y_train)) / samples
        return grad

    def fit(self, X, y):
        assert self._parameters is None

        X_array = np.array(X)
        y_array = np.array(y).reshape(-1, 1)
        
        # Нормализация признаков для стабильности
        if X_array.shape[1] > 0:
            self._X_train_mean = np.mean(X_array, axis=0)
            self._X_train_std = np.std(X_array, axis=0) + 1e-8
            X_normalized = (X_array - self._X_train_mean) / self._X_train_std
            self._X_train = self._add_intercept_to_X(X_normalized)
        else:
            self._X_train = self._add_intercept_to_X(X_array)
        
        self._y_train = y_array
        features = self._X_train.shape[1]
        
        initial_params = np.zeros((features, 1))
        
        self._parameters = optimize_gradients(
            self._compute_gradient, 
            initial_params, 
            self._learning_rate, 
            self._max_epochs
        ).flatten()

    def predict_proba(self, X):
        assert self._parameters is not None
        
        X_array = np.array(X)
        
        # Применяем ту же нормализацию, что при обучении
        if self._X_train_mean is not None:
            X_normalized = (X_array - self._X_train_mean) / self._X_train_std
            X_with_intercept = self._add_intercept_to_X(X_normalized)
        else:
            X_with_intercept = self._add_intercept_to_X(X_array)
            
        z = np.dot(X_with_intercept, self._parameters)
        return sigmoid_function(z)

    def predict(self, X, threshold=0.5):
        probabilities = self.predict_proba(X)
        return (probabilities >= threshold).astype(int)


class CustomLinearRegression:
    def __init__(self, add_intercept=True):
        self._add_intercept = add_intercept
        self._coef_ = None
        self._X_mean = None
        self._X_std = None

    def _add_intercept_to_X(self, X):
        if self._add_intercept:
            return np.hstack([np.ones((X.shape[0], 1)), X])
        return X

    def fit(self, X, y):
        X_array = np.array(X)
        y_array = np.array(y).reshape(-1, 1)
        
        # Нормализация признаков
        if X_array.shape[1] > 0:
            self._X_mean = np.mean(X_array, axis=0)
            self._X_std = np.std(X_array, axis=0) + 1e-8
            X_normalized = (X_array - self._X_mean) / self._X_std
            X_intercept = self._add_intercept_to_X(X_normalized)
        else:
            X_intercept = self._add_intercept_to_X(X_array)
        
        self._coef_ = np.linalg.pinv(X_intercept) @ y_array
        
        return self

    def predict(self, X):
        X_array = np.array(X)
        
        # Применяем нормализацию
        if self._X_mean is not None:
            X_normalized = (X_array - self._X_mean) / self._X_std
            X_intercept = self._add_intercept_to_X(X_normalized)
        else:
            X_intercept = self._add_intercept_to_X(X_array)
            
        return (X_intercept @ self._coef_).flatten()

Обучим модели и оценим их качество

In [15]:
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score

log_reg = CustomLogisticRegression(lr=0.1, max_epochs=1000)
log_reg.fit(X_train_class, y_train_class)
y_pred_class = log_reg.predict(X_test_class)

accuracy = accuracy_score(y_test_class, y_pred_class)
f1 = f1_score(y_test_class, y_pred_class, average="weighted")


print(f"Custom Logistic Regression - Accuracy: {accuracy:.4f}, F1-Score: {f1:.4f}")

lin_reg = CustomLinearRegression()
lin_reg.fit(X_train_reg, y_train_reg)
y_pred_reg = lin_reg.predict(X_test_reg)

rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred_reg))
r2 = r2_score(y_test_reg, y_pred_reg)

print(f"Custom Linear Regression - RMSE: {rmse:.4f}, R²: {r2:.4f}")

Custom Logistic Regression - Accuracy: 0.0611, F1-Score: 0.0189
Custom Linear Regression - RMSE: 9.7967, R²: 0.6275


Точность и F1-Score логистической модели сильно упали, а показатели линейной - остались примерно теми же.