In [2]:
import pandas as pd
import numpy as np
import re

# --- Вспомогательные функции ---
def rmse(y_true, y_pred):
    return np.sqrt(((y_pred - y_true) ** 2).mean())

def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:]

In [4]:
def prepare_X(df):
    df = df.copy()
    base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
    features = base.copy()
    
    # Возраст
    df['age'] = 2017 - df['year']
    features.append('age')
    
    # Двери
    for v in [2, 3, 4]:
        feature = f'num_doors_{v}'
        df[feature] = (df['number_of_doors'] == v).astype(int)
        features.append(feature)
    
    # Топ-5 марок
    top_makes = ['ford', 'chevrolet', 'toyota', 'nissan', 'honda']
    for make in top_makes:
        feature = f'is_make_{make}'
        df[feature] = (df['make'] == make).astype(int)
        features.append(feature)
    
    # Тип топлива
    fuel_types = [
        'regular unleaded',
        'premium unleaded (required)',
        'premium unleaded (recommended)',
        'flex-fuel (unleaded/e85)',
        'diesel'
    ]
    for fuel in fuel_types:
        clean_name = fuel.replace(' ', '_').replace('(', '').replace(')', '').replace('/', '_')
        feature = f'is_fuel_{clean_name}'
        df[feature] = (df['engine_fuel_type'] == fuel).astype(int)
        features.append(feature)
    
    # Трансмиссия
    transmissions = ['automatic', 'manual', 'automated_manual']
    for trans in transmissions:
        feature = f'is_trans_{trans}'
        df[feature] = (df['transmission_type'] == trans).astype(int)
        features.append(feature)
    
    # Привод
    drives = ['front wheel drive', 'rear wheel drive', 'all wheel drive', 'four wheel drive']
    for drive in drives:
        clean_drive = drive.replace(' ', '_')
        feature = f'is_drive_{clean_drive}'
        df[feature] = (df['driven_wheels'] == drive).astype(int)
        features.append(feature)
    
    # Размер
    sizes = ['compact', 'midsize', 'large']
    for size in sizes:
        feature = f'is_size_{size}'
        df[feature] = (df['vehicle_size'] == size).astype(int)
        features.append(feature)
    
    # Стиль кузова
    styles = ['sedan', '4dr_suv', 'crew_cab_pickup', 'coupe', '4dr_hatchback']
    for style in styles:
        feature = f'is_style_{style}'
        df[feature] = (df['vehicle_style'] == style).astype(int)
        features.append(feature)
    
    # Матрица признаков
    df_num = df[features].fillna(0)
    X = df_num.values
    return X

In [5]:
# --- Загрузка и обработка данных (полная) ---
with open('../data/data.csv', 'r', encoding='utf-8') as f:
    raw = f.read().strip()

fixed = re.sub(r'(\d)([A-Z])', r'\1\n\2', raw)
with open('../data/data_fixed.csv', 'w', encoding='utf-8') as f:
    f.write(fixed)

columns = [
    'make', 'model', 'year', 'engine_fuel_type', 'engine_hp', 'engine_cylinders',
    'transmission_type', 'driven_wheels', 'number_of_doors', 'market_category',
    'vehicle_size', 'vehicle_style', 'highway_mpg', 'city_mpg', 'popularity', 'msrp'
]
df = pd.read_csv('../data/data_fixed.csv', header=None, names=columns)

# Числовые колонки
numeric_cols = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'msrp', 'number_of_doors']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Строковые колонки
str_cols = df.select_dtypes(include='object').columns
for col in str_cols:
    df[col] = df[col].astype(str).str.lower().str.replace(' ', '_')

# Удаление строк с некорректной ценой
df = df.dropna(subset=['msrp'])
df['log_msrp'] = np.log1p(df['msrp'])

# Разбиение
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=2)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=2)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['log_msrp'].values
y_val = df_val['log_msrp'].values
y_test = df_test['log_msrp'].values

print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

Train: 7007, Val: 2336, Test: 2336


In [6]:
# Подготовка данных
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Диапазон значений регуляризации
r_values = [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10]
scores = []

for r in r_values:
    w0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    scores.append(score)
    print(f'r = {r:>8} → RMSE = {score:.5f}')

# Выбор лучшего r
best_r = r_values[np.argmin(scores)]
best_rmse = min(scores)
print(f'\n✅ Лучший r = {best_r}, RMSE = {best_rmse:.5f}')

r =    1e-05 → RMSE = 0.49923
r =   0.0001 → RMSE = 0.49923
r =    0.001 → RMSE = 0.49924
r =     0.01 → RMSE = 0.49934
r =      0.1 → RMSE = 0.50036
r =        1 → RMSE = 0.50687
r =       10 → RMSE = 0.52221

✅ Лучший r = 1e-05, RMSE = 0.49923


In [7]:
def prepare_X_with_interactions(df):
    df = df.copy()
    base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']
    features = base.copy()
    
    # Возраст автомобиля
    df['age'] = 2017 - df['year']
    features.append('age')
    
    # Базовые категориальные признаки
    top_makes = ['ford', 'chevrolet', 'toyota', 'nissan', 'honda']
    for make in top_makes:
        feature = f'is_make_{make}'
        df[feature] = (df['make'] == make).astype(int)
        features.append(feature)
    
    # Взаимодействия: мощность × марка
    for make in top_makes:
        feat_name = f'hp_x_{make}'
        df[feat_name] = df['engine_hp'] * (df['make'] == make).astype(int)
        features.append(feat_name)
    
    # Взаимодействия: возраст × марка
    for make in top_makes:
        feat_name = f'age_x_{make}'
        df[feat_name] = df['age'] * (df['make'] == make).astype(int)
        features.append(feat_name)
    
    # Взаимодействие: тип топлива × цилиндры
    df['fuel_premium_x_cyl'] = df['engine_cylinders'] * (df['engine_fuel_type'] == 'premium unleaded (required)').astype(int)
    features.append('fuel_premium_x_cyl')
    
    # Остальные признаки (двери, трансмиссия и т.д.) — добавим позже при необходимости
    # На данном этапе ограничимся ключевыми
    
    df_num = df[features].fillna(0)
    X = df_num.values
    return X

In [8]:
# Обучение с взаимодействиями
X_train_int = prepare_X_with_interactions(df_train)
X_val_int = prepare_X_with_interactions(df_val)

w0, w = train_linear_regression_reg(X_train_int, y_train, r=1e-05)
y_pred_int = w0 + X_val_int.dot(w)
rmse_int = rmse(y_val, y_pred_int)

print(f"RMSE с взаимодействиями: {rmse_int:.5f}")

RMSE с взаимодействиями: 0.50205


начало лог.регрессии

In [9]:
# Определим диапазоны цен на основе квантилей
price_quantiles = df['msrp'].quantile([0.33, 0.66]).values
low_threshold, high_threshold = price_quantiles

def price_class(price):
    if price <= low_threshold:
        return 0  # "дешёвый"
    elif price <= high_threshold:
        return 1  # "средний"
    else:
        return 2  # "дорогой"

# Применяем к выборкам
df_train['price_class'] = df_train['msrp'].apply(price_class)
df_val['price_class'] = df_val['msrp'].apply(price_class)
df_test['price_class'] = df_test['msrp'].apply(price_class)

y_train_clf = df_train['price_class'].values
y_val_clf = df_val['price_class'].values

print(f"Пороги: дешёвый ≤ ${low_threshold:,.0f}, средний ≤ ${high_threshold:,.0f}")
print("Распределение классов (train):")
print(pd.Series(y_train_clf).value_counts().sort_index())

Пороги: дешёвый ≤ $24,017, средний ≤ $36,625
Распределение классов (train):
0    2296
1    2323
2    2388
Name: count, dtype: int64


In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Подготовка признаков (используем ту же функцию, что и для регрессии)
X_train_clf = prepare_X(df_train)
X_val_clf = prepare_X(df_val)

# Обучение логистической регрессии
clf = LogisticRegression(max_iter=1000, random_state=2)
clf.fit(X_train_clf, y_train_clf)

# Предсказания
y_pred_clf = clf.predict(X_val_clf)

# Оценка качества
acc = accuracy_score(y_val_clf, y_pred_clf)
print(f"Accuracy на валидации: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_val_clf, y_pred_clf, target_names=['Дешёвый', 'Средний', 'Дорогой']))

Accuracy на валидации: 0.7551

Classification Report:
              precision    recall  f1-score   support

     Дешёвый       0.85      0.84      0.84       788
     Средний       0.64      0.61      0.62       757
     Дорогой       0.77      0.81      0.79       791

    accuracy                           0.76      2336
   macro avg       0.75      0.75      0.75      2336
weighted avg       0.75      0.76      0.75      2336



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [11]:
from sklearn.linear_model import Ridge

# Подготовка данных
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Ваша реализация
w0, w = train_linear_regression_reg(X_train, y_train, r=1e-05)
y_pred_custom = w0 + X_val.dot(w)
rmse_custom = rmse(y_val, y_pred_custom)

# Sklearn Ridge
ridge = Ridge(alpha=1e-05, solver='cholesky')
ridge.fit(X_train, y_train)
y_pred_sklearn = ridge.predict(X_val)
rmse_sklearn = rmse(y_val, y_pred_sklearn)

print(f"RMSE (ваша реализация): {rmse_custom:.5f}")
print(f"RMSE (sklearn Ridge):    {rmse_sklearn:.5f}")
print(f"Разница:                 {abs(rmse_custom - rmse_sklearn):.6f}")

RMSE (ваша реализация): 0.49923
RMSE (sklearn Ridge):    0.49923
Разница:                 0.000000


In [12]:
import pickle

# Обучаем финальную модель на полных тренировочных данных
X_train_full = prepare_X(df_train)
w0_final, w_final = train_linear_regression_reg(X_train_full, y_train, r=1e-05)

# Сохраняем веса модели
model_weights = {'w0': w0_final, 'w': w_final}
with open('../models/model_weights.pkl', 'wb') as f:
    pickle.dump(model_weights, f)

print("✅ Модель сохранена в '../models/model_weights.pkl'")

✅ Модель сохранена в '../models/model_weights.pkl'


In [14]:
def predict_price(car_dict, model_path='../models/model_weights.pkl'):
    """
    Предсказывает цену автомобиля по его характеристикам.
    
    car_dict: словарь с ключами, соответствующими колонкам датасета
    Пример:
        {
            'make': 'toyota',
            'model': 'rav4',
            'year': 2017,
            'engine_fuel_type': 'regular unleaded',
            'engine_hp': 176,
            'engine_cylinders': 4,
            'transmission_type': 'automatic',
            'driven_wheels': 'all wheel drive',
            'number_of_doors': 4,
            'market_category': 'crossover',
            'vehicle_size': 'midsize',
            'vehicle_style': '4dr suv',
            'highway_mpg': 28,
            'city_mpg': 22,
            'popularity': 2031
        }
    """
    # Загружаем модель
    with open(model_path, 'rb') as f:
        weights = pickle.load(f)
    w0, w = weights['w0'], weights['w']
    
    # Создаём датафрейм из словаря
    df_input = pd.DataFrame([car_dict])
    
    # Подготавливаем признаки
    X_input = prepare_X(df_input)
    
    # Предсказание в лог-масштабе
    log_price_pred = w0 + X_input.dot(w)
    
    # Обратное преобразование в доллары
    price_pred = np.expm1(log_price_pred[0])
    
    return price_pred

In [15]:
# Пример автомобиля
new_car = {
    'make': 'toyota',
    'model': 'rav4',
    'year': 2017,
    'engine_fuel_type': 'regular unleaded',
    'engine_hp': 176,
    'engine_cylinders': 4,
    'transmission_type': 'automatic',
    'driven_wheels': 'all wheel drive',
    'number_of_doors': 4,
    'market_category': 'crossover',
    'vehicle_size': 'midsize',
    'vehicle_style': '4dr suv',
    'highway_mpg': 28,
    'city_mpg': 22,
    'popularity': 2031
}

predicted_price = predict_price(new_car)
print(f"Предсказанная цена: ${predicted_price:,.0f}")

Предсказанная цена: $26,745


📌 Итоги выполнения всего плана:
✅ Подбор гиперпараметра регуляризации r → r = 1e-05
✅ Добавление взаимодействий признаков → не улучшило RMSE, решение принято обоснованно
✅ Логистическая регрессия для классификации диапазонов цен → Accuracy = 75.5%, сбалансированные классы
✅ Сравнение с sklearn.linear_model.Ridge → RMSE идентичен, реализация корректна
✅ Сохранение модели и функция предсказания → работает корректно