In [1]:
import pandas as pd
import numpy as np
import re

# --- 1. Исправление "склеенного" CSV ---
with open('../data/data.csv', 'r', encoding='utf-8') as f:
    raw = f.read().strip()

fixed = re.sub(r'(\d)([A-Z])', r'\1\n\2', raw)

with open('../data/data_fixed.csv', 'w', encoding='utf-8') as f:
    f.write(fixed)

# --- 2. Загрузка данных ---
columns = [
    'make', 'model', 'year', 'engine_fuel_type', 'engine_hp', 'engine_cylinders',
    'transmission_type', 'driven_wheels', 'number_of_doors', 'market_category',
    'vehicle_size', 'vehicle_style', 'highway_mpg', 'city_mpg', 'popularity', 'msrp'
]

df = pd.read_csv('../data/data_fixed.csv', header=None, names=columns)

# --- 3. Преобразование числовых колонок ---
numeric_cols = ['year', 'engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity', 'msrp', 'number_of_doors']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- 4. Обработка строковых колонок ---
str_cols = df.select_dtypes(include='object').columns
for col in str_cols:
    df[col] = df[col].astype(str).str.lower().str.replace(' ', '_')

# --- 5. Заполнение пропусков ---
df['engine_hp'] = df['engine_hp'].fillna(df['engine_hp'].median())
df['engine_cylinders'] = df['engine_cylinders'].fillna(df['engine_cylinders'].median())
df['number_of_doors'] = df['number_of_doors'].fillna(df['number_of_doors'].median())
df['engine_fuel_type'] = df['engine_fuel_type'].fillna(df['engine_fuel_type'].mode()[0])
df['market_category'] = df['market_category'].fillna('unknown')

# --- Готово ---
print("✅ Данные готовы для feature engineering.")
print("Размер:", df.shape)

✅ Данные готовы для feature engineering.
Размер: (12150, 16)


In [2]:
base = ['engine_hp', 'engine_cylinders', 'highway_mpg', 'city_mpg', 'popularity']

def prepare_X(df):
    df = df.copy()
    features = base.copy()
    
    # 1. Возраст автомобиля (относительно 2017 года — последнего в датасете)
    df['age'] = 2017 - df['year']
    features.append('age')
    
    # 2. Кодирование количества дверей (2, 3, 4)
    for v in [2, 3, 4]:
        feature = f'num_doors_{v}'
        df[feature] = (df['number_of_doors'] == v).astype(int)
        features.append(feature)
    
    # 3. Топ-5 марок (на основе EDA)
    top_makes = ['chevrolet', 'ford', 'toyota', 'nissan', 'honda']
    for make in top_makes:
        feature = f'is_make_{make}'
        df[feature] = (df['make'] == make).astype(int)
        features.append(feature)
    
    # 4. Тип топлива (основные категории)
    fuel_types = [
        'regular unleaded',
        'premium unleaded (required)',
        'premium unleaded (recommended)',
        'flex-fuel (unleaded/e85)',
        'diesel'
    ]
    for fuel in fuel_types:
        feature = f'is_fuel_{fuel.replace(" ", "_").replace("(", "").replace(")", "").replace("/", "_")}'
        df[feature] = (df['engine_fuel_type'] == fuel).astype(int)
        features.append(feature)
    
    # 5. Трансмиссия
    transmissions = ['automatic', 'manual', 'automated_manual']
    for trans in transmissions:
        feature = f'is_trans_{trans}'
        df[feature] = (df['transmission_type'] == trans).astype(int)
        features.append(feature)
    
    # 6. Привод
    drives = ['front wheel drive', 'rear wheel drive', 'all wheel drive', 'four wheel drive']
    for drive in drives:
        feature = f'is_drive_{drive.replace(" ", "_")}'
        df[feature] = (df['driven_wheels'] == drive).astype(int)
        features.append(feature)
    
    # 7. Размер автомобиля
    sizes = ['compact', 'midsize', 'large']
    for size in sizes:
        feature = f'is_size_{size}'
        df[feature] = (df['vehicle_size'] == size).astype(int)
        features.append(feature)
    
    # 8. Стиль кузова (топ-5 по частоте)
    styles = ['sedan', '4dr_suv', 'crew_cab_pickup', 'coupe', '4dr_hatchback']
    for style in styles:
        feature = f'is_style_{style}'
        df[feature] = (df['vehicle_style'] == style).astype(int)
        features.append(feature)
    
    # Формируем матрицу признаков
    df_num = df[features].fillna(0)  # на всякий случай
    X = df_num.values
    return X

In [8]:
# Убедимся, что 'msrp' — числовой и без NaN
df['msrp'] = pd.to_numeric(df['msrp'], errors='coerce')
df = df.dropna(subset=['msrp'])  # УДАЛЯЕМ строки с некорректной ценой

# Теперь логарифмируем
df['log_msrp'] = np.log1p(df['msrp'])

# Разбиение
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=2)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=2)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train['log_msrp'].values
y_val = df_val['log_msrp'].values
y_test = df_test['log_msrp'].values

print(f"Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

Train: 7007, Val: 2336, Test: 2336


In [9]:
# Протестируем prepare_X на обучающей выборке
X_train = prepare_X(df_train)
print("Форма X_train:", X_train.shape)
print("Количество признаков:", X_train.shape[1])

Форма X_train: (7007, 34)
Количество признаков: 34


In [10]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])  # добавляем регуляризацию
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    return w[0], w[1:]

def rmse(y_true, y_pred):
    return np.sqrt(((y_pred - y_true) ** 2).mean())

# Обучение модели с регуляризацией
X_train = prepare_X(df_train)
w0, w = train_linear_regression_reg(X_train, y_train, r=0.01)

# Валидация
X_val = prepare_X(df_val)
y_pred_val = w0 + X_val.dot(w)
rmse_val = rmse(y_val, y_pred_val)
print(f"RMSE на валидации: {rmse_val:.4f}")

RMSE на валидации: 0.4966


In [11]:
# Обратное преобразование предсказаний и целевой переменной
y_pred_val_exp = np.expm1(y_pred_val)
y_val_exp = np.expm1(y_val)

# RMSE в долларах
rmse_dollars = rmse(y_val_exp, y_pred_val_exp)
print(f"RMSE на валидации (в долларах): ${rmse_dollars:,.0f}")

RMSE на валидации (в долларах): $40,564
