In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, mean_absolute_error
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [2]:
# 1. Загрузка данных
df = pd.read_csv('D:/my_ML/diploma_polytech/data/raw/vehicle_ins_data_1.csv', sep = ";",index_col= False)

# 2. Вычисление целевой переменной
df['claim_flag'] = (df['N_claims_year'] > 1).astype(int)

# 3. Разделение на признаки и цель
X = df.drop(columns=['N_claims_year', 'claim_flag'])
y = df['claim_flag']

# Разделим на train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)



  df = pd.read_csv('D:/my_ML/diploma_polytech/data/raw/vehicle_ins_data_1.csv', sep = ";",index_col= False)


In [5]:
# 4. Предобработка признаков
# Разделим на числовые и категориальные
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

# Преобразуем все категориальные колонки в строки для OneHotEncoder
for col in cat_cols:
    X_train[col] = X_train[col].astype(str)
    X_test[col] = X_test[col].astype(str)

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[num_cols])
X_test_num = scaler.transform(X_test[num_cols])

ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')  # также исправлен параметр sparse_output
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

MemoryError: Unable to allocate 24.7 GiB for an array with shape (84444, 39190) and data type float64

In [None]:
# 5. PyTorch Dataset
class InsuranceDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32).unsqueeze(1)
    def __len__(self):
        return len(self.y)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = InsuranceDataset(X_train_proc, y_train)
test_ds = InsuranceDataset(X_test_proc, y_test)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)

# 6. Определение модели
class ClaimProbNet(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)



In [None]:
model = ClaimProbNet(input_dim=X_train_proc.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# 7. Обучение
epochs = 30
for epoch in range(1, epochs + 1):
    model.train()
    epoch_loss = 0.0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * X_batch.size(0)
    epoch_loss /= len(train_ds)
    print(f'Epoch {epoch}/{epochs} – Loss: {epoch_loss:.4f}')



In [None]:
# 8. Оценка на тестовой выборке
model.eval()
y_true = []
y_prob = []
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        probs = model(X_batch).squeeze().numpy()
        y_prob.extend(probs.tolist())
        y_true.extend(y_batch.squeeze().numpy().tolist())

roc_auc = roc_auc_score(y_true, y_prob)
mae = mean_absolute_error(y_true, y_prob)
print(f'ROC-AUC: {roc_auc:.4f}')
print(f'MAE: {mae:.4f}')