In [174]:
#!pip install -q numpy==1.26.4

In [175]:
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sweetviz as sv
import torch
import torch.nn as nn
import optuna

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, f1_score
from phik import phik_matrix
from sqlalchemy import create_engine

In [176]:
warnings.filterwarnings('ignore')

In [177]:
RANDOM_STATE = 0
TEST_SIZE = 0.25
torch.manual_seed(RANDOM_STATE)
torch.use_deterministic_algorithms(True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [178]:
print(torch.cuda.is_available())

False


In [179]:
def get_data() -> pd.DataFrame:
    try:
        db_config = {
            'user': 'praktikum_student',
            'pwd': 'Sdf4$2;d-d30pp', 
            'host': 'rc1b-wcoijxj3yxfsf3fs.mdb.yandexcloud.net',
            'port': 6432,
            'db': 'data-science-vehicle-db'
        } 
        connection_string = 'postgresql://{}:{}@{}:{}/{}'.format(
            db_config['user'],
            db_config['pwd'],
            db_config['host'],
            db_config['port'],
            db_config['db']
        )
        engine = create_engine(connection_string)
        
        query = '''
            SELECT
                c.*,
                p.party_number, p.at_fault, p.insurance_premium, p.party_sobriety, p.party_drug_physical, p.cellphone_in_use,
                v.vehicle_type, v.vehicle_transmission, v.vehicle_age
            FROM
                collisions c
            INNER JOIN
                parties p ON c.case_id = p.case_id
            INNER JOIN
                vehicles v ON c.case_id = v.case_id
            WHERE
                c.collision_date BETWEEN '2012-01-01' AND '2012-12-31'
                AND c.collision_damage != 'scratch'
                AND p.party_type = 'car';
        '''
        data = pd.read_sql_query(query, con=engine)
        return data
    except Exception:
        print(f'Error: {Exception}')
        return None

In [180]:
data = get_data()

In [181]:
data = data.drop_duplicates()
data.case_id = data.case_id.astype('int')
data = data[data['distance'] < 4000.0]
data.direction = data.direction.fillna('unknown').astype('category')
data.intersection = data.intersection.apply(lambda x: 'Y' if x == 1. else 
                                                      'N' if x == 0 else x).fillna('unknown').astype('category')
data.weather_1 = data.weather_1.fillna('unknown').astype('category')
data.location_type = data.location_type.fillna('unknown').astype('category')
data.collision_damage = data.collision_damage.astype('category')
data.party_count = data.party_count.apply(lambda x: '5+' if x > 4 else x).apply(
    lambda x: 'one' if x == 1 else
              'two' if x == 2 else
              'three' if x == 3 else
              'four' if x == 4 else
              'five_plus' if str(x) == '5+' else x).astype('category')
data.primary_collision_factor = data.primary_collision_factor.fillna('unknown').astype('category')
data.pcf_violation_category = data.pcf_violation_category.fillna('unknown').astype('category')
data.type_of_collision = data.type_of_collision.fillna('other').astype('category')
data.motor_vehicle_involved_with = data.motor_vehicle_involved_with.fillna('unknown').astype('category')
data.road_surface = data.road_surface.fillna('unknown').astype('category')
data.road_condition_1 = data.road_condition_1.fillna('other').astype('category')
data.lighting = data.lighting.fillna('unknown').astype('category')
data.control_device = data.control_device.fillna('unknown').astype('category')
data.collision_date = pd.to_datetime(data.collision_date)
data['collision_month'] = data.collision_date.dt.month
data['collision_day'] = data.collision_date.dt.day
data.collision_time = pd.to_datetime(data.collision_time, format='%H:%M:%S')
data['collision_hour'] = data.collision_time.dt.hour
data.party_number = data.party_number.apply(lambda x: '5+' if x > 4 else x).apply(
    lambda x: 'one' if x == 1 else
              'two' if x == 2 else
              'three' if x == 3 else
              'four' if x == 4 else
              'five_plus' if str(x) == '5+' else x).astype('category')
data.at_fault = data.at_fault.astype('category')
data.party_sobriety = data.party_sobriety.fillna('unknown').astype('category')
data.party_drug_physical = data.party_drug_physical.fillna('unknown').astype('category')
data.cellphone_in_use = data.cellphone_in_use.fillna('3.0').apply(
    lambda x: 'no' if x == 0 else
              'yes' if x == 1 else
              'unknown' if x == 3 else x).astype('category')
data.vehicle_type = data.vehicle_type.astype('category')
data.vehicle_transmission = data.vehicle_transmission.fillna('unknown').astype('category')
data = data[data['vehicle_age'] < 161.0]
data['insurance_premium'] = data['insurance_premium'].fillna(-1)
data['collision_hour'] = data['collision_hour'].interpolate()


In [182]:
# report = sv.analyze(data)
# report.show_notebook()

In [183]:
# plt.figure(figsize=(15, 13))
# sns.heatmap(phik_matrix(data[[
#     'county_location',
#     'intersection',
#     'weather_1',
#     'location_type',
#     'collision_damage',
#     'party_count',
#     'primary_collision_factor',
#     'pcf_violation_category',
#     'type_of_collision',
#     'motor_vehicle_involved_with',
#     'road_surface',
#     'road_condition_1',
#     'lighting',
#     'control_device',
#     'party_number',
#     'at_fault',
#     'party_sobriety',
#     'party_drug_physical',
#     'cellphone_in_use',
#     'vehicle_transmission',
# ]], interval_cols=['distance', 'insurance_premium', 'vehicle_age', 'collision_month', 'collision_day', 'collision_hour']), annot=True, cmap='coolwarm')
# plt.show()

In [184]:
num_cols = [
    'distance', 
    'insurance_premium', 
    'vehicle_age', 
    'collision_month', 
    'collision_day', 
    'collision_hour'
]
cat_cols = [
    'county_location',
    'intersection',
    'weather_1',
    'location_type',
    'collision_damage',
    'party_count',
    'primary_collision_factor',
    'pcf_violation_category',
    'type_of_collision',
    'motor_vehicle_involved_with',
    'road_surface',
    'road_condition_1',
    'lighting',
    'control_device',
    'party_number',
    'party_sobriety',
    'party_drug_physical',
    'cellphone_in_use',
    'vehicle_transmission'
]
all_colls = num_cols + cat_cols

In [185]:
X = data[all_colls]
y = data.at_fault

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    shuffle=True,
    stratify=y)

In [186]:
def data_preporation(
        numeric_cols: list = num_cols,
        category_cols: list = cat_cols,
        X_train: pd.DataFrame = X_train,
        X_test: pd.DataFrame = X_test
) -> tuple[pd.DataFrame, pd.DataFrame]:
    
    preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numeric_cols),
        ('cat', OrdinalEncoder(
            handle_unknown='use_encoded_value', 
            unknown_value=np.nan                     
        ), category_cols)
    ], remainder='passthrough')

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    feature_names = preprocessor.get_feature_names_out()
    X_train = pd.DataFrame(X_train_processed, columns=feature_names)
    X_test = pd.DataFrame(X_test_processed, columns=feature_names)
    return X_train, X_test

X_train, X_test = data_preporation()

In [187]:
X_train_torch = torch.FloatTensor(X_train.values)
X_test_torch = torch.FloatTensor(X_test.values)
y_train_torch = torch.FloatTensor(y_train.values)
y_test_torch = torch.FloatTensor(y_test.values)

train_dataset_torch = TensorDataset(X_train_torch, y_train_torch)
test_dataset_torch = TensorDataset(X_test_torch, y_test_torch)

In [188]:
print(X_train.isna().sum())

num__distance                       0
num__insurance_premium              0
num__vehicle_age                    0
num__collision_month                0
num__collision_day                  0
num__collision_hour                 0
cat__county_location                0
cat__intersection                   0
cat__weather_1                      0
cat__location_type                  0
cat__collision_damage               0
cat__party_count                    0
cat__primary_collision_factor       0
cat__pcf_violation_category         0
cat__type_of_collision              0
cat__motor_vehicle_involved_with    0
cat__road_surface                   0
cat__road_condition_1               0
cat__lighting                       0
cat__control_device                 0
cat__party_number                   0
cat__party_sobriety                 0
cat__party_drug_physical            0
cat__cellphone_in_use               0
cat__vehicle_transmission           0
dtype: int64


## Написание модели pytorch

In [189]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_units, dropout_rate):
        super(Net, self).__init__()

        self.layers = nn.Sequential(
            nn.Linear(input_size, hidden_units),
            nn.BatchNorm1d(hidden_units),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_units, hidden_units//2),
            nn.BatchNorm1d(hidden_units//2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_units//2, hidden_units//4),
            nn.BatchNorm1d(hidden_units//4),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_units//4, hidden_units//8),
            nn.BatchNorm1d(hidden_units//8),
            nn.ReLU(),
            nn.Dropout(dropout_rate),

            nn.Linear(hidden_units//8, 1),
            #nn.Sigmoid()
        )

    def forward(self, x):
        return self.layers(x)

In [190]:
def objective(trial):
    params = {
        'hidden_units': trial.suggest_categorical('hidden_units', [256, 512, 1024, 2048]),
        'dropout_rate': trial.suggest_float('dropout_rate', 0.1, 0.5),
        'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_categorical('batch_size', [64, 128, 256, 512, 1024, 2048])
    }

    print(f"\n=== Starting Trial {trial.number} ===")
    print(f"Parameters: {params}")

    train_loader = DataLoader(train_dataset_torch,
                              batch_size=params['batch_size'],
                              shuffle=True)
    
    net = Net(
        input_size=X_train.shape[1],
        hidden_units=params['hidden_units'],
        dropout_rate=params['dropout_rate']
    ).to(device)

    optimizer = torch.optim.Adam(net.parameters(), lr=params['lr'])
    loss_fn = nn.BCEWithLogitsLoss()

    best_f1 = 0
    patience = 5
    no_improve = 0
    num_epoch = 2000

    for epoch in range(num_epoch):
        net.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            train_preds = net(inputs)
            train_loss = loss_fn(train_preds, labels.unsqueeze(1))
            train_loss.backward()
            optimizer.step()
        
        if epoch % 5 == 0 or epoch == num_epoch -1:
            net.eval()
            with torch.no_grad():
                test_inputs, test_labels = X_test_torch.to(device), y_test_torch.to(device)
                outputs = net(test_inputs)
                test_preds = (outputs > 0.5).float()
                test_f1 = f1_score(test_labels.cpu(), test_preds.cpu())
                
                if test_f1 > best_f1:
                    best_f1 = test_f1
                    no_improve = 0
                    best_epoch = epoch
                else:
                    no_improve += 1

                print(f"Trial {trial.number} | Epoch {epoch:3d} | "
                      f"Val F1: {test_f1:.4f} | Best F1: {best_f1:.4f} @ Epoch {best_epoch}")

                if no_improve >= patience:
                    print(f"Early stopping at epoch {epoch}")
                    break

            trial.report(test_f1, epoch)
            if trial.should_prune():
                print(f"Pruned trial {trial.number} at epoch {epoch}")
                raise optuna.exceptions.TrialPruned()
            
    print(f"=== Completed Trial {trial.number} | Best F1: {best_f1:.4f} ===")
    return best_f1

In [173]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

print(f'best params: {study.best_params}')
print(f'best f1: {study.best_value}')

[I 2025-02-16 13:44:26,171] A new study created in memory with name: no-name-509ccd0c-7bc9-4c0c-8114-f9b9180c2182



=== Starting Trial 0 ===
Parameters: {'hidden_units': 256, 'dropout_rate': 0.1534555981183385, 'lr': 0.00019855387735360697, 'batch_size': 128}
Trial 0 | Epoch   0 | Val F1: 0.8717 | Best F1: 0.8717 @ Epoch 0
Trial 0 | Epoch   5 | Val F1: 0.9038 | Best F1: 0.9038 @ Epoch 5
Trial 0 | Epoch  10 | Val F1: 0.9051 | Best F1: 0.9051 @ Epoch 10
Trial 0 | Epoch  15 | Val F1: 0.9038 | Best F1: 0.9051 @ Epoch 10
Trial 0 | Epoch  20 | Val F1: 0.9045 | Best F1: 0.9051 @ Epoch 10
Trial 0 | Epoch  25 | Val F1: 0.9012 | Best F1: 0.9051 @ Epoch 10
Trial 0 | Epoch  30 | Val F1: 0.9059 | Best F1: 0.9059 @ Epoch 30
Trial 0 | Epoch  35 | Val F1: 0.9031 | Best F1: 0.9059 @ Epoch 30
Trial 0 | Epoch  40 | Val F1: 0.9054 | Best F1: 0.9059 @ Epoch 30
Trial 0 | Epoch  45 | Val F1: 0.9049 | Best F1: 0.9059 @ Epoch 30
Trial 0 | Epoch  50 | Val F1: 0.9027 | Best F1: 0.9059 @ Epoch 30
Trial 0 | Epoch  55 | Val F1: 0.9061 | Best F1: 0.9061 @ Epoch 55
Trial 0 | Epoch  60 | Val F1: 0.9048 | Best F1: 0.9061 @ Epoch 55

[I 2025-02-16 13:46:20,616] Trial 0 finished with value: 0.9061100510144476 and parameters: {'hidden_units': 256, 'dropout_rate': 0.1534555981183385, 'lr': 0.00019855387735360697, 'batch_size': 128}. Best is trial 0 with value: 0.9061100510144476.


Trial 0 | Epoch  80 | Val F1: 0.9050 | Best F1: 0.9061 @ Epoch 55
Early stopping at epoch 80
=== Completed Trial 0 | Best F1: 0.9061 ===

=== Starting Trial 1 ===
Parameters: {'hidden_units': 2048, 'dropout_rate': 0.41354191811329744, 'lr': 0.0017761817159793988, 'batch_size': 512}
Trial 1 | Epoch   0 | Val F1: 0.8875 | Best F1: 0.8875 @ Epoch 0
Trial 1 | Epoch   5 | Val F1: 0.9051 | Best F1: 0.9051 @ Epoch 5
Trial 1 | Epoch  10 | Val F1: 0.9060 | Best F1: 0.9060 @ Epoch 10
Trial 1 | Epoch  15 | Val F1: 0.9060 | Best F1: 0.9060 @ Epoch 15
Trial 1 | Epoch  20 | Val F1: 0.9022 | Best F1: 0.9060 @ Epoch 15
Trial 1 | Epoch  25 | Val F1: 0.9056 | Best F1: 0.9060 @ Epoch 15
Trial 1 | Epoch  30 | Val F1: 0.9084 | Best F1: 0.9084 @ Epoch 30
Trial 1 | Epoch  35 | Val F1: 0.9073 | Best F1: 0.9084 @ Epoch 30
Trial 1 | Epoch  40 | Val F1: 0.9073 | Best F1: 0.9084 @ Epoch 30
Trial 1 | Epoch  45 | Val F1: 0.9076 | Best F1: 0.9084 @ Epoch 30
Trial 1 | Epoch  50 | Val F1: 0.9030 | Best F1: 0.9084 @ Ep

[I 2025-02-16 13:51:10,064] Trial 1 finished with value: 0.9083761482173439 and parameters: {'hidden_units': 2048, 'dropout_rate': 0.41354191811329744, 'lr': 0.0017761817159793988, 'batch_size': 512}. Best is trial 1 with value: 0.9083761482173439.


Trial 1 | Epoch  55 | Val F1: 0.9042 | Best F1: 0.9084 @ Epoch 30
Early stopping at epoch 55
=== Completed Trial 1 | Best F1: 0.9084 ===

=== Starting Trial 2 ===
Parameters: {'hidden_units': 256, 'dropout_rate': 0.4054349899798625, 'lr': 0.004748286329707304, 'batch_size': 64}
Trial 2 | Epoch   0 | Val F1: 0.9028 | Best F1: 0.9028 @ Epoch 0
Trial 2 | Epoch   5 | Val F1: 0.9046 | Best F1: 0.9046 @ Epoch 5
Trial 2 | Epoch  10 | Val F1: 0.9048 | Best F1: 0.9048 @ Epoch 10
Trial 2 | Epoch  15 | Val F1: 0.9016 | Best F1: 0.9048 @ Epoch 10
Trial 2 | Epoch  20 | Val F1: 0.9048 | Best F1: 0.9048 @ Epoch 20
Trial 2 | Epoch  25 | Val F1: 0.9053 | Best F1: 0.9053 @ Epoch 25
Trial 2 | Epoch  30 | Val F1: 0.9066 | Best F1: 0.9066 @ Epoch 30
Trial 2 | Epoch  35 | Val F1: 0.9029 | Best F1: 0.9066 @ Epoch 30
Trial 2 | Epoch  40 | Val F1: 0.9071 | Best F1: 0.9071 @ Epoch 40
Trial 2 | Epoch  45 | Val F1: 0.9067 | Best F1: 0.9071 @ Epoch 40
Trial 2 | Epoch  50 | Val F1: 0.9067 | Best F1: 0.9071 @ Epoch 

[I 2025-02-16 13:55:04,899] Trial 2 finished with value: 0.9087185036327099 and parameters: {'hidden_units': 256, 'dropout_rate': 0.4054349899798625, 'lr': 0.004748286329707304, 'batch_size': 64}. Best is trial 2 with value: 0.9087185036327099.


Trial 2 | Epoch  95 | Val F1: 0.9048 | Best F1: 0.9087 @ Epoch 70
Early stopping at epoch 95
=== Completed Trial 2 | Best F1: 0.9087 ===

=== Starting Trial 3 ===
Parameters: {'hidden_units': 512, 'dropout_rate': 0.29459150625105446, 'lr': 0.0007470485226158362, 'batch_size': 256}
Trial 3 | Epoch   0 | Val F1: 0.9010 | Best F1: 0.9010 @ Epoch 0
Trial 3 | Epoch   5 | Val F1: 0.9044 | Best F1: 0.9044 @ Epoch 5
Trial 3 | Epoch  10 | Val F1: 0.9044 | Best F1: 0.9044 @ Epoch 5
