In [77]:
import pandas as pd
import random
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from sklearn.model_selection import cross_val_score

In [78]:
data = pd.read_csv('fines.csv')

In [79]:
data.head()

Unnamed: 0,license_categories,gender,experience,birthday,count_fines,brand,number
0,4,Жен.,20,1979-10-13,48,Audi S4/S4 Avant,BA403 06
1,2,Муж.,1,1987-04-02,14,Peugeot 307 SW,KО9648 154
2,1,Жен.,3,1996-05-24,8,Audi A4 Cabriolet,009D439 32
3,3,Муж.,31,1972-08-10,15,Audi Q5,Т104РM 67
4,2,Муж.,25,1966-12-24,24,Audi A4 Avant,1680EM 91


In [80]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 7 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   license_categories  200000 non-null  int64 
 1   gender              200000 non-null  object
 2   experience          200000 non-null  int64 
 3   birthday            200000 non-null  object
 4   count_fines         200000 non-null  int64 
 5   brand               200000 non-null  object
 6   number              200000 non-null  object
dtypes: int64(3), object(4)
memory usage: 10.7+ MB


In [81]:
# размерность датасета
data.shape

(200000, 7)

In [82]:
# загрубим значение даты рождения
data['birthday_year'] = data["birthday"].apply(lambda e:  round(int(e[0:4]), -1))

In [83]:
data['region'] = data["number"].apply(lambda e: e.split(' ')[1])

In [84]:
# закодируем категориальные признаки
category_cols = ['gender', 'brand', 'region']
dicts = []
for col in category_cols:
  keys, values = pd.factorize(data[col].unique())
  d = dict(zip(values, keys))
  data[col] = data[col].replace(d)
  dicts.append(d)

То что мы будем предсказывать - примерный диапазон штрафов

In [85]:
data['is_high_risk'] = data["count_fines"].apply(lambda e:   e > 12)

Удаляем лишние признаки

In [86]:
data = data.drop(['birthday', 'count_fines'], axis=1)

In [88]:
data = data.drop(['number'], axis=1)

In [89]:
data.head()

Unnamed: 0,license_categories,gender,experience,brand,birthday_year,region,is_high_risk
0,4,0,20,0,1980,0,True
1,2,1,1,1,1990,1,True
2,1,0,3,2,2000,2,False
3,3,1,31,3,1970,3,True
4,2,1,25,4,1970,4,True


In [90]:
data["is_high_risk"].value_counts()

is_high_risk
False    108230
True      91770
Name: count, dtype: int64

Видим, что распределение по классам примерно равномерное

Разделение на обучающую и тестовую выборки

In [91]:
x = data.drop('is_high_risk', axis=1)
y = data['is_high_risk']
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75)

In [92]:
# размерности выборок
print(f"X_train shape: {x_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {x_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (150000, 6)
y_train shape: (150000,)
X_test shape: (50000, 6)
y_test shape: (50000,)


Нормализация входных данных

In [93]:
cols = list(x_train.columns)
scaler = StandardScaler()
scaler.fit(x[cols])
scaled = scaler.transform(x_train[cols])
x_train = pd.DataFrame(scaled, columns=cols)
scaled = scaler.transform(x_test[cols])
x_test = pd.DataFrame(scaled, columns=cols)

Обучение модели

In [94]:
model = RandomForestClassifier()
model.fit(x_train, y_train)
preds = model.predict(x_test)

In [95]:
print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Precision: {precision_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds)}')
print(f'F1-measure: {f1_score(y_test, preds)}')
print(classification_report(y_test, preds))

Accuracy: 0.71954
Precision: 0.6759995254478586
Recall: 0.7456813819577736
F1-measure: 0.7091327705295472
              precision    recall  f1-score   support

       False       0.76      0.70      0.73     27076
        True       0.68      0.75      0.71     22924

    accuracy                           0.72     50000
   macro avg       0.72      0.72      0.72     50000
weighted avg       0.72      0.72      0.72     50000



Отбор признаков методом Forward Selection (Форвард выборка) : Начинает с пустого набора признаков и на каждом шаге добавляет признак, который дает наибольшее улучшение качества модели. Процесс продолжается до тех пор, пока не будет достигнут заранее определенный уровень качества или не будет достигнут предел количества признаков.

In [121]:
from sklearn.feature_selection import SelectPercentile, f_classif

# Выбираем 70% лучших признаков
percentile = 50
selector = SelectPercentile(score_func=f_classif, percentile=percentile)
selector.fit(x_train, y_train)

In [122]:
filtered_model = RandomForestClassifier()
# обучение
filtered_model.fit(selector.transform(x_train), y_train)

In [123]:
filtered_preds = filtered_model.predict(selector.transform(x_test))

In [124]:
print(f'Accuracy: {accuracy_score(y_test, filtered_preds)}')
print(f'Precision: {precision_score(y_test, filtered_preds)}')
print(f'Recall: {recall_score(y_test, filtered_preds)}')
print(f'F1-measure: {f1_score(y_test, filtered_preds)}')
print(classification_report(y_test, filtered_preds))

Accuracy: 0.7322
Precision: 0.6842674913026672
Recall: 0.7722038038736695
F1-measure: 0.7255810140591056
              precision    recall  f1-score   support

       False       0.78      0.70      0.74     27076
        True       0.68      0.77      0.73     22924

    accuracy                           0.73     50000
   macro avg       0.73      0.74      0.73     50000
weighted avg       0.74      0.73      0.73     50000



Видим, что мы смогли достигнуть такой же точности, уменьшим количество признаков вдвое

In [126]:
selector.transform(x_train).shape[1]

3

Подбираем наилучшие гиперпараметры

In [127]:
from sklearn.model_selection import RandomizedSearchCV
grid = {
    'n_estimators': list(range(10, 100, 5)),
    'max_features': list(range(1,selector.transform(x_train).shape[1] + 1)),
    'min_samples_leaf': list(range(1, 15))
}

In [130]:
filtered_model = RandomForestClassifier()
clf = RandomizedSearchCV(filtered_model, grid, random_state=42)
search = clf.fit(selector.transform(x_train), y_train)

In [131]:
search.best_params_

{'n_estimators': 55, 'min_samples_leaf': 6, 'max_features': 3}

Обучаем модель с этими гиперпараметрами

In [134]:
model = RandomForestClassifier(n_estimators=55, min_samples_leaf=6, max_features=3)
model.fit(selector.transform(x_train), y_train)
preds = model.predict(selector.transform(x_test))

In [135]:
print(f'Accuracy: {accuracy_score(y_test, preds)}')
print(f'Precision: {precision_score(y_test, preds)}')
print(f'Recall: {recall_score(y_test, preds)}')
print(f'F1-measure: {f1_score(y_test, preds)}')
print(classification_report(y_test, preds))

Accuracy: 0.73996
Precision: 0.6914261460101867
Recall: 0.7816698656429942
F1-measure: 0.7337837837837837
              precision    recall  f1-score   support

       False       0.79      0.70      0.75     27076
        True       0.69      0.78      0.73     22924

    accuracy                           0.74     50000
   macro avg       0.74      0.74      0.74     50000
weighted avg       0.75      0.74      0.74     50000



Результаты улучшились!