In [1]:
import time
import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.ensemble
import sklearn.model_selection
import joblib

### Загрузка данных

In [2]:
# Load the dataset from a CSV file given by a path relative to the working directory.
data = pd.read_csv('data_2014.csv')

### Обработка пропущенных значений признака `Victim Age`

In [3]:
# Replace the victim age with the sentinel value -1 on every row whose
# `is_victim_age_missing` flag is truthy; other rows keep their age.
# Vectorised with Series.mask instead of a row-wise DataFrame.apply, which
# invokes a Python lambda once per row.
data['Victim Age'] = data['Victim Age'].mask(data['is_victim_age_missing'].astype(bool), -1)

In [4]:
# Columns passed through the ordinal encoder (order defines the column order of X).
categorical_features = [
    'Agency Type',
    'State',
    'City',
    'Crime Type',
    'Victim Sex',
    'Victim Race',
    'Victim Ethnicity',
    'Weapon',
    'is_victim_age_missing',
]
# Columns appended to X as-is, after the encoded categorical columns.
other_features = [
    'Victim Age',
    'Victim Count',
    'Perpetrator Count',
]
# Name of the label column.
target = ['Crime Solved']

### Кодирование категориальных признаков

In [5]:
# Map every categorical column to ordinal codes; the fitted encoder is kept
# in `encoder` and the encoded matrix in `X`.
encoder = sklearn.preprocessing.OrdinalEncoder()
categorical_values = data[categorical_features].values
X = encoder.fit(categorical_values).transform(categorical_values)

In [6]:
# Assemble the design matrix: encoded categorical columns first, then the
# remaining feature columns unchanged, all cast to float32.
# The categorical part is recomputed from the already-fitted `encoder` rather
# than read back from `X`, so re-running this cell does not stack the
# extra columns onto X a second time.
X_categorical = encoder.transform(data[categorical_features].values)
X = np.hstack([X_categorical, data[other_features].values]).astype(np.float32)

# Binary label: 1.0 where 'Crime Solved' equals 'No', 0.0 otherwise.
y = (data['Crime Solved'] == 'No').values.astype(np.float32)

# Persist both arrays (np.save appends the .npy extension).
np.save('X', X)
np.save('y', y)

### Подбор гиперпараметров

In [None]:
def print_results(grid_search_cv, metrics=['f1', 'precision', 'recall']):
    """Print the winning hyperparameters and their cross-validated scores.

    Args:
        grid_search_cv: object exposing ``best_params_``, ``best_index_`` and
            ``cv_results_`` (the latter indexed by ``mean_test_<metric>`` and
            ``std_test_<metric>`` keys).
        metrics: names of the metrics to report, one output line per metric.

    Returns:
        None. Everything is written to standard output.
    """
    separator = '=' * 80
    print(separator)
    print(grid_search_cv.best_params_)
    print(separator)

    best = grid_search_cv.best_index_
    cv_results = grid_search_cv.cv_results_
    for name in metrics:
        # Mean and standard deviation of the test score for the best candidate.
        mean_score = cv_results[f'mean_test_{name}'][best]
        std_score = cv_results[f'std_test_{name}'][best]
        print(name, '=', mean_score, '±', std_score)


# Exhaustive search space: 2 * 9 * 3 * 5 * 5 * 6 = 8100 candidates,
# each evaluated with 5-fold cross-validation.
param_grid = dict(
    bootstrap=[True, False],
    max_depth=list(range(7, 16)),
    max_features=['sqrt', 'log2', None],
    min_samples_leaf=[1, 3, 6, 9, 12],
    min_samples_split=[2, 3, 6, 9, 12],
    n_estimators=[10, 50, 100, 200, 400, 800],
)

# NOTE(review): both the forest and the search request n_jobs=-1; nested
# parallelism like this may oversubscribe the CPU — confirm it is intended.
rfc = sklearn.ensemble.RandomForestClassifier(random_state=0, n_jobs=-1)
grid_search = sklearn.model_selection.GridSearchCV(
    rfc,
    param_grid,
    scoring=['f1', 'precision', 'recall'],
    refit='f1',  # the final estimator is refitted with the best-F1 parameters
    cv=5,
    n_jobs=-1,
    verbose=3,
)
grid_search.fit(X, y)

Fitting 5 folds for each of 8100 candidates, totalling 40500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  96 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 256 tasks      | elapsed:   27.3s
[Parallel(n_jobs=-1)]: Done 480 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1120 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1536 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 2016 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2560 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 3168 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done 3840 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 4576 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-1)]: Done 5376 tasks      | elapsed: 11.7min
[Parallel(n_jobs=-1)]: Done 6240 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 7168 tasks      | elapsed: 16.3min
[Parallel(n_jobs=-1)]: Done 8160 tasks      |

GridSearchCV(cv=5, estimator=RandomForestClassifier(n_jobs=-1, random_state=0),
             n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'max_depth': [7, 8, 9, 10, 11, 12, 13, 14, 15],
                         'max_features': ['sqrt', 'log2', None],
                         'min_samples_leaf': [1, 3, 6, 9, 12],
                         'min_samples_split': [2, 3, 6, 9, 12],
                         'n_estimators': [10, 50, 100, 200, 400, 800]},
             refit='f1', scoring=['f1', 'precision', 'recall'], verbose=3)

In [None]:
# Report the best hyperparameter combination and its mean ± std test scores.
print_results(grid_search)

{'bootstrap': False, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 3, 'n_estimators': 800}
f1 = 0.7454823650838137 ± 0.006187559186625962
precision = 0.7174443993645698 ± 0.00918098288375578
recall = 0.7762866937041328 ± 0.01873953277528437


### Быстродействие. Время обучения модели с оптимальными гиперпараметрами.

In [7]:
# Hyperparameters pinned explicitly; they match the best_params_ printed by
# the grid search above.
best_params = dict(
    bootstrap=False,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=3,
    n_estimators=800,
)
rfc = sklearn.ensemble.RandomForestClassifier(n_jobs=-1, random_state=0, **best_params)

In [None]:
# Time N_RUNS full fits of `rfc` on (X, y), storing (start, stop) pairs.
# time.perf_counter() is used instead of time.time(): it is monotonic and
# has the highest available resolution, so intervals are not distorted by
# system clock adjustments. Only differences between the two readings are
# meaningful.
N_RUNS = 31
ticks = []
for _ in range(N_RUNS):
    start = time.perf_counter()
    rfc.fit(X, y)
    stop = time.perf_counter()
    ticks.append((start, stop))

In [None]:
# Per-run durations in seconds; the first measurement is discarded.
fit_times = np.array([end - begin for begin, end in ticks[1:]])
print('Mean fit time:', np.mean(fit_times))
# ddof=1 -> sample (unbiased) standard deviation.
print('Standard deviation:', np.std(fit_times, ddof=1))

Mean fit time: 4.3143184900283815
Standard deviation: 0.025189985097125632


In [8]:
# Serialise the classifier object to 'random_forest_classifier.joblib'.
# NOTE(review): this cell does not fit `rfc` itself; it relies on the
# timing loop above having been run first — confirm before dumping.
joblib.dump(rfc, 'random_forest_classifier.joblib')    

['random_forest_classifier.joblib']

### Быстродействие. Время вычисления предсказания модели с оптимальными гиперпараметрами для одного объекта.  

In [None]:
# Time a single-object prediction for every row of X, storing (start, stop)
# pairs. time.perf_counter() replaces time.time(): it is monotonic and
# high-resolution, so short intervals are not distorted by system clock
# adjustments. Only differences between the two readings are meaningful.
ticks = []
for x in X:
    # predict() expects a 2-D array, so turn the row into shape (1, n_features).
    x = x[np.newaxis, :]
    start = time.perf_counter()
    rfc.predict(x)
    stop = time.perf_counter()
    ticks.append((start, stop))

In [None]:
# Per-call durations in seconds; the first measurement is discarded.
predict_times = np.array([end - begin for begin, end in ticks[1:]])
print('Mean predict time:', np.mean(predict_times))
# ddof=1 -> sample (unbiased) standard deviation.
print('Standard deviation:', np.std(predict_times, ddof=1))

Mean predict time: 0.20366137575813298
Standard deviation: 0.011623671027443725


### Важности признаков модели с оптимальными гиперпараметрами.

In [36]:
# Pair each feature name with its importance (names follow the column order
# used to build X) and list them from least to most important.
sorted(
    zip(categorical_features + other_features, rfc.feature_importances_),
    key=lambda name_and_score: name_and_score[1],
)

[('is_victim_age_missing', 0.01262293551878417),
 ('Crime Type', 0.017220495698486875),
 ('Victim Count', 0.023767516895250307),
 ('Victim Ethnicity', 0.04259880078933585),
 ('Agency Type', 0.04910306334559364),
 ('Victim Age', 0.07720096225661899),
 ('City', 0.08409714843119104),
 ('State', 0.09994500391575852),
 ('Victim Sex', 0.10229358032343253),
 ('Victim Race', 0.11187816035438954),
 ('Weapon', 0.13044784958199024),
 ('Perpetrator Count', 0.24882448288916836)]

In [None]:
# Print the CPU description of the current machine (reads /proc/cpuinfo via the shell).
! cat /proc/cpuinfo

processor	: 0
vendor_id	: GenuineIntel
cpu family	: 6
model		: 63
model name	: Intel(R) Xeon(R) CPU @ 2.30GHz
stepping	: 0
microcode	: 0x1
cpu MHz		: 2300.000
cache size	: 46080 KB
physical id	: 0
siblings	: 2
core id		: 0
cpu cores	: 1
apicid		: 0
initial apicid	: 0
fpu		: yes
fpu_exception	: yes
cpuid level	: 13
wp		: yes
flags		: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid xsaveopt arat md_clear arch_capabilities
bugs		: cpu_meltdown spectre_v1 spectre_v2 spec_store_bypass l1tf mds swapgs
bogomips	: 4600.00
clflush size	: 64
cache_alignment	: 64
address sizes	: 46 bits physical, 48 bits virtual
power management:

processor	: