In [1]:
# Load the watermark extension and print the interpreter / OS / hardware summary for this run.
%load_ext watermark
%watermark

Last updated: 2022-08-27T17:19:58.946852+03:00

Python implementation: CPython
Python version       : 3.10.4
IPython version      : 8.4.0

Compiler    : GCC 7.5.0
OS          : Linux
Release     : 5.15.0-46-generic
Machine     : x86_64
Processor   : x86_64
CPU cores   : 8
Architecture: 64bit



In [2]:
import time
# Timestamp (seconds since the epoch) taken when this cell runs;
# presumably used later to report the total run time — not referenced in the visible cells.
notebookstart= time.time()

In [3]:
import os
from typing import List, Tuple, Optional

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostRegressor
from catboost import Pool, cv

  from pandas import MultiIndex, Int64Index


In [4]:
# Print the versions of the packages imported so far.
%watermark --iversions

seaborn   : 0.11.2
numpy     : 1.23.2
pandas    : 1.4.3
matplotlib: 3.5.3



In [5]:
# catboost does not appear in the `%watermark --iversions` listing, so report its version explicitly.
from catboost import __version__ as cb_version
print(f'cb_version: {cb_version}')

cb_version: 1.0.6


Блок для воспроизводимости результатов

In [6]:
# PyTorch seeding is kept for reference only; torch is not imported in the visible cells.
# seed the RNG for all devices (both CPU and CUDA)
#torch.manual_seed(1984)

# Disabling the benchmarking feature causes cuDNN to deterministically select an algorithm,
# possibly at the cost of reduced performance.
#torch.backends.cudnn.benchmark = False

# Python's built-in RNG (for custom operators)
import random
random.seed(5986721)

# NumPy's global RNG
np.random.seed(62185)

# sklearn takes its seed from the line above

# Per-library seeds for the gradient-boosting models.
# NOTE(review): none of these constants is referenced in the cells visible here —
# confirm whether they are meant to be passed to the models.
CB_RANDOMSEED  = 309487
XGB_RANDOMSEED = 56
LGB_RANDOMSEED = 874256

In [7]:
# Input data lives under ./data, generated files under ./subm
# (both resolved against the current working directory).
DIR_DATA = os.path.join(os.getcwd(), 'data')
DIR_DATA_TRAIN = os.path.join(DIR_DATA, 'train')
DIR_DATA_TEST = os.path.join(DIR_DATA, 'test')

DIR_SUBM = os.path.join(os.getcwd(), 'subm')
DIR_SUBM_TRAIN = os.path.join(DIR_SUBM, 'train')

In [8]:
def plot_feature_importance(importance, names: List[str], model_type:str, imp_number: Optional[int] = 30) -> None:
    """Draw a bar chart of the top-ranked feature importances.

    Args:
        importance: Importance score per feature, in the same order as ``names``.
        names: Feature names.
        model_type: Label prepended to the chart title.
        imp_number: How many of the highest-ranked features to plot;
            ``None`` plots all of them.

    Returns:
        None. The chart is drawn on a newly created 10x8 matplotlib figure,
        with importance on the x axis and feature names on the y axis.
    """
    # Pair every feature name with its score in one table.
    ranking = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    })

    # Highest importance first, then keep only the leading rows.
    ranking = ranking.sort_values(by=['feature_importance'], ascending=False)
    top = ranking.iloc[:imp_number]

    plt.figure(figsize=(10, 8))
    sns.barplot(x=top['feature_importance'], y=top['feature_names'])

    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

NameError: name 'List' is not defined

# Загрузка данных

In [None]:
# Read the train and test tables from DIR_DATA.
train_df = pd.read_csv(os.path.join(DIR_DATA, 'train_upd.csv'))
test_df  = pd.read_csv(os.path.join(DIR_DATA, 'test_upd.csv'))
# Cell output: (rows, columns) of both tables.
train_df.shape, test_df.shape

In [None]:
# Preview the first rows of the training table.
train_df.head()

Исключаем из обучающей выборки сильно размытые кадры из подземки.   
На них размеры определяются явно некорректно, что помешает при обучении.

In [None]:
# Heavily motion-blurred frames that are excluded from training.
# 'img_2734.heic' may actually be usable.
motion_blur_train = {'img_2709.heic', 'img_2733.heic', 'img_2734.heic'}
print(train_df.shape)

# One vectorised mask instead of dropping rows one at a time inside a loop:
# it does not depend on index labels being unique. The original index labels are kept.
# .copy() yields an independent frame, so adding columns to it later does not
# raise SettingWithCopyWarning.
train_df = train_df.loc[~train_df['image_name'].isin(motion_blur_train)].copy()

print(train_df.shape)

# Обучаем модель

cv на 3 фолда.   
по ним выбираем лучшее количество итераций по RMSE.

In [None]:
%%time
# CatBoost settings for cross-validation: up to 3500 boosting iterations, RMSE loss.
params = {"iterations": 3500,
          "loss_function": 'RMSE',
          #'eval_metric': 'R2',
         }

# Alternative, wider feature set kept for reference:
#features = ['log_x_min', 'log_y_min', 'log_x_max', 'log_y_max', 'conf', 'log_h', 'log_w']
features = ['log_plate_h', 'log_plate_w']

# Training pool: the selected feature columns with 'distance' as the target.
train = Pool(data = train_df[features],
             label = train_df[['distance']],
            )

# 3-fold cross-validation; plot = True requests CatBoost's training plot.
# NOTE(review): CB_RANDOMSEED is not passed in params — confirm the default seed is intended.
scores = cv(train, params,
            fold_count = 3,
            verbose = False,
            plot = True,
           )

выбираем оптимальное количество итераций

In [None]:
# Mean test RMSE column of the cross-validation results.
rmse_mean = scores['test-RMSE-mean']
# Zero-based position of the lowest mean test RMSE.
best_iter = rmse_mean.argmin()
# NOTE(review): the +13 offset is not explained anywhere in the notebook — confirm why it is added.
niter = best_iter + 13
rmse_mean.min(), best_iter, niter

Обучаем на этом количестве итераций модель на всей обучающей выборке

In [None]:
%%time

# Final model: iteration count taken from `niter`, progress logged every 100 iterations.
model_cb = CatBoostRegressor(iterations = niter, verbose = 100)
# Fit on the full training table, using the `features` columns and 'distance' as the target.
model_cb.fit(train_df[features], train_df[['distance']].values)

Посмотрим на важность признаков

In [None]:
# Feature names are taken from the `features` list the model was fitted on.
plot_feature_importance(model_cb.get_feature_importance(), features, 'CATBOOST')

Предскажем на трейне для просмотра ошибок в 5step

In [None]:
# Predict on the training rows and store the result next to the inputs.
pred_train = model_cb.predict(train_df[features])
train_df['pred'] = pred_train

# to_csv does not create missing folders, so make sure the target directory exists first.
os.makedirs(DIR_SUBM_TRAIN, exist_ok=True)
train_df.to_csv(os.path.join(DIR_SUBM_TRAIN, 'train_with_pred.csv'), index = False)

Предсказываем для теста.    
Оформляем сабмит.

In [None]:
# Predict distances for the test rows and store them in a new 'distance' column.
preds = model_cb.predict(test_df[features])
test_df['distance'] = preds

# Keep only the two columns that go into the submission file.
sample_solution_df = test_df[['image_name', 'distance']]

In [None]:
# Names of all entries found in the test image folder.
test_img_names = set(os.listdir(DIR_DATA_TEST))

In [None]:
# Entries of the test folder that have no prediction row get a placeholder distance of 0.
# Sorting the names makes the row order of the output file reproducible between runs
# (plain set iteration order is not).
predicted_names = set(sample_solution_df['image_name'].values)
lost_test_items = [[file_name, 0] for file_name in sorted(test_img_names - predicted_names)]

lost_test_items_df = pd.DataFrame(lost_test_items, columns=['image_name', 'distance'])
sample_solution_df = pd.concat([sample_solution_df, lost_test_items_df])

# to_csv does not create missing folders, so make sure the output directory exists first.
os.makedirs(DIR_SUBM, exist_ok=True)
# Earlier submission file name, kept for reference: '17_palte_minhw_cntr_niter_nomb.csv'
sample_solution_df.to_csv(os.path.join(DIR_SUBM, '18_repeat.csv'), sep=';', index=False)