In [44]:
import numpy as np

from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from data_reader import read_data
from data_preprocessing import preprocess_data
from feature_extractor import extract_features

%matplotlib inline

In [33]:
import warnings
warnings.filterwarnings('ignore')

In [34]:
data = read_data()
data = preprocess_data(data)
data.head()

Unnamed: 0,id,Title,Uri,PublicationDateTimeUTC,ProcedureDisplayName,Amount,RubPrice,CurrencyCode,StatusDisplayName,StatusCode,SuppliersCount,IsWinner,Inn,Kpp,Name,Ogrn,ResultClass,Ogrn1
0,100600000117000001,"Закупки товаров, работ, услуг в соответствии с...",https://zakupki.kontur.ru/0100600000117000001,2017-04-13,Закупка у единственного поставщика (подрядчика...,1310000.0,1310000.0,RUB,Размещение завершено,2.0,0.0,0.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,0.0,47
1,100600000117000002,"Закупки товаров, работ, услуг в соответствии с...",https://zakupki.kontur.ru/0100600000117000002,2017-04-13,Закупка у единственного поставщика (подрядчика...,249000.0,249000.0,RUB,Размещение завершено,2.0,0.0,0.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,0.0,47
2,100600000117000003,"Закупки товаров, работ, услуг в соответствии с...",https://zakupki.kontur.ru/0100600000117000003,2017-07-06,Закупка у единственного поставщика (подрядчика...,250000.0,250000.0,RUB,Размещение завершено,2.0,0.0,0.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,0.0,47
3,100600000117000004,Выполнение работ по ремонту автомобильной доро...,https://zakupki.kontur.ru/0100600000117000004,2017-07-06,Запрос котировок,400300.0,400300.0,RUB,Размещение завершено,2.0,2.0,1.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,1.0,47
4,100600000117000005,Приобретение и установка спортивного оборудования,https://zakupki.kontur.ru/0100600000117000005,2017-07-10,Запрос котировок,451000.0,451000.0,RUB,Размещение завершено,2.0,1.0,0.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,0.0,47


In [42]:
g = data.groupby('SuppliersCount')
data_balanced = g.apply(lambda x: x.head(900)).sample(frac=1).reset_index(drop=True)
data_balanced

Unnamed: 0,id,Title,Uri,PublicationDateTimeUTC,ProcedureDisplayName,Amount,RubPrice,CurrencyCode,StatusDisplayName,StatusCode,SuppliersCount,IsWinner,Inn,Kpp,Name,Ogrn,ResultClass,Ogrn1
0,0101100000917000013,Оказание услуг по ремонту (с использованием не...,https://zakupki.kontur.ru/0101100000917000013,2017-02-27,Электронный аукцион,1000000.00,1000000.00,RUB,Размещение завершено,2.0,1.0,0.0,0275006462,,МИНИСТЕРСТВО ВНУТРЕННИХ ДЕЛ ПО РЕСПУБЛИКЕ БАШК...,1020202771357,0.0,2
1,0101200009516002204,Электронный аукцион [g012447241/2922g] на прав...,https://zakupki.kontur.ru/0101200009516002204,2016-06-15,Электронный аукцион,880000.00,880000.00,RUB,Размещение завершено,2.0,8.0,1.0,0278176470,,ГОСУДАРСТВЕННОЕ КАЗЕННОЕ УЧРЕЖДЕНИЕ УПРАВЛЕНИЕ...,1110280008475,1.0,2
2,0101200009517000720,Закупка лекарственного препарата- Эноксапарин ...,https://zakupki.kontur.ru/0101200009517000720,2017-04-26,Электронный аукцион,1250856.00,1250856.00,RUB,Размещение завершено,2.0,4.0,1.0,0276008991,,ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ УЧРЕЖДЕНИЕ ЗДРАВООХР...,1020202867178,1.0,2
3,0101200002317000421,Капитальный ремонт кровли МБОУ СОШ №11 по адре...,https://zakupki.kontur.ru/0101200002317000421,2017-06-15,Электронный аукцион,560000.00,560000.00,RUB,Размещение завершено,2.0,6.0,1.0,0261009745,,МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ ОБЩЕОБРАЗОВАТЕЛЬНОЕ УЧ...,1020201773745,1.0,2
4,0101200009516004030,Электронный аукцион [g000447241/5272g] на прав...,https://zakupki.kontur.ru/0101200009516004030,2016-10-06,Электронный аукцион,742160.00,742160.00,RUB,Размещение завершено,2.0,5.0,1.0,0261014223,,ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ УЧРЕЖДЕНИЕ РЕСПУБЛИК...,1060261000392,1.0,2
5,0102200001617002133,Выполнение работ по замене окон,https://zakupki.kontur.ru/0102200001617002133,2017-05-22,Электронный аукцион,1000002.80,1000002.80,RUB,Размещение завершено,2.0,7.0,1.0,0305000196,,ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ ОБЩЕОБРАЗОВАТЕЛЬНОЕ ...,1020300567671,1.0,3
6,0109200002417001445,закупка лекарственных препаратов,https://zakupki.kontur.ru/0109200002417001445,2017-06-28,Электронный аукцион,252337.80,252337.80,RUB,Размещение завершено,2.0,8.0,1.0,1325031645,,ГОСУДАРСТВЕННОЕ БЮДЖЕТНОЕ УЧРЕЖДЕНИЕ ЗДРАВООХР...,1021300980073,1.0,13
7,0101200008017000002,оказание услуг по оценке рыночной стоимости зе...,https://zakupki.kontur.ru/0101200008017000002,2017-03-07,Запрос котировок,14500.00,14500.00,RUB,Размещение завершено,2.0,7.0,1.0,0274045532,,МИНИСТЕРСТВО ЗЕМЕЛЬНЫХ И ИМУЩЕСТВЕННЫХ ОТНОШЕН...,1020202552920,1.0,2
8,0111300005116000435,Право заключения муниципального контракта на о...,https://zakupki.kontur.ru/0111300005116000435,2016-03-25,Электронный аукцион,586800.00,586800.00,RUB,Размещение завершено,2.0,13.0,1.0,1651028784,,МУНИЦИПАЛЬНОЕ БЮДЖЕТНОЕ ОБЩЕОБРАЗОВАТЕЛЬНОЕ УЧ...,1021602504263,1.0,16
9,0101100007016000219,Оказание услуг по вывозу снега,https://zakupki.kontur.ru/0101100007016000219,2016-09-26,Запрос котировок,215877.90,215877.90,RUB,Размещение завершено,2.0,2.0,0.0,0274075880,,Управление Федеральной службы безопасности Рос...,1030203893697,0.0,2


In [36]:
data_balanced.groupby('SuppliersCount').size()

SuppliersCount
0.0     900
1.0     900
2.0     900
3.0     900
4.0     900
5.0     900
6.0     900
7.0     900
8.0     900
9.0     900
10.0    448
11.0    148
12.0     73
13.0     58
14.0     75
15.0     12
16.0     27
17.0      3
18.0      4
19.0      2
20.0      1
dtype: int64

In [37]:
features, _ = extract_features(data_balanced)
features.fillna(0, inplace=True)
features.drop('ResultClass', axis=1, inplace=True)
features.head()

oh CurrencyCode
oh ProcedureDisplayName
oh Ogrn
ng Title
ng ProcedureDisplayName
ng OrgName


Unnamed: 0,RubPrice,SuppliersCount,Amount,OneHot_CurrencyCode_RUB,"OneHot_ProcedureDisplayName_Закупка у единственного поставщика (подрядчика, исполнителя)",OneHot_ProcedureDisplayName_Запрос котировок,OneHot_ProcedureDisplayName_Запрос предложений,OneHot_ProcedureDisplayName_Конкурс с ограниченным участием,OneHot_ProcedureDisplayName_Открытый конкурс,OneHot_ProcedureDisplayName_Повторный конкурс,...,Ngrams_Name_290,Ngrams_Name_291,Ngrams_Name_292,Ngrams_Name_293,Ngrams_Name_294,Ngrams_Name_295,Ngrams_Name_296,Ngrams_Name_297,Ngrams_Name_298,Ngrams_Name_299
0,1420000.0,2.0,1420000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,85550.0,6.0,85550.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2896003.82,5.0,2896003.82,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9949.67,0.0,9949.67,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,24745.12,1.0,24745.12,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
X = features.drop('SuppliersCount', axis=1)
y = features.SuppliersCount
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123456)

In [39]:
np.mean(cross_val_score(XGBRegressor(), X_train, y_train, cv=5, scoring=make_scorer(r2_score)))

0.7441160261300808

In [43]:
cat = CatBoostRegressor(verbose=0)
np.mean(cross_val_score(cat, X_train, y_train, cv=5, scoring=make_scorer(r2_score)))

0.7670811281064502

In [47]:
param_grid = {
    'min_child_weight':[2, 3, 4, 5], 
    'gamma':[i / 10.0 for i in range(1, 4)],  
    'subsample':[i / 10.0 for i in range(9, 11)],
    'colsample_bytree':[i / 10.0 for i in range(9, 11)], 
    'max_depth': [4, 5, 6, 7]
}

xgb_reg= XGBRegressor(n_estimators=100, random_state=42) 

grid = GridSearchCV(xgb_reg, param_grid, cv=5, verbose=10, scoring="r2", n_jobs=-1)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   28.0s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   30.6s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   58.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed: 1

{'colsample_bytree': 1.0, 'gamma': 0.2, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.9}
0.7841377786568853
