In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.model_selection import cross_val_score

from data_reader import read_data
from data_preprocessing import preprocess_data
from feature_extractor import extract_features

%matplotlib inline

In [2]:
data = read_data()
data = preprocess_data(data, limit=10000)
data.head()

Unnamed: 0,id,Title,Uri,PublicationDateTimeUTC,ProcedureDisplayName,Amount,RubPrice,CurrencyCode,StatusDisplayName,StatusCode,SuppliersCount,IsWinner,Inn,Kpp,Name,Ogrn,ResultClass,Ogrn1
0,100600000117000008,услуги по капитальному ремонту,https://zakupki.kontur.ru/0100600000117000008,2017-07-27,Закупка у единственного поставщика (подрядчика...,352000.0,352000.0,RUB,Размещение отменено,3.0,0.0,0.0,4720008346,,МЕСТНАЯ АДМИНИСТРАЦИЯ МУНИЦИПАЛЬНОГО ОБРАЗОВАН...,1024702184451,2.0,47
1,101100000116000162,Масло сладко-сливочное несоленое Крестьянское,https://zakupki.kontur.ru/0101100000116000162,2016-10-24,Электронный аукцион,3855260.0,3855260.0,RUB,Размещение отменено,3.0,0.0,0.0,275006455,,УПРАВЛЕНИЕ ФЕДЕРАЛЬНОЙ СЛУЖБЫ ИСПОЛНЕНИЯ НАКАЗ...,1020202776714,2.0,2
2,101100000117000153,Индивидуальный рацион питания для повседневной...,https://zakupki.kontur.ru/0101100000117000153,2017-09-25,Электронный аукцион,3600000.0,3600000.0,RUB,Размещение отменено,3.0,0.0,0.0,275006455,,УПРАВЛЕНИЕ ФЕДЕРАЛЬНОЙ СЛУЖБЫ ИСПОЛНЕНИЯ НАКАЗ...,1020202776714,2.0,2
3,101100000117000154,Консервы мясные,https://zakupki.kontur.ru/0101100000117000154,2017-09-25,Электронный аукцион,15400000.0,15400000.0,RUB,Размещение отменено,3.0,0.0,0.0,275006455,,УПРАВЛЕНИЕ ФЕДЕРАЛЬНОЙ СЛУЖБЫ ИСПОЛНЕНИЯ НАКАЗ...,1020202776714,2.0,2
4,101100000416000025,Право заключения государственного контракта на...,https://zakupki.kontur.ru/0101100000416000025,2016-05-30,Открытый конкурс,79196.94,79196.94,RUB,Размещение отменено,3.0,0.0,0.0,278103383,,УПРАВЛЕНИЕ ФЕДЕРАЛЬНОЙ СЛУЖБЫ ПО НАДЗОРУ В СФЕ...,1040204605154,2.0,2


In [3]:
# def GetBalancedSample(data, count):
#     cancel_count = min(count // 3, data[data.ResultClass == 2].shape[0])
#     success_count = min((count - cancel_count) // 2, data[data.ResultClass == 1].shape[0])
#     unsuccess_count = count - cancel_count - success_count
#     balanced_sample = pd.concat([data[data.ResultClass == 2][:cancel_count],
#                                  data[data.ResultClass == 1][:success_count],
#                                  data[data.ResultClass == 0][:unsuccess_count]])
#     print(balanced_sample.groupby('ResultClass').size())
#     return balanced_sample

# GetBalancedSample(data, 10000)

In [4]:
data.groupby('ResultClass').size()

ResultClass
0.0    3334
1.0    3333
2.0    3333
dtype: int64

In [5]:
data.corr()

Unnamed: 0,Amount,RubPrice,StatusCode,SuppliersCount,IsWinner,ResultClass,Ogrn1
Amount,1.0,1.0,0.049623,-0.00732,-0.008976,0.049587,0.00286
RubPrice,1.0,1.0,0.049623,-0.00732,-0.008976,0.049587,0.00286
StatusCode,0.049623,0.049623,1.0,-0.397528,-0.456661,0.866025,0.476021
SuppliersCount,-0.00732,-0.00732,-0.397528,1.0,0.811654,-0.006131,-0.152609
IsWinner,-0.008976,-0.008976,-0.456661,0.811654,1.0,0.033552,-0.164021
ResultClass,0.049587,0.049587,0.866025,-0.006131,0.033552,1.0,0.43844
Ogrn1,0.00286,0.00286,0.476021,-0.152609,-0.164021,0.43844,1.0


In [6]:
data_head = data
features = extract_features(data_head)[0]

oh CurrencyCode
oh ProcedureDisplayName
oh Ogrn
ng Title
ng ProcedureDisplayName
ng OrgName


In [10]:
features = pd.DataFrame(features)

In [11]:
features.fillna(0, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(features.drop('ResultClass', axis=1).values,
                            features.ResultClass.values,
                            test_size=0.3,
                            random_state=123456)

In [12]:
# features = pd.DataFrame(features)
# X_train = features.drop(['ResultClass'], 1)
# y_train = features.ResultClass

In [13]:
# rf = RF(n_estimators=10, n_jobs=-1)
# rf = rf.fit(x_train, y_train)
# np.mean(cross_val_score(rf, x_train, y_train))

In [14]:
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
lgb_eval = lgb.Dataset(X_test, label=y_test, reference=lgb_train, free_raw_data=False)

In [15]:
params = {
        'objective': 'multiclass',
        'metric': 'multi_error',
        'num_classes': 3,
        'feature_fraction': 0.9,
        'feature_fraction_seed': 123456,
        'bagging_fraction': 0.9,
        'bagging_freq': 1,
        'bagging_seed': 123456,
        'max_depth': 8,
        'learning_rate': 0.1,
        'min_data_in_leaf': 11,
        'num_iteration': 100,
        'num_threads': 2
    }

In [16]:
gbm = lgb.train(params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_eval])  # eval training data
#                     feature_name=features.drop('ResultClass', axis=1).columns.values)

gbm.save_model(f'result.txt')



[1]	training's multi_error: 0.0791429	valid_1's multi_error: 0.105667
[2]	training's multi_error: 0.073	valid_1's multi_error: 0.102667
[3]	training's multi_error: 0.0688571	valid_1's multi_error: 0.1
[4]	training's multi_error: 0.0714286	valid_1's multi_error: 0.0976667
[5]	training's multi_error: 0.0698571	valid_1's multi_error: 0.102
[6]	training's multi_error: 0.064	valid_1's multi_error: 0.098
[7]	training's multi_error: 0.0612857	valid_1's multi_error: 0.0943333
[8]	training's multi_error: 0.0622857	valid_1's multi_error: 0.0966667
[9]	training's multi_error: 0.0591429	valid_1's multi_error: 0.0926667
[10]	training's multi_error: 0.0574286	valid_1's multi_error: 0.094
[11]	training's multi_error: 0.0555714	valid_1's multi_error: 0.0916667
[12]	training's multi_error: 0.0555714	valid_1's multi_error: 0.0906667
[13]	training's multi_error: 0.0545714	valid_1's multi_error: 0.0936667
[14]	training's multi_error: 0.0541429	valid_1's multi_error: 0.091
[15]	training's multi_error: 0.05

In [17]:
num_round = 12
lgb.cv(params, lgb_train, num_round, nfold=5)

{'multi_error-mean': [0.094571897770293,
  0.09314220265341354,
  0.09228546650326525,
  0.08642658596426775,
  0.08785668778814179,
  0.08771250447749149,
  0.0881425037506047,
  0.08671291162149214,
  0.08557046162263562,
  0.08442780783327578,
  0.08428413552840516,
  0.08399974609946578],
 'multi_error-stdv': [0.007852546247737732,
  0.0055321708320201935,
  0.0037566893193274587,
  0.005463710803654207,
  0.002948385473230661,
  0.0033749924479041992,
  0.002179960694221576,
  0.0030103491501607006,
  0.0031491924351767786,
  0.003122618610625163,
  0.0044866421646252855,
  0.004051300436048929]}

In [36]:
from xgboost import XGBClassifier

# fit model no training data
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
np.mean(cross_val_score(xgb, X_train, y_train, cv=5))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


0.9164262786033228

In [37]:
from sklearn.ensemble import RandomForestClassifier as RF

rf = RF(n_estimators=200, max_features='auto', n_jobs=-1)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [40]:
def get_metric(X_test, y_test, model):
    y_pred = None
    if "predict_proba" in dir(model):
        y_pred = model.predict_proba(X_test)
    else:
        y_pred = model.predict(X_test)
    test = pd.DataFrame(y_test, columns=['test'])
    test['prediction'] = [np.argmax(x) for x in y_pred]
    test['is_same'] = test.apply(lambda row: int(row[0] == row[1]), axis=1)
    f = {'is_same': ['sum', 'size', 'mean']}
    groups = test.groupby('test').agg(f)
    print(sum(test['is_same']) / test.shape[0])
    return groups['is_same'].sort_values('mean', ascending=False)

In [41]:
get_metric(X_test, y_test, gbm)

0.9186666666666666


Unnamed: 0_level_0,sum,size,mean
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,959,1025,0.93561
0.0,908,988,0.919028
1.0,889,987,0.900709


In [42]:
get_metric(X_test, y_test, xgb)

0.9083333333333333


Unnamed: 0_level_0,sum,size,mean
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,952,1025,0.92878
0.0,887,988,0.897773
1.0,886,987,0.89767


In [43]:
get_metric(X_test, y_test, rf)

0.9166666666666666


Unnamed: 0_level_0,sum,size,mean
test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,950,1025,0.926829
1.0,904,987,0.915907
0.0,896,988,0.906883


In [None]:
# table = pd.crosstab(predicted, y_test, colnames=['Actual Results'], rownames=['Predicted Results'])
# table

In [None]:
# recall = {}
# precision = {}
# for column in table.columns:
#     recall[column] = table[column][column] / table.apply(lambda row: sum(row), axis=0)[column]
#     precision[column] = table[column][column] / table.apply(lambda row: sum(row), axis=1)[column]
# recall

In [None]:
# precision

In [None]:
# F = {}
# for column in table.columns:
#     F[column] = 2 * (recall[column] * precision[column]) / (recall[column] + precision[column])
# F