## Установка и импорт библиотек ##



In [2]:
!pip install catboost



In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score

import catboost as cb
from catboost import Pool, CatBoostClassifier
from sklearn.model_selection import GridSearchCV

from itertools import product, chain

#Требуется добавить файл paramsearch.py в проект
#from paramsearch import paramsearch

## Загрузка данных ##



In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
data = pd.read_csv('/content/drive/MyDrive/Хакатон/dataset2015_2019.csv')

In [None]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,centroid,x,y,RotationLength,RotationCount,CODE_CULTU_2015,CODE_CULTU_2016,CODE_CULTU_2017,CODE_CULTU_2018,CODE_CULTU_2019,CODE_GROUP_2015,CODE_GROUP_2016,CODE_GROUP_2017,CODE_GROUP_2018,CODE_GROUP_2019,cultuCode,groupCode,hzs,kg_id,ff_id,1 Nearest Neigbour Point,2 Nearest Neigbour Point,3 Nearest Neigbour Point,4 Nearest Neigbour Point
0,0,Point (907753.82834152 6554634.46488151),907753.828342,6554634.0,1,5,PPH,PPH,PPH,PPH,PPH,18,18,18,18,18,aaaaa,aaaaa,7b,Dfb,ff12,Point (907748.69100314 6554212.06490037),Point (907410.62452802 6554909.79074054),Point (908089.34400838 6554953.05455597),Point (907563.65755459 6554039.35762193)
1,1,Point (906932.34642625 6550537.14396784),906932.346426,6550537.0,1,5,PPH,PPH,PPH,PPH,PPH,18,18,18,18,18,aaaaa,aaaaa,7b,Dfb,ff12,Point (906980.7207283 6550304.97315954),Point (906736.32315476 6550335.52813463),Point (906595.72670333 6550598.70621165),Point (907032.95563938 6550121.6369531)


## Эвристики для предсказаний данных без модели ##

### Эвристика по закономерностям последовательностей культур на основе анализа *данных* ###

In [7]:
def predict_next_culture_4_years_list(l0, l1, l2, l3):
  """Predict next year's crop from the crops of the four previous years.

  Rules, checked in order:
    * A A A A -> A  (the same crop every year is expected to continue);
    * A B A B -> A  (a two-year alternation is expected to continue);
    * anything else, including the A A B B pattern -> 'Unknown'.

  Args:
    l0, l1, l2, l3: crop codes of four consecutive years, oldest first.

  Returns:
    The predicted crop code, or the string 'Unknown' when no rule applies.
  """
  if l0 == l1 == l2 == l3:
    return l0
  if l0 == l2 and l1 == l3:
    return l0
  # The A A B B pattern used to have a `return l0` branch at the end, but an
  # identical condition at the top of the function returned 'Unknown' first,
  # so that branch could never run. Both were removed; the result for
  # A A B B is still 'Unknown'.
  return 'Unknown'

In [7]:
print(predict_next_culture_4_years_list(data.iloc[0]['CODE_CULTU_2015'],data.iloc[0]['CODE_CULTU_2016'],data.iloc[0]['CODE_CULTU_2017'],data.iloc[0]['CODE_CULTU_2018']))

PPH


### Эвристика по уникальным последовательностям ###

In [8]:
def get_unique_chains(data):
  """Build a lookup from 2015-2018 crop sequences to the 2019 crop.

  Only sequences for which `data` holds exactly one non-null
  CODE_CULTU_2019 value are kept. Note that the selection is by the number
  of occurrences (count == 1), not by whether several rows with the same
  sequence agree on the 2019 crop.

  Args:
    data: DataFrame with the columns CODE_CULTU_2015 ... CODE_CULTU_2019.

  Returns:
    DataFrame with the four CODE_CULTU_2015..2018 columns plus
    CODE_CULTU_2019_predict (the 2019 crop observed for that sequence).
  """
  key_cols = ['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']
  # Number of non-null 2019 values for every distinct 2015-2018 sequence.
  chain_counts = data.groupby(key_cols)['CODE_CULTU_2019'].count().reset_index()
  single_chains = chain_counts[chain_counts['CODE_CULTU_2019'] == 1]
  # Both frames have a CODE_CULTU_2019 column, so after the merge the count
  # is in 'CODE_CULTU_2019_x' and the observed crop in 'CODE_CULTU_2019_y'.
  g_pred = pd.merge(single_chains, data, how='left', on=key_cols)
  g_pred = g_pred.reindex(columns=key_cols + ['CODE_CULTU_2019_y'])
  return g_pred.rename(columns={'CODE_CULTU_2019_y': 'CODE_CULTU_2019_predict'})

In [29]:
def get_unique_chains_test(data):
  """Return every distinct 2015-2018 crop sequence found in `data`.

  Despite the name, no filtering by uniqueness is done: the result has one
  row per distinct sequence, and each remaining column holds the number of
  non-null values of that column within the group.

  Args:
    data: DataFrame with the columns CODE_CULTU_2015 ... CODE_CULTU_2018.

  Returns:
    DataFrame with the four key columns followed by the per-group counts.
  """
  key_cols = ['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']
  # The former add_suffix('') call appended an empty suffix to the column
  # names, i.e. changed nothing, so it was dropped.
  return data.groupby(key_cols).count().reset_index()

In [9]:
print(get_unique_chains(data))

      CODE_CULTU_2015 CODE_CULTU_2016  ... CODE_CULTU_2018 CODE_CULTU_2019_predict
0                 ACA             FLA  ...             FLA                     FLA
1                 ACA             FLP  ...             FLP                     FLP
2                 ACA             MIS  ...             SNE                     SNE
3                 ACA             SNE  ...             SNE                     SNE
4                 AGR             BOR  ...             SNE                     SNE
...               ...             ...  ...             ...                     ...
82546             VRT             VRT  ...             SOG                     BTH
82547             XFE             PTR  ...             BTH                     PTR
82548             XFE             PTR  ...             PPH                     PRL
82549             XFE             XFE  ...             PRL                     CZH
82550             XFE             XFE  ...             XFE                     MIE

[82

### Предсказания по эвристикам на обучающей выборке ###

In [10]:
g_pred = get_unique_chains(data)

In [11]:
# Attach the lookup prediction to every row; rows whose 2015-2018 sequence is
# not in g_pred get NaN in CODE_CULTU_2019_predict.
chain_cols = ['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']
data_pred = data.merge(g_pred, how='left', on=chain_cols)

In [12]:
print(data_pred['CODE_CULTU_2019_predict'].value_counts().sum())
print(data_pred['CODE_CULTU_2019_predict'].value_counts())

82551
BTH    11247
PTR     8134
LUZ     4978
MLG     4003
MIS     3902
       ...  
SFI        1
PIS        1
MRG        1
LFP        1
OEH        1
Name: CODE_CULTU_2019_predict, Length: 218, dtype: int64


In [13]:
#Сохраняем первую часть, предсказанную по эвристике по уникальным последовательностям
data_part1 = data_pred[data_pred.CODE_CULTU_2019_predict.notnull()].copy()

In [14]:
data_part1.head(2)

Unnamed: 0.1,Unnamed: 0,centroid,x,y,RotationLength,RotationCount,CODE_CULTU_2015,CODE_CULTU_2016,CODE_CULTU_2017,CODE_CULTU_2018,CODE_CULTU_2019,CODE_GROUP_2015,CODE_GROUP_2016,CODE_GROUP_2017,CODE_GROUP_2018,CODE_GROUP_2019,cultuCode,groupCode,hzs,kg_id,ff_id,1 Nearest Neigbour Point,2 Nearest Neigbour Point,3 Nearest Neigbour Point,4 Nearest Neigbour Point,CODE_CULTU_2019_predict
50,50,Point (859900.11098517 6591026.78278985),859900.110985,6591027.0,3,1,BTH,MID,MIS,MIS,BTH,1,2,2,2,1,abcca,abbba,8b,Cfa,ff13,Point (859812.49760217 6591185.50103266),Point (859719.0802948 6590975.56193138),Point (860081.26015362 6591098.77150968),Point (860019.18559373 6590736.93947271),BTH
74,74,Point (848022.43324874 6589869.38021483),848022.433249,6589869.0,4,1,AVP,SGH,MIS,MIS,ORP,4,4,2,2,3,abccd,aabbc,8b,Cfa,ff13,Point (848060.93345233 6589936.96049742),Point (847937.44700784 6589858.58590304),Point (848069.49079389 6590019.03797348),Point (848120.55312173 6589719.68664306),ORP


In [15]:
# Second part: rows without a lookup prediction; they are passed on to the
# pattern-based heuristic.
data_part2 = data_pred.loc[data_pred['CODE_CULTU_2019_predict'].isnull()].copy()

In [16]:
data_part2.head(2)

Unnamed: 0.1,Unnamed: 0,centroid,x,y,RotationLength,RotationCount,CODE_CULTU_2015,CODE_CULTU_2016,CODE_CULTU_2017,CODE_CULTU_2018,CODE_CULTU_2019,CODE_GROUP_2015,CODE_GROUP_2016,CODE_GROUP_2017,CODE_GROUP_2018,CODE_GROUP_2019,cultuCode,groupCode,hzs,kg_id,ff_id,1 Nearest Neigbour Point,2 Nearest Neigbour Point,3 Nearest Neigbour Point,4 Nearest Neigbour Point,CODE_CULTU_2019_predict
0,0,Point (907753.82834152 6554634.46488151),907753.828342,6554634.0,1,5,PPH,PPH,PPH,PPH,PPH,18,18,18,18,18,aaaaa,aaaaa,7b,Dfb,ff12,Point (907748.69100314 6554212.06490037),Point (907410.62452802 6554909.79074054),Point (908089.34400838 6554953.05455597),Point (907563.65755459 6554039.35762193),
1,1,Point (906932.34642625 6550537.14396784),906932.346426,6550537.0,1,5,PPH,PPH,PPH,PPH,PPH,18,18,18,18,18,aaaaa,aaaaa,7b,Dfb,ff12,Point (906980.7207283 6550304.97315954),Point (906736.32315476 6550335.52813463),Point (906595.72670333 6550598.70621165),Point (907032.95563938 6550121.6369531),


In [17]:
# Apply the pattern-based heuristic to every remaining row.
# Iterating over the zipped columns calls the same function with the same four
# values as a row-wise DataFrame.apply, but without building a Series for each row.
data_part2['CODE_CULTU_2019_predict'] = [
    predict_next_culture_4_years_list(c15, c16, c17, c18)
    for c15, c16, c17, c18 in zip(data_part2['CODE_CULTU_2015'],
                                  data_part2['CODE_CULTU_2016'],
                                  data_part2['CODE_CULTU_2017'],
                                  data_part2['CODE_CULTU_2018'])
]

In [18]:
print(data_part2[data_part2['CODE_CULTU_2019_predict'] != 'Unknown']['CODE_CULTU_2019_predict'].value_counts().sum())
print(data_part2[data_part2['CODE_CULTU_2019_predict'] != 'Unknown']['CODE_CULTU_2019_predict'].value_counts())

2078679
PPH    1067851
VRC     181807
J6S     100032
SNE      85849
PRL      79558
        ...   
LDP          2
CHR          2
PAN          2
EST          2
BAS          2
Name: CODE_CULTU_2019_predict, Length: 159, dtype: int64


In [28]:
#Формирование выборки для оценки точности
data_eval = data_part2[data_part2['CODE_CULTU_2019_predict'] != 'Unknown']

In [29]:
print(accuracy_score(data_eval['CODE_CULTU_2019'],data_eval['CODE_CULTU_2019_predict']))

0.9479198086861896


In [19]:
# Rows the heuristics could not predict ('Unknown'); these are handed to the CatBoost model.
# .copy() makes this an independent frame, so adding the model's predictions to it
# later does not trigger pandas' SettingWithCopyWarning.
data_part3 = data_part2[data_part2['CODE_CULTU_2019_predict'] == 'Unknown'].copy()

In [20]:
#Формирование набора данных с предсказанными по эвристикам значениями
data_part2 = data_part2[data_part2['CODE_CULTU_2019_predict'] != 'Unknown']

## Моделирование ##

### Train/Val CatBoost ###

In [34]:
# Features and target for the model.
# Columns available in the dataset:
#   'RotationLength', 'RotationCount',
#   'CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017',
#   'CODE_CULTU_2018', 'CODE_CULTU_2019', 'CODE_GROUP_2015',
#   'CODE_GROUP_2016', 'CODE_GROUP_2017', 'CODE_GROUP_2018',
#   'CODE_GROUP_2019', 'cultuCode', 'groupCode', 'hzs', 'kg_id', 'ff_id'
#
# NOTE(review): in the data previews of this notebook the training rows have
# five-character cultuCode/groupCode values (e.g. 'abcca' for BTH, MID, MIS, MIS, BTH)
# while the test file has four-character ones, so these columns appear to encode the
# 2019 crop as well. Confirm they do not leak the target; RotationLength and
# RotationCount may be affected in the same way.
feature_cols = ['RotationLength', 'RotationCount',
                'CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017',
                'CODE_CULTU_2018', 'CODE_GROUP_2015',
                'CODE_GROUP_2016', 'CODE_GROUP_2017', 'CODE_GROUP_2018',
                'cultuCode', 'groupCode', 'hzs', 'kg_id', 'ff_id']

# Missing values become the literal category 'unknown'.
X = data_part3[feature_cols].fillna('unknown')

Y = data_part3['CODE_CULTU_2019']

In [35]:
X.shape

(1453574, 15)

In [40]:
X.isnull().sum(axis = 0)

RotationLength     0
RotationCount      0
CODE_CULTU_2015    0
CODE_CULTU_2016    0
CODE_CULTU_2017    0
CODE_CULTU_2018    0
CODE_GROUP_2015    0
CODE_GROUP_2016    0
CODE_GROUP_2017    0
CODE_GROUP_2018    0
cultuCode          0
groupCode          0
hzs                0
kg_id              0
ff_id              0
dtype: int64

In [36]:
#Разделение выборок на обучающую и валидационную
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.2, random_state=286)

In [37]:
X.head(2)

Unnamed: 0,RotationLength,RotationCount,CODE_CULTU_2015,CODE_CULTU_2016,CODE_CULTU_2017,CODE_CULTU_2018,CODE_GROUP_2015,CODE_GROUP_2016,CODE_GROUP_2017,CODE_GROUP_2018,cultuCode,groupCode,hzs,kg_id,ff_id
12,3,1,TTH,ORH,TTH,PTR,4,3,4,19,abacc,abacc,8b,Cfb,ff13
18,3,1,ORH,ORH,TTH,SGH,3,3,4,4,aabcb,aabbb,8b,Cfb,ff13


In [41]:
# Positional indices of the categorical features: every column of X after the two
# leading numeric ones (RotationLength, RotationCount). Derived from X so the list
# stays in sync with the selected feature columns.
cat_features = list(range(2, X.shape[1]))

In [43]:
# Wrap the training and validation splits in CatBoost Pools, marking the categorical columns.
train_dataset = Pool(data=X_train,
                     label=Y_train,
                     cat_features=cat_features)

eval_dataset = Pool(data=X_val,
                    label=Y_val,
                    cat_features=cat_features)

# No learning_rate is passed here.
# NOTE(review): the captured log of this cell reports "Learning rate set to 0.5" and a
# `learn` value that rises from about 14 to several thousand, so training looks
# unstable; consider an explicit, smaller learning_rate. TODO confirm.
model = CatBoostClassifier(iterations=100,
                           depth=4,
                           loss_function='MultiClass',  task_type='GPU')

# Train the model (eval_dataset is not passed to fit; it is only used for the prediction below)
model.fit(train_dataset)  
# Predict classes for the validation split
preds_class = model.predict(eval_dataset)

Learning rate set to 0.5
0:	learn: 13.9392325	total: 6.8s	remaining: 11m 13s
1:	learn: 2140.9016226	total: 14.1s	remaining: 11m 33s
2:	learn: 5977.3909373	total: 21s	remaining: 11m 18s
3:	learn: 6921.1041820	total: 27.8s	remaining: 11m 7s
4:	learn: 7777.9509158	total: 34.6s	remaining: 10m 57s
5:	learn: 7458.7808152	total: 41.4s	remaining: 10m 49s
6:	learn: 6919.1519178	total: 48.3s	remaining: 10m 41s
7:	learn: 6588.2061505	total: 55.1s	remaining: 10m 33s
8:	learn: 5837.8916653	total: 1m 1s	remaining: 10m 25s
9:	learn: 5566.6449398	total: 1m 8s	remaining: 10m 18s
10:	learn: 5262.7257475	total: 1m 15s	remaining: 10m 11s
11:	learn: 4851.5522243	total: 1m 22s	remaining: 10m 4s
12:	learn: 4746.1885164	total: 1m 29s	remaining: 9m 57s
13:	learn: 4646.1735017	total: 1m 36s	remaining: 9m 49s
14:	learn: 4494.6247000	total: 1m 42s	remaining: 9m 42s
15:	learn: 4378.0731181	total: 1m 49s	remaining: 9m 35s
16:	learn: 4098.0297560	total: 1m 56s	remaining: 9m 28s
17:	learn: 3951.0203868	total: 2m 3s	r

In [44]:
print(accuracy_score(Y_val,preds_class))

0.6135080749187348


In [45]:
# Predict over the whole model subset (X, Y), without the train/validation split.
# X includes the rows the model was trained on (X_train was split off from X), so an
# accuracy computed from these predictions is not an out-of-sample estimate.
# The name eval_dataset is rebound here and no longer refers to the validation Pool.
eval_dataset = Pool(data=X,
                    label=Y,
                    cat_features=cat_features)

preds_class = model.predict(eval_dataset)

In [46]:
#Оценка точности
print(accuracy_score(Y,preds_class))

0.6139625502382403


In [47]:
# Attach the model's predictions. assign() returns a new frame instead of writing into
# a filtered selection (which raised SettingWithCopyWarning); np.ravel flattens the
# prediction array to one value per row.
data_part3 = data_part3.assign(CODE_CULTU_2019_predict=np.ravel(preds_class))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [49]:
final_data = pd.concat([data_part1,data_part2,data_part3])

In [50]:
print(accuracy_score(final_data['CODE_CULTU_2019'],final_data['CODE_CULTU_2019_predict']))

0.8148192820412946


Корректировка явных ошибок CatBoost: если модель предсказала культуру, которая не встречалась на данном участке ни в одном из 2015–2018 годов, заменяем предсказание на культуру последнего известного года (2018).

In [52]:
# Model predictions naming a crop that the field did not grow in any of 2015-2018.
# .copy() makes the selection an independent frame, so overwriting its prediction
# column afterwards does not trigger SettingWithCopyWarning.
data_part3_1 = data_part3.loc[
    (data_part3['CODE_CULTU_2019_predict'] != data_part3['CODE_CULTU_2015'])
    & (data_part3['CODE_CULTU_2019_predict'] != data_part3['CODE_CULTU_2016'])
    & (data_part3['CODE_CULTU_2019_predict'] != data_part3['CODE_CULTU_2017'])
    & (data_part3['CODE_CULTU_2019_predict'] != data_part3['CODE_CULTU_2018'])
].copy()

In [53]:
# Model predictions that match the crop of at least one of the years 2015-2018.
data_part3_2 = data_part3.loc[
    data_part3[['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']]
    .eq(data_part3['CODE_CULTU_2019_predict'], axis=0)
    .any(axis=1)
]

In [54]:
data_part3_1['CODE_CULTU_2019_predict'] = data_part3_1['CODE_CULTU_2018']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [55]:
#Объединение данных
final_data = pd.concat([data_part1,data_part2,data_part3_1,data_part3_2])

In [56]:
#Оценка точности
print(accuracy_score(final_data['CODE_CULTU_2019'],final_data['CODE_CULTU_2019_predict']))

0.8190662619605378


### Full CatBoost ###

In [21]:
# Feature set of the final model: the Train/Val feature list without cultuCode and groupCode.
full_feature_cols = ['RotationLength', 'RotationCount',
                     'CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017',
                     'CODE_CULTU_2018', 'CODE_GROUP_2015',
                     'CODE_GROUP_2016', 'CODE_GROUP_2017', 'CODE_GROUP_2018',
                     'hzs', 'kg_id', 'ff_id']

# Missing values become the literal category 'unknown'.
X_full = data_part3[full_feature_cols].fillna('unknown')

Y_full = data_part3['CODE_CULTU_2019']

In [22]:
# Every column of X_full after the two leading numeric ones (RotationLength,
# RotationCount) is categorical; derived from X_full so it follows the column list.
cat_features_full = list(range(2, X_full.shape[1]))

In [None]:
# The final model is fitted on all rows of X_full, and the predictions below are made
# on those same rows, so a single Pool is enough.
train_dataset_full = Pool(data=X_full,
                          label=Y_full,
                          cat_features=cat_features_full)

# Kept as a name for compatibility: previously a second, identical Pool was built
# from the same X_full / Y_full, duplicating the data in memory.
eval_dataset_full = train_dataset_full

model_full = CatBoostClassifier(iterations=200,
                           #learning_rate=0.25,
                           depth=6,
                           loss_function='MultiClass',  task_type='GPU')

# Train the model
model_full.fit(train_dataset_full)
# In-sample predictions (the same rows the model was fitted on)
preds_class_full = model_full.predict(eval_dataset_full)

In [25]:
print(accuracy_score(Y_full,preds_class_full))

0.572658839522446


### Full CatBoost GridSearch ###

In [76]:
def crossvaltest(params,train_set,train_label,cat_dims,n_splits=3,random_state=0):
    """Estimate the accuracy of a CatBoost parameter set with K-fold cross-validation.

    Args:
        params: keyword arguments passed to cb.CatBoostClassifier.
        train_set: DataFrame of features (rows are selected with .iloc).
        train_label: labels, positionally aligned with train_set.
        cat_dims: categorical feature indices passed to fit(cat_features=...).
        n_splits: number of folds.
        random_state: seed of the fold shuffle, so that every parameter set is
            scored on the same folds and runs are repeatable.

    Returns:
        Mean over the folds of the fraction of correctly predicted labels.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    fold_scores = []
    for train_index, test_index in kf.split(train_set):
        train = train_set.iloc[train_index, :]
        test = train_set.iloc[test_index, :]

        labels = train_label.iloc[train_index]
        test_labels = train_label.iloc[test_index]

        clf = cb.CatBoostClassifier(**params)
        clf.fit(train, np.ravel(labels), cat_features=cat_dims)

        # Flatten the predictions before comparing: if predict() returns a column
        # vector of shape (n, 1), comparing it with the flat label vector would
        # broadcast to an n x n matrix and the mean would no longer be an accuracy.
        predicted = np.ravel(clf.predict(test))
        fold_scores.append(np.mean(predicted == np.ravel(test_labels)))
    return np.mean(fold_scores)

In [92]:
def catboost_param_tune(params,train_set,train_label,cat_dims=None,n_splits=3):
    """Search CatBoost hyper-parameters with cross-validation and return the best set.

    The parameter groups are searched one after another: 'l2_leaf_reg', then
    'iterations' together with 'learning_rate', then 'depth'. Every candidate is
    scored with crossvaltest and registered with the paramsearch helper.

    Args:
        params: dict of parameter grids handed to paramsearch.
        train_set: DataFrame of features.
        train_label: labels aligned with train_set.
        cat_dims: categorical feature indices, forwarded to crossvaltest.
        n_splits: number of cross-validation folds.

    Returns:
        The value of ps.bestparam() after the search.
    """
    # The module-level import of paramsearch is commented out in this notebook, so
    # import it here; paramsearch.py has to be importable from the project.
    from paramsearch import paramsearch

    ps = paramsearch(params)
    # search 'border_count', 'l2_leaf_reg' etc. individually
    #   but 'iterations','learning_rate' together
    for prms in chain(#ps.grid_search(['border_count']),
                      #ps.grid_search(['ctr_border_count']),
                      ps.grid_search(['l2_leaf_reg']),
                      ps.grid_search(['iterations','learning_rate']),
                      ps.grid_search(['depth'])):
        res = crossvaltest(prms,train_set,train_label,cat_dims,n_splits)
        # save the crossvalidation result so that future iterations can reuse the best parameters
        ps.register_result(res,prms)
        print(res,prms,'best:',ps.bestscore(),ps.bestparam())
    return ps.bestparam()

In [91]:
"""
params = {'depth':[1,2,3,4,5,6,7],
          'iterations':[100,200,500,1000],
          'learning_rate':[0.001,0.01,0.1,0.2,0.3],
          'l2_leaf_reg':[1,3,5,10,100],
          'border_count':[5,10,20,50,100,200],
          'thread_count':4}
"""

"""
params = {'depth':[1,2,3,4,5,6,7],
          'iterations':[100,200,500,1000],
          'learning_rate':[0.001,0.01,0.1,0.2,0.3],
          'l2_leaf_reg':[1,3,5,10,100],
          'border_count':[5,10,20,50,100,200],
          'loss_function':['MultiClass'],
          'task_type':['GPU']}
"""

params = {'depth':[4,5,6,7,8,9,10],
          'iterations':[100,200,500,1000],
          'learning_rate':[0.001,0.01,0.1,0.2,0.3],
          'l2_leaf_reg':[1,3,5,10,100],
          'loss_function':['MultiClass'],
          'task_type':['GPU']}

In [None]:
bestparams = catboost_param_tune(params,X_full,Y_full,cat_features_full)
print(bestparams)

0:	learn: 5.2785685	total: 5.36s	remaining: 8m 50s
1:	learn: 5.2437297	total: 10.7s	remaining: 8m 43s
2:	learn: 5.2151785	total: 16s	remaining: 8m 38s
3:	learn: 5.1913918	total: 21.3s	remaining: 8m 32s
4:	learn: 5.1497602	total: 26.7s	remaining: 8m 26s
5:	learn: 5.1206069	total: 32s	remaining: 8m 21s
6:	learn: 5.0966148	total: 37.3s	remaining: 8m 15s
7:	learn: 5.0752588	total: 42.7s	remaining: 8m 10s
8:	learn: 5.0624323	total: 48s	remaining: 8m 5s
9:	learn: 5.0499959	total: 53.3s	remaining: 7m 59s
10:	learn: 5.0378608	total: 58.6s	remaining: 7m 54s
11:	learn: 4.9999613	total: 1m 3s	remaining: 7m 49s
12:	learn: 4.9647035	total: 1m 9s	remaining: 7m 43s
13:	learn: 4.9316227	total: 1m 14s	remaining: 7m 38s
14:	learn: 4.9004653	total: 1m 19s	remaining: 7m 33s
15:	learn: 4.8709487	total: 1m 25s	remaining: 7m 27s
16:	learn: 4.8429027	total: 1m 30s	remaining: 7m 22s
17:	learn: 4.8161693	total: 1m 35s	remaining: 7m 17s
18:	learn: 4.7906159	total: 1m 41s	remaining: 7m 11s
19:	learn: 4.7661377	to

## Предсказания по тестовой выборке ##

In [26]:
test2015_2018 = pd.read_csv('/content/drive/MyDrive/Хакатон/testdataset2015-2018.csv')

In [27]:
test2015_2018.head(2)

Unnamed: 0,CODE_CULTU_2018,CODE_GROUP_2018,centroid,CODE_CULTU_2017,CODE_GROUP_2017,CODE_CULTU_2016,CODE_GROUP_2016,CODE_CULTU_2015,CODE_GROUP_2015,cultuCode,groupCode,RotationLength,RotationCount,hzs,kg_id,ff_id
0,J6S,11,Point (837113.21815327 6542642.03294466),J6S,11,J6S,11,J6S,11,aaaa,aaaa,1,4,8b,Cfb,ff13
1,PTR,19,Point (859951.6897682 6548701.23025595),PTR,19,PTR,19,BTH,1,aaab,aaab,2,1,8b,Cfb,ff13


In [30]:
g_test_pred = get_unique_chains_test(test2015_2018)

In [31]:
#Получает предсказания для тестовой выборки из уникальных последовательностей обучающей выборки
g_test_pred = pd.merge(g_test_pred, g_pred,  how='left', left_on=['CODE_CULTU_2015','CODE_CULTU_2016','CODE_CULTU_2017','CODE_CULTU_2018'], right_on = ['CODE_CULTU_2015','CODE_CULTU_2016','CODE_CULTU_2017','CODE_CULTU_2018'])


In [33]:
g_test_pred = g_test_pred.reindex(columns=['CODE_CULTU_2015','CODE_CULTU_2016','CODE_CULTU_2017','CODE_CULTU_2018','CODE_CULTU_2019_predict'])

In [34]:
print(g_test_pred['CODE_CULTU_2019_predict'].value_counts().sum())
print(g_test_pred['CODE_CULTU_2019_predict'].value_counts())

4254
BTH    683
PTR    444
LUZ    250
MLG    219
MIS    215
      ... 
RUT      1
AGR      1
CHT      1
LEF      1
PFP      1
Name: CODE_CULTU_2019_predict, Length: 144, dtype: int64


In [35]:
# Attach the lookup prediction to every test row; rows whose 2015-2018 sequence has
# no prediction in g_test_pred get NaN in CODE_CULTU_2019_predict.
test_chain_cols = ['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']
test_pred = test2015_2018.merge(g_test_pred, how='left', on=test_chain_cols)

In [37]:
print(test_pred['CODE_CULTU_2019_predict'].value_counts().sum())
print(test_pred['CODE_CULTU_2019_predict'].value_counts())

4554
BTH    735
PTR    462
LUZ    263
MLG    244
ORH    225
      ... 
CSS      1
LDP      1
CCT      1
PAT      1
LEF      1
Name: CODE_CULTU_2019_predict, Length: 144, dtype: int64


In [41]:
#Сохраняем первую часть, предсказанную по эвристике по уникальным последовательностям
test_part1 = test_pred[test_pred.CODE_CULTU_2019_predict.notnull()].copy()

In [42]:
test_part1.head(2)

Unnamed: 0,CODE_CULTU_2018,CODE_GROUP_2018,centroid,CODE_CULTU_2017,CODE_GROUP_2017,CODE_CULTU_2016,CODE_GROUP_2016,CODE_CULTU_2015,CODE_GROUP_2015,cultuCode,groupCode,RotationLength,RotationCount,hzs,kg_id,ff_id,CODE_CULTU_2019_predict
97,ORP,3,Point (924172.59491668 6566269.23410383),PTR,19,MLG,16,PTR,19,abcb,abcb,3,1,8b,Dfb,ff12,BTH
287,MIE,2,Point (910788.69124593 6541905.66972968),ORH,3,TTH,4,PPH,18,abcd,abcd,4,1,8a,Dfb,ff12,MIE


In [53]:
# Second part of the test set: rows without a lookup prediction; they are passed on
# to the pattern-based heuristic.
test_part2 = test_pred.loc[test_pred['CODE_CULTU_2019_predict'].isnull()].copy()

In [54]:
# Apply the pattern-based heuristic to every remaining test row.
# Iterating over the zipped columns calls the same function with the same four
# values as a row-wise DataFrame.apply, but without building a Series for each row.
test_part2['CODE_CULTU_2019_predict'] = [
    predict_next_culture_4_years_list(c15, c16, c17, c18)
    for c15, c16, c17, c18 in zip(test_part2['CODE_CULTU_2015'],
                                  test_part2['CODE_CULTU_2016'],
                                  test_part2['CODE_CULTU_2017'],
                                  test_part2['CODE_CULTU_2018'])
]

In [45]:
print(test_part2['CODE_CULTU_2019_predict'].value_counts().sum())
print(test_part2['CODE_CULTU_2019_predict'].value_counts())

397091
Unknown    166549
PPH        118597
VRC         19986
J6S         11119
SNE          9443
            ...  
FNO             1
CMM             1
TTP             1
PAN             1
PAG             1
Name: CODE_CULTU_2019_predict, Length: 145, dtype: int64


In [56]:
# Test rows the heuristics could not predict ('Unknown'); these go to the CatBoost model.
# reset_index() moves the previous row labels into a new 'index' column and gives the
# frame a fresh 0..n-1 index. The former add_suffix('') call appended an empty suffix
# to the column names, i.e. changed nothing, and was dropped.
test_part3 = test_part2[test_part2['CODE_CULTU_2019_predict'] == 'Unknown'].reset_index()

In [57]:
#Формирование набора данных с предсказанными по эвристикам значениями
test_part2 = test_part2[test_part2['CODE_CULTU_2019_predict'] != 'Unknown']

In [58]:
# Model features for the test set: the same columns, in the same order, as X_full.
test_feature_cols = ['RotationLength', 'RotationCount',
                     'CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017',
                     'CODE_CULTU_2018', 'CODE_GROUP_2015',
                     'CODE_GROUP_2016', 'CODE_GROUP_2017', 'CODE_GROUP_2018',
                     'hzs', 'kg_id', 'ff_id']

# Missing values become the literal category 'unknown'.
X_test = test_part3[test_feature_cols].fillna('unknown')

# Every column after the two leading numeric ones is categorical.
cat_features_test = list(range(2, X_test.shape[1]))

In [60]:
#Получение предсказаний
test_dataset = Pool(data=X_test,
                    cat_features=cat_features_test)

preds_class_test = model_full.predict(test_dataset)

In [61]:
test_part3['CODE_CULTU_2019_predict'] = preds_class_test

Корректировка явных ошибок CatBoost: если модель предсказала культуру, которая не встречалась на данном участке ни в одном из 2015–2018 годов, заменяем предсказание на культуру последнего известного года (2018).

In [62]:
# Model predictions naming a crop that the field did not grow in any of 2015-2018.
# .copy() makes the selection an independent frame, so overwriting its prediction
# column afterwards does not trigger SettingWithCopyWarning.
test_part3_1 = test_part3.loc[
    (test_part3['CODE_CULTU_2019_predict'] != test_part3['CODE_CULTU_2015'])
    & (test_part3['CODE_CULTU_2019_predict'] != test_part3['CODE_CULTU_2016'])
    & (test_part3['CODE_CULTU_2019_predict'] != test_part3['CODE_CULTU_2017'])
    & (test_part3['CODE_CULTU_2019_predict'] != test_part3['CODE_CULTU_2018'])
].copy()

In [63]:
# Model predictions that match the crop of at least one of the years 2015-2018.
test_part3_2 = test_part3.loc[
    test_part3[['CODE_CULTU_2015', 'CODE_CULTU_2016', 'CODE_CULTU_2017', 'CODE_CULTU_2018']]
    .eq(test_part3['CODE_CULTU_2019_predict'], axis=0)
    .any(axis=1)
]

In [64]:
test_part3_1['CODE_CULTU_2019_predict'] = test_part3_1['CODE_CULTU_2018']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [67]:
# Combine all parts of the test set.
# The model part is taken from test_part3_1 (predictions replaced by the 2018 crop) and
# test_part3_2 (predictions kept). Concatenating test_part3 instead would submit the
# uncorrected model output and silently discard the correction step.
final_data_test = pd.concat([test_part1, test_part2, test_part3_1, test_part3_2])
# test_part3_1 and test_part3_2 split test_part3 in two, so every row of test_pred
# must appear exactly once.
assert len(final_data_test) == len(test_pred)
final_data_test = (
    final_data_test[['centroid', 'CODE_CULTU_2019_predict']]
    .rename(columns={'CODE_CULTU_2019_predict': 'CODE_CULTU_2019'})
)

In [None]:
# Write the answers file: one row per field with its centroid and predicted 2019 crop.
answers = final_data_test[['centroid', 'CODE_CULTU_2019']]
answers.to_csv("/content/drive/MyDrive/Хакатон/predict2019.csv", index=False, header=True)