# Process data

In [1]:
import pandas as pd
import os


# Load data
plavki_train = pd.read_csv('data/raw/plavki_train.csv', index_col='NPLV')
plavki_test = pd.read_csv('data/raw/plavki_test.csv', index_col='NPLV')

chugun_train  = pd.read_csv('data/raw/chugun_train.csv', index_col='NPLV')
chugun_test  = pd.read_csv('data/raw/chugun_test.csv', index_col='NPLV')
chugun_train = chugun_train.add_prefix('chugun_')
chugun_test = chugun_test.add_prefix('chugun_')

sip_train = pd.read_csv('data/raw/sip_train.csv')
sip_test = pd.read_csv('data/raw/sip_test.csv')

lom_train = pd.read_csv('data/raw/lom_train.csv')
lom_test = pd.read_csv('data/raw/lom_test.csv')

produv_train = pd.read_csv('data/raw/produv_train.csv')
produv_test = pd.read_csv('data/raw/produv_test.csv')

target_train = pd.read_csv('data/raw/target_train.csv', index_col='NPLV')
sample_submission = pd.read_csv('data/raw/sample_submission.csv', index_col='NPLV')


# drop many plavk
mask = (plavki_train.index == 511135) & (plavki_train['plavka_ST_FURM'] != 46)
plavki_train = plavki_train[~mask]


# process lom
lom_train = lom_train.drop(columns='NML')
lom_test = lom_test.drop(columns='NML')

lom_train = pd.pivot(lom_train, index='NPLV', columns='VDL', values='VES').fillna(0).astype(int).add_prefix('lom_VES_VLD')
lom_test = pd.pivot(lom_test, index='NPLV', columns='VDL', values='VES').fillna(0).astype(int).add_prefix('lom_VES_VLD')

lom_train['lom_components'] = lom_train.apply(lambda x: (x > 0).sum(), axis=1)
lom_test['lom_components'] = lom_test.apply(lambda x: (x > 0).sum(), axis=1)

lom_train['lom_VES_sum'] = lom_train[[x for x in lom_train.columns if x.startswith('lom_VES_VLD')]].apply(lambda x: x.sum(), axis=1)

lom_test['lom_VES_sum'] = lom_test[[x for x in lom_test.columns if x.startswith('lom_VES_VLD')]].apply(lambda x: x.sum(), axis=1)

## remove 1 column
lom_train = lom_train[lom_test.columns]


# process sip
sip_train = sip_train.drop(columns=['NMSYP', 'DAT_OTD'])
sip_test = sip_test.drop(columns=['NMSYP', 'DAT_OTD'])
sip_train = sip_train.groupby(['NPLV', 'VDSYP'])['VSSYP'].sum().reset_index()
sip_test = sip_test.groupby(['NPLV', 'VDSYP'])['VSSYP'].sum().reset_index()
sip_train = pd.pivot(sip_train, index='NPLV', columns='VDSYP', values='VSSYP').fillna(0).astype(int).add_prefix('sip_VES_VDSYP')
sip_test = pd.pivot(sip_test, index='NPLV', columns='VDSYP', values='VSSYP').fillna(0).astype(int).add_prefix('sip_VES_VDSYP')

## Remove columns
sip_test = sip_test[sip_train.columns]

sip_train['sip_components'] = sip_train.apply(lambda x: (x > 0).sum(), axis=1)
sip_test['sip_components'] = sip_test.apply(lambda x: (x > 0).sum(), axis=1)


# process produv
min_pol_train = produv_train.groupby('NPLV')['POL'].min().to_frame().add_prefix('produv_min_')
mean_pol_train = produv_train.groupby('NPLV')['POL'].mean().to_frame().add_prefix('produv_mean_')
max_pol_train = produv_train.groupby('NPLV')['POL'].max().to_frame().add_prefix('produv_max_')

min_pol_test = produv_test.groupby('NPLV')['POL'].min().to_frame().add_prefix('produv_min_')
mean_pol_test = produv_test.groupby('NPLV')['POL'].mean().to_frame().add_prefix('produv_mean_')
max_pol_test = produv_test.groupby('NPLV')['POL'].max().to_frame().add_prefix('produv_max_')

ras_train = produv_train.groupby('NPLV')['RAS'].sum().to_frame().add_prefix('produv_sum_')
ras_test = produv_test.groupby('NPLV')['RAS'].sum().to_frame().add_prefix('produv_sum_')

ras_train_mean = produv_train.groupby('NPLV')['RAS'].mean().to_frame().add_prefix('produv_mean_')
ras_test_mean = produv_test.groupby('NPLV')['RAS'].mean().to_frame().add_prefix('produv_mean_')

produv_train_groupped = pd.concat([max_pol_train, mean_pol_train, min_pol_train, ras_train_mean, ras_train], axis=1)
produv_test_groupped = pd.concat([max_pol_test, mean_pol_test, min_pol_test, ras_test_mean, ras_test], axis=1)


# Load data 2
chronom_train = pd.read_csv('data/raw/chronom_train.csv', index_col=0, parse_dates=['VR_NACH', 'VR_KON'])
chronom_test = pd.read_csv('data/raw/chronom_test.csv', index_col=0, parse_dates=['VR_NACH', 'VR_KON'])
gas_train = pd.read_csv('data/raw/gas_train.csv')
gas_test = pd.read_csv('data/raw/gas_test.csv')

# process train
chronom_povalka_start_train = chronom_train[chronom_train['NOP']=='Продувка'].groupby('NPLV')['VR_KON'].max()
chronom_povalka_start_train.columns = ['chronom_povalka_start']
chronom_produvka_start_train = chronom_train[chronom_train['NOP']=='Продувка'].groupby('NPLV')['VR_NACH'].min()
chronom_produvka_start_train.columns = ['chronom_produvka_start']

chronom__df_train = pd.merge(chronom_povalka_start_train,chronom_produvka_start_train, on='NPLV')
chronom__df_train.columns = ['chronom_povalka_start', 'chronom_produvka_start']

gas_train['Time'] = pd.to_datetime(gas_train['Time'])
gas_train = gas_train.set_index(gas_train['Time'])

data = []
for n in chronom__df_train.index:
    idx = gas_train[gas_train['NPLV']==n].index.get_loc(chronom__df_train.loc[n]['chronom_povalka_start'], method='nearest')
    ___df = gas_train[gas_train['NPLV']==n].iloc[idx]
    data.append([___df['NPLV'],___df['T']])
    
temp_gaz_train = pd.DataFrame(data, columns=['NPLV','gas_T_start_povalka'])


# process test
chronom_povalka_start_test = chronom_test[chronom_test['NOP']=='Продувка'].groupby('NPLV')['VR_KON'].max()
chronom_povalka_start_test.columns = ['chronom_povalka_start']
chronom_produvka_start_test = chronom_test[chronom_test['NOP']=='Продувка'].groupby('NPLV')['VR_NACH'].min()
chronom_produvka_start_test.columns = ['chronom_produvka_start']

chronom__df_test = pd.merge(chronom_povalka_start_test,chronom_produvka_start_test, on='NPLV')
chronom__df_test.columns = ['chronom_povalka_start', 'chronom_produvka_start']

gas_test['Time'] = pd.to_datetime(gas_test['Time'])
gas_test = gas_test.set_index(gas_test['Time'])

data = []
for n in chronom__df_test.index:
    idx = gas_test[gas_test['NPLV']==n].index.get_loc(chronom__df_test.loc[n]['chronom_povalka_start'], method='nearest')
    ___df = gas_test[gas_test['NPLV']==n].iloc[idx]
    data.append([___df['NPLV'],___df['T']])
    
temp_gaz_test = pd.DataFrame(data, columns=['NPLV','gas_T_start_povalka'])


## process chronom
chronom_train = pd.read_csv('data/raw/chronom_train.csv', index_col=0, parse_dates=['VR_NACH', 'VR_KON'])
chronom_test = pd.read_csv('data/raw/chronom_test.csv', index_col=0, parse_dates=['VR_NACH', 'VR_KON'])

chronom_train['operation_time'] = (chronom_train['VR_KON'] - chronom_train['VR_NACH']).dt.total_seconds()
chronom_test['operation_time'] = (chronom_test['VR_KON'] - chronom_test['VR_NACH']).dt.total_seconds()

chronom_train = chronom_train.groupby(['NPLV', 'NOP'])['operation_time'].mean().reset_index()
chronom_test = chronom_test.groupby(['NPLV', 'NOP'])['operation_time'].mean().reset_index()

chronom_train = pd.pivot(chronom_train, index='NPLV', columns='NOP', values='operation_time').fillna(0).astype(int)
chronom_test = pd.pivot(chronom_test, index='NPLV', columns='NOP', values='operation_time').fillna(0).astype(int)

common_columns = [x for x in chronom_test.columns if x in chronom_train.columns]

chronom_train = chronom_train[common_columns]
chronom_test = chronom_test[common_columns]

chronom_train = chronom_train.add_prefix('chronom_time_')
chronom_test = chronom_test.add_prefix('chronom_time_')


# process gas

gas_train = pd.read_csv('data/raw/gas_train.csv')
gas_test = pd.read_csv('data/raw/gas_test.csv')

gas_train = gas_train.drop(columns='Time')
gas_test = gas_test.drop(columns='Time')

_gas_train = gas_train.groupby(['NPLV']).agg(
    {
        'V':['mean', 'sum'],
        'T':['mean','sum','max'],
        'O2':['mean', 'sum'],
        'N2':['mean', 'sum'],
        'H2':['mean', 'sum'],
        'CO2':['mean', 'sum'],
        'CO':['mean', 'sum'],
        'AR':['mean', 'sum'],
        'T фурмы 1':['mean', 'sum'],
        'T фурмы 2':['mean', 'sum'],
        'O2_pressure':['mean', 'sum'],
    })
_gas_train.columns = ["_".join(x) for x in _gas_train.columns.ravel()]

_gas_test = gas_test.groupby(['NPLV']).agg(
    {
        'V':['mean', 'sum'],
        'T':['mean','sum','max'],
        'O2':['mean', 'sum'],
        'N2':['mean', 'sum'],
        'H2':['mean', 'sum'],
        'CO2':['mean', 'sum'],
        'CO':['mean', 'sum'],
        'AR':['mean', 'sum'],
        'T фурмы 1':['mean', 'sum'],
        'T фурмы 2':['mean', 'sum'],
        'O2_pressure':['mean', 'sum'],
    })
_gas_test.columns = ["_".join(x) for x in _gas_test.columns.ravel()]


gas_train = _gas_train.add_prefix('gas_')
gas_test = _gas_test.add_prefix('gas_')



# Merge
train = pd.merge(target_train, chugun_train, left_index=True, right_index=True, how='outer')
train = pd.merge(train, plavki_train, left_index=True, right_index=True, how='outer')
train = pd.merge(train, lom_train, left_index=True, right_index=True, how='outer')
train = pd.merge(train, sip_train, left_index=True, right_index=True, how='outer')
train = pd.merge(train, produv_train_groupped, left_index=True, right_index=True, how='outer')
train = pd.merge(train, gas_train, left_index=True, right_index=True, how='outer')
train = pd.merge(train, chronom_train, left_index=True, right_index=True, how='outer')

train = pd.merge(train, temp_gaz_train.set_index('NPLV'), left_index=True, right_index=True, how='outer')

test = pd.merge(sample_submission, chugun_test, left_index=True, right_index=True, how='outer')
test = pd.merge(test, plavki_test, left_index=True, right_index=True, how='outer')
test = pd.merge(test, lom_test, left_index=True, right_index=True, how='outer')
test = pd.merge(test, sip_test, left_index=True, right_index=True, how='outer')
test = pd.merge(test, produv_test_groupped, left_index=True, right_index=True, how='outer')
test = pd.merge(test, gas_test, left_index=True, right_index=True, how='outer')
test = pd.merge(test, chronom_test, left_index=True, right_index=True, how='outer')

test = pd.merge(test, temp_gaz_test.set_index('NPLV'), left_index=True, right_index=True, how='outer')

print("Test Nas:", test.isna().sum().sum())
print("Test Shape:", test.shape)


# Feature generation
train['chugun_DATA_ZAMERA'] = pd.to_datetime(train['chugun_DATA_ZAMERA'])
train['plavka_VR_NACH'] = pd.to_datetime(train['plavka_VR_NACH'])
train['plavka_VR_KON'] = pd.to_datetime(train['plavka_VR_KON'])

test['chugun_DATA_ZAMERA'] = pd.to_datetime(test['chugun_DATA_ZAMERA'])
test['plavka_VR_NACH'] = pd.to_datetime(test['plavka_VR_NACH'])
test['plavka_VR_KON'] = pd.to_datetime(test['plavka_VR_KON'])

train['timer'] = (train['chugun_DATA_ZAMERA'] - train['plavka_VR_NACH']).dt.total_seconds()
test['timer'] = (test['chugun_DATA_ZAMERA'] - test['plavka_VR_NACH']).dt.total_seconds()

train['VES/O2'] = train['chugun_VES'] / train['gas_O2_sum']
test['VES/O2'] = test['chugun_VES'] / test['gas_O2_sum']


train = train.dropna()

train.to_csv('data/processed/train_final.csv')
test.to_csv('data/processed/test_final.csv')

print("Train Nas:", train.isna().sum().sum())
print("Train Shape:", train.shape)

  _gas_train.columns = ["_".join(x) for x in _gas_train.columns.ravel()]


Test Nas: 0
Test Shape: (780, 98)
Train Nas: 0
Train Shape: (2061, 100)


  _gas_test.columns = ["_".join(x) for x in _gas_test.columns.ravel()]


# Библиотеки

In [2]:
import sys
import random
import numpy as np
import pandas as pd
import warnings
from pandas.api.types import is_numeric_dtype
from catboost import CatBoostRegressor, Pool
from IPython.display import display, Markdown
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

warnings.filterwarnings('ignore')

In [3]:
# Fix seeds

# Set a seed value
seed_value= 12321 
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(seed_value)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(seed_value)
# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(seed_value)

# Загрузка данных

In [4]:
%%time
train = pd.read_csv('data/processed/train_final.csv', parse_dates=['chugun_DATA_ZAMERA', 'plavka_VR_NACH', 'plavka_VR_KON'])
test = pd.read_csv('data/processed/test_final.csv', parse_dates=['chugun_DATA_ZAMERA', 'plavka_VR_NACH', 'plavka_VR_KON'])

print('Train:', train.shape)
print('Test:', test.shape, '\n')

Train: (2061, 101)
Test: (780, 101) 

Wall time: 56 ms


# Local validation Split

In [5]:
train = train.sort_values(['plavka_NMZ', 'TST', 'C', 'chronom_time_Продувка', 'gas_T_start_povalka',
                           'gas_O2_mean',  'plavka_NAPR_ZAD'])

In [6]:
train_idx = [x for x in range(train.shape[0]) if x%5 != 2]
val_idx = [x for x in range(train.shape[0]) if x%5 == 2] 

train_ = train.iloc[train_idx]
val_ = train.iloc[val_idx]

# Модель 1 (TST)

In [7]:
class TemperatureModel():
    def __init__(self, params):
        self.params = params
        self.device = params['model_params']['task_type']
        self.model_params = params['model_params']
        self.folds = params['split_params']['folds']        
        self.train_params = params['train_params']
        self.gpu_params = params['gpu_params']
        self.cpu_params = params['cpu_params']
        self.features_params = params['features_params']
        self.observations_params = params['observations_params'] 
        self.models = []
        self.model_predictions = []
        self.predictions_summary = None        
        self.importance = []
        
    def fit(self, X, y):  
        # pprint.pprint(self.params)
        self.model_params = self.model_params | self.gpu_params if self.device == 'GPU' else \
            self.model_params | self.cpu_params   
        
        for fold in range(self.folds):
            print(f'\nFold: {fold + 1}')
            self.model_params['random_state'] += 1 
            
            if self.folds > 1:
                train_idx = [x for x in range(X.shape[0]) if x%self.folds != fold]
                val_idx = [x for x in range(X.shape[0]) if x%self.folds == fold]            
            
                X_train = X.iloc[train_idx]
                y_train = y.iloc[train_idx]
            
                X_val = X.iloc[val_idx]
                y_val = y.iloc[val_idx] 
            else:
                X_train = X
                y_train = y

                
            # TODO (weight function!)
            train_dataset = Pool(data=X_train, label=y_train, **self.features_params)                
           
            model = CatBoostRegressor(**self.model_params)    
            
            if self.folds > 1:
                eval_dataset = Pool(data=X_val, label=y_val, **self.features_params)  
                model.fit(train_dataset, eval_set=eval_dataset, **self.train_params)
            else:
                model.fit(train_dataset,  **self.train_params)
            
            self.models.append(model.copy())   

        self._feature_importance()
            
        
    def predict(self, X):        
        self.model_predictions = []
        for model_number in range(self.folds):
            model = self.models[model_number]
            self.model_predictions.append(model.predict(X))   
            
        self.predictions_summary = pd.DataFrame(data=np.array(self.model_predictions)).T
        
        self.predictions_summary['std'] = self.predictions_summary.std(axis=1)
        self.predictions_summary['predictions'] = (self.predictions_summary[[x for x in range(self.folds)]]).median(axis=1)
        self.predictions_summary['predictions2'] = (self.predictions_summary[[x for x in range(self.folds)]]).mean(axis=1) 
        self.predictions_summary['predictions_total'] = self.predictions_summary[['predictions', 'predictions2']].mean(axis=1)
                    
        return self.predictions_summary     
            
    def get_errors(self, X, y):   
        preds = self.predict(X)['predictions']
        errors = y.values - preds.values     
        return errors
    
    
    def interpretations():
        pass
    
    def uncertain_objects(std_treshold=1):
        pass    
   
    # New functions
    def _feature_importance(self):        
        
        for model_number in range(self.folds):
            fi_dict = list(zip(features, self.models[model_number].feature_importances_))
            importance_df = pd.DataFrame.from_records(fi_dict).set_index(0)
            importance_df.columns = ['importance']
            importance_df = importance_df.sort_values('importance', ascending=False)
            self.importance.append(importance_df) 
    
    def save_model(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.models, f)
    
    def load_model(self, path):
        with open(path, 'rb') as f:
            self.models = pickle.load(f)

In [8]:
target = 'TST'

features = ['plavka_NMZ', 'chugun_VES', 'gas_CO2_sum', 'chronom_time_Продувка',
            'sip_VES_VDSYP408', 'gas_T_start_povalka', 'VES/O2', 'gas_O2_pressure_mean']

print('Features:', len(features))
train[features].head()

Features: 8


Unnamed: 0,plavka_NMZ,chugun_VES,gas_CO2_sum,chronom_time_Продувка,sip_VES_VDSYP408,gas_T_start_povalka,VES/O2,gas_O2_pressure_mean
543,09Г2С,263600.0,34850.263867,1106,19940,836.458313,8.979575,15.390038
786,09Г2С,266000.0,28942.599034,1183,13410,704.513855,6.714977,15.369207
1136,09Г2С,266500.0,29945.305978,1274,11840,720.138855,8.073655,15.418046
1778,09Г2С,265900.0,30022.943028,1191,18980,742.708313,7.106362,15.98203
354,09Г2С,266200.0,32558.457053,1194,12250,718.75,8.879554,15.027119


In [9]:
# Model params

cat_features = ['plavka_NMZ'] 

split_params = {'folds': 8}

model_params = {'eval_metric': 'RMSE', 
                'objective': 'RMSE', 
                'iterations':1500,
                'learning_rate':0.05,               
                'depth': 3, 
                'l2_leaf_reg': 50,  
                'one_hot_max_size': 15,
                'task_type': 'CPU',
                'has_time': False,
                'random_state': 20210926                
               }

train_params = {'early_stopping_rounds': 150, 
                'silent': True,
                'plot': True               
               }

gpu_params = {'devices': '0',
              'data_partition': 'DocParallel',
              'max_ctr_complexity': 4,
              'bootstrap_type': 'MVS',
              'border_count': 128, 
              'fold_permutation_block': 0,
              'simple_ctr':'FeatureFreq',
              'combinations_ctr': 'FeatureFreq',      
              'ctr_target_border_count': 1,
              'random_strength': 0.95,
              'gpu_ram_part': 0.95
             }


cpu_params = {'thread_count': 14}

features_params = {'cat_features': cat_features}

observations_params = {"weight": 'not_implemented'}

params = {'split_params': split_params, 
          'model_params': model_params, 
          'train_params': train_params, 
          'gpu_params': gpu_params, 
          'cpu_params': cpu_params, 
          'features_params': features_params, 
          'observations_params': observations_params
         }

In [10]:
%%time
ml1_original = TemperatureModel(params)
ml1_original.fit(train_[features], train_[target])

preds1_original = ml1_original.predict(val_[features])

print('MAE:', np.round(((val_[target] - preds1_original['predictions'].values).abs()).mean(), 4))
print('Metric: predictions', np.round(((val_[target] - preds1_original['predictions'].values).abs() < 20).mean(), 4))
print('Metric: predictions2', np.round(((val_[target] - preds1_original['predictions2'].values).abs() < 20).mean(), 4))


Fold: 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 5


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 6


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 7


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 8


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

MAE: 19.1914
Metric: predictions 0.6117
Metric: predictions2 0.6068
Wall time: 25.4 s


In [11]:
ml1_original.importance[0]

Unnamed: 0_level_0,importance
0,Unnamed: 1_level_1
VES/O2,30.867514
chronom_time_Продувка,23.946289
plavka_NMZ,20.994005
gas_T_start_povalka,10.657231
gas_CO2_sum,7.009435
chugun_VES,3.84662
gas_O2_pressure_mean,1.921614
sip_VES_VDSYP408,0.757291


In [12]:
%%time
ml1 = TemperatureModel(params)
ml1.fit(train[features], train[target])

T_preds = ml1.predict(test[features])


Fold: 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 5


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 6


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 7


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 8


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Wall time: 37.7 s


# Модель 2 (C)
* __Примечание:__ распределение целевой переменной внутри модели преобразовывается к равномерному распределению с помощью QuantileTransformer, при прогнозировании производиться обратное преобразование

In [13]:
class CModel():
    def __init__(self, params):
        self.params = params
        self.device = params['model_params']['task_type']
        self.model_params = params['model_params']
        self.folds = params['split_params']['folds']        
        self.train_params = params['train_params']
        self.gpu_params = params['gpu_params']
        self.cpu_params = params['cpu_params']
        self.features_params = params['features_params']
        self.observations_params = params['observations_params'] 
        self.models = []
        self.model_predictions = []
        self.predictions_summary = None        
        self.importance = []
        
    def fit(self, X, y):  
        # pprint.pprint(self.params)
        self.model_params = self.model_params | self.gpu_params if self.device == 'GPU' else \
            self.model_params | self.cpu_params   
        
        for fold in range(self.folds):
            print(f'\nFold: {fold + 1}')
            self.model_params['random_state'] += 1 
            
            if self.folds > 1:
                train_idx = [x for x in range(X.shape[0]) if x%self.folds != fold]
                val_idx = [x for x in range(X.shape[0]) if x%self.folds == fold]            
            
                X_train = X.iloc[train_idx]
                y_train = y.iloc[train_idx]
            
                X_val = X.iloc[val_idx]
                y_val = y.iloc[val_idx] 
            else:
                X_train = X
                y_train = y

                
            # TODO (weight function!)
            self.transformer = QuantileTransformer(n_quantiles=1000, random_state=0)
            self.transformer.fit((y_train).values.reshape(1, -1).T)
            
            
            train_dataset = Pool(data=X_train, label=self.transformer.transform(y_train.values.reshape(1, -1).T)[:,0] , 
                                 **self.features_params)                
           
            model = CatBoostRegressor(**self.model_params)    
            
            if self.folds > 1:
                eval_dataset = Pool(data=X_val, label=self.transformer.transform(y_val.values.reshape(1, -1).T)[:,0],
                                    **self.features_params)  
                model.fit(train_dataset, eval_set=eval_dataset, **self.train_params)
            else:
                model.fit(train_dataset,  **self.train_params)
            
            self.models.append(model.copy())   

        self._feature_importance()
            
        
    def predict(self, X):        
        self.model_predictions = []
        for model_number in range(self.folds):
            model = self.models[model_number]
            self.model_predictions.append(self.transformer.inverse_transform(model.predict(X).reshape(1, -1).T)[:,0])                                     
                                    
        self.predictions_summary = pd.DataFrame(data=np.array(self.model_predictions)).T
        
        self.predictions_summary['std'] = self.predictions_summary.std(axis=1)
        self.predictions_summary['predictions'] = (self.predictions_summary[[x for x in range(self.folds)]]).median(axis=1)
        self.predictions_summary['predictions2'] = (self.predictions_summary[[x for x in range(self.folds)]]).mean(axis=1)  
                    
        return self.predictions_summary     
            
    def get_errors(self, X, y):   
        preds = self.predict(X)['predictions']
        errors = y.values - preds.values     
        return errors
    
    
    def interpretations():
        pass
    
    def uncertain_objects(std_treshold=1):
        pass    
   
    # New functions
    def _feature_importance(self):        
        
        for model_number in range(self.folds):
            fi_dict = list(zip(features, self.models[model_number].feature_importances_))
            importance_df = pd.DataFrame.from_records(fi_dict).set_index(0)
            importance_df.columns = ['importance']
            importance_df = importance_df.sort_values('importance', ascending=False)
            self.importance.append(importance_df) 
    
    def save_model(self, path):
        with open(path, 'wb') as f:
            pickle.dump(self.models, f)
    
    def load_model(self, path):
        with open(path, 'rb') as f:
            self.models = pickle.load(f)

In [14]:
train = train.sort_values(['plavka_NMZ', 'TST', 'C', 'chronom_time_Продувка',
                           'gas_O2_mean',  'plavka_NAPR_ZAD'])

In [15]:
target = 'C'
features = ['plavka_NMZ', 'chugun_VES', 'gas_CO2_sum', 'chronom_time_Продувка',
            'sip_VES_VDSYP408', 'gas_T_start_povalka', 'gas_O2_pressure_mean']
print(len(features))

7


In [16]:
cat_features = ['plavka_NMZ']

split_params = {'folds': 8}

model_params = {'eval_metric': 'RMSE', 
                'objective': 'RMSE',
                'iterations':2500,
                'learning_rate':0.05,               
                'depth': 3, 
                'l2_leaf_reg': 50,
                'one_hot_max_size': 30,
                'task_type': 'CPU',
                'has_time': False,
                'random_state': 20210926                
               }

train_params = {'early_stopping_rounds': 150, 
                'silent': True,
                'plot': True               
               }

gpu_params = {'devices': '0',
              'data_partition': 'DocParallel',
              'max_ctr_complexity': 4,
              'bootstrap_type': 'MVS',
              'border_count': 128, 
              'fold_permutation_block': 0,
              'simple_ctr':'FeatureFreq',
              'combinations_ctr': 'FeatureFreq',      
              'ctr_target_border_count': 1,
              'random_strength': 0.95,
              'gpu_ram_part': 0.95
             }


cpu_params = {'thread_count': 14}

features_params = {'cat_features': cat_features}


observations_params = {"weight": 'not_implemented'}

params = {'split_params': split_params, 
          'model_params': model_params, 
          'train_params': train_params, 
          'gpu_params': gpu_params, 
          'cpu_params': cpu_params, 
          'features_params': features_params, 
          'observations_params': observations_params
         }

In [17]:
%%time
ml2_quantile = CModel(params)
ml2_quantile.fit(train_[features], train_[target])

preds2_quantile = ml2_quantile.predict(val_[features])
print('Metric predictions:', np.round(((val_[target] - preds2_quantile['predictions'].values).abs() < 0.02).mean(), 4))
print('Metric predictions2:', np.round(((val_[target] - preds2_quantile['predictions2'].values).abs() < 0.02).mean(), 4))


Fold: 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 5


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 6


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 7


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 8


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Metric predictions: 0.7476
Metric predictions2: 0.7451
Wall time: 36.9 s


In [18]:
ml2_quantile.importance[0]

Unnamed: 0_level_0,importance
0,Unnamed: 1_level_1
gas_T_start_povalka,52.511669
sip_VES_VDSYP408,13.470672
gas_CO2_sum,12.901034
chronom_time_Продувка,7.856676
plavka_NMZ,5.825459
chugun_VES,4.036281
gas_O2_pressure_mean,3.398208


# Submit

In [21]:
%%time
ml = CModel(params)
ml.fit(train[features], train[target])

C_preds = ml.predict(test[features])


Fold: 1


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 2


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 3


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 4


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 5


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 6


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 7


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))


Fold: 8


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Wall time: 47.9 s


In [22]:
test['TST'] = T_preds['predictions']
test['C'] = C_preds['predictions']

test[test.columns[:3]].to_csv('data/submissions/final_model.csv')
test[test.columns[:3]]

Unnamed: 0,NPLV,TST,C
0,512324,1658.428347,0.045500
1,512327,1640.518913,0.084000
2,512328,1641.007413,0.087500
3,512331,1650.025620,0.065531
4,512333,1649.034171,0.084000
...,...,...,...
775,513369,1653.358263,0.062000
776,513370,1649.328599,0.081290
777,513371,1640.745844,0.060477
778,513372,1659.105493,0.068000
