In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.metrics import make_scorer

np.random.seed(1)

# METRICS

In [3]:
def metric(answers, user_csv):

    delta_c = np.abs(np.array(answers['C']) - np.array(user_csv['C']))
    hit_rate_c = np.int64(delta_c < 0.02)

    delta_t = np.abs(np.array(answers['TST']) - np.array(user_csv['TST']))
    hit_rate_t = np.int64(delta_t < 20)

    N = np.size(answers['C'])

    return np.sum(hit_rate_c + hit_rate_t) / 2 / N

score = make_scorer(metric, greater_is_better=False)

In [4]:
def prepare_plavki(dataset):
    
    dataset.drop_duplicates(subset=['NPLV'], keep='first', inplace=True)
    
    df = pd.DataFrame({
        'NPLV': dataset['NPLV'].values,
        'plavka_VR_NACH': dataset['plavka_VR_NACH'].values,
        'plavka_VR_KON': dataset['plavka_VR_KON'].values,
    })

    
    df['plavka_VR_NACH'] = pd.to_datetime(df['plavka_VR_NACH'])
    df['plavka_VR_KON'] = pd.to_datetime(df['plavka_VR_KON'])
    df['duration'] = df['plavka_VR_KON'] - df['plavka_VR_NACH']
    df['duration'] = df['duration'].dt.total_seconds()
    
    df_type_fur = pd.get_dummies(dataset['plavka_TIPE_FUR'], drop_first=True, prefix='type')
    df_napr = pd.get_dummies(dataset['plavka_NAPR_ZAD'], drop_first=True, prefix='napr')
    
    df = df.join(df_type_fur)
    df = df.join(df_napr)

    
    return df


def prepare_lom(dataset):
    """
        Суммирует массу всех ломов, которые были использованы
    """                
    dataset.rename(columns={"VES":"VES_loma"}, inplace=True)
    
    dataset.drop(columns=['VDL'], inplace=True)
    
    dataset = pd.get_dummies(dataset, columns=['NML'])
    
    for i in range(dataset.shape[0]):
        for col in dataset.columns:
            if dataset.loc[i, col] == 1:
                dataset.loc[i, col] = dataset.loc[i, 'VES_loma']
    
    df = dataset.groupby('NPLV').sum().reset_index()
    
    return df


def prepare_chugun(dataset):
    """
        Переводит 'DATA_ZAMERA' к pd.datetimeи переименовывает в "zamer_chuguna"
    """
    df = dataset
    df[dataset.columns[-1]] = pd.to_datetime(df[df.columns[-1]])
    df.rename(columns={df.columns[-1]: "zamer_chuguna", "VES":"VES_chuguna"}, inplace=True)
    
    return df


def prepare_dataset(plavki, lom, chugun):
    
    dataset = plavki
    dataset = dataset.join(lom.set_index('NPLV'), on='NPLV')
    dataset = dataset.join(chugun.set_index('NPLV'), on='NPLV')
    
    dataset['zamer_chuguna'] = dataset['zamer_chuguna'] - dataset['plavka_VR_NACH']
    dataset['zamer_chuguna'] = dataset['zamer_chuguna'].dt.total_seconds()
    
    dataset.drop(columns=['plavka_VR_NACH', 'plavka_VR_KON'], inplace=True)
    
    return dataset

# Data Prep

In [5]:
plavki_train = pd.read_csv('train/plavki_train.csv')
plavki_test = pd.read_csv('test/plavki_test.csv')

lom_train = pd.read_csv('train/lom_train.csv')
lom_test = pd.read_csv('test/lom_test.csv')

chugun_train = pd.read_csv('train/chugun_train.csv')
chugun_test = pd.read_csv('test/chugun_test.csv')

In [6]:
plavki_tr = prepare_plavki(plavki_train)
plavki_te = prepare_plavki(plavki_test)

lom_tr = prepare_lom(lom_train)
lom_te = prepare_lom(lom_test)

chugun_tr = prepare_chugun(chugun_train)
chugun_te = prepare_chugun(chugun_test)

In [7]:
lom_tr.drop(columns=['NML_НБ  '], inplace=True)

In [8]:
y = pd.read_csv('train/target_train.csv')

y.fillna(0, inplace=True)

In [11]:
dataset_train = prepare_dataset(plavki_tr, lom_tr, chugun_tr)
dataset_test = prepare_dataset(plavki_te, lom_te, chugun_te)

In [12]:
dataset_train.head()

Unnamed: 0,NPLV,duration,type_цилиндрическая,napr_МНЛЗ,napr_МНЛС,VES_loma,NML_25КШ,NML_К,NML_КП,NML_КШС8,...,SI,MN,S,P,CR,NI,CU,V,TI,zamer_chuguna
0,510008,2579.0,1.0,1.0,0.0,76200,0,56500,3000,0,...,0.44,0.22,0.023,0.097,0.03,0.01,0.03,0.103,0.084,412.0
1,510009,4004.0,1.0,1.0,0.0,78600,0,49800,6000,0,...,0.68,0.2,0.017,0.087,0.02,0.01,0.03,0.084,0.096,1384.0
2,510010,2904.0,1.0,0.0,0.0,76300,0,45900,2000,0,...,0.56,0.26,0.017,0.096,0.03,0.01,0.03,0.115,0.11,551.0
3,510011,3291.0,1.0,0.0,0.0,84100,0,51900,3000,0,...,0.48,0.27,0.018,0.091,0.03,0.01,0.02,0.112,0.11,429.0
4,510012,2895.0,1.0,0.0,1.0,76100,0,64000,6000,0,...,0.47,0.23,0.018,0.096,0.02,0.01,0.03,0.083,0.07,558.0


In [13]:
dataset_test.head()

Unnamed: 0,NPLV,duration,type_цилиндрическая,napr_МНЛЗ,napr_МНЛС,VES_loma,NML_25КШ,NML_К,NML_КП,NML_КШС8,...,SI,MN,S,P,CR,NI,CU,V,TI,zamer_chuguna
0,512324,3886.0,1,0,0,45700,0,26040,2000,0,...,0.46,0.33,0.027,0.079,0.01,0.01,0.02,0.048,0.03,-1107.0
1,512327,3325.0,1,0,1,71000,0,49400,3000,0,...,0.3,0.33,0.032,0.099,0.01,0.0,0.0,0.05,0.024,-3991.0
2,512328,5780.0,1,0,0,71700,0,65700,6000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1569.0
3,512331,5252.0,1,0,0,70500,0,36000,2000,0,...,0.54,0.39,0.028,0.115,0.02,0.01,0.03,0.059,0.042,-1415.0
4,512333,3828.0,1,0,1,104500,0,40300,2000,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,305.0


In [14]:
dataset_train.isna().sum()

NPLV                    0
duration                0
type_цилиндрическая    74
napr_МНЛЗ              74
napr_МНЛС              74
VES_loma                0
NML_25КШ                0
NML_К                   0
NML_КП                  0
NML_КШС8                0
NML_ЛЧ                  0
NML_О                   0
NML_ОК                  0
NML_СК                  0
NML_У2КШ                0
VES_chuguna             0
T                       0
SI                      0
MN                      0
S                       0
P                       0
CR                      0
NI                      0
CU                      0
V                       0
TI                      0
zamer_chuguna           0
dtype: int64

In [15]:
dataset_test.isna().sum().sum()

0

In [16]:
dataset_train['type_цилиндрическая'].value_counts()

1.0    1874
0.0     115
Name: type_цилиндрическая, dtype: int64

In [17]:
dataset_train['napr_МНЛЗ'].value_counts()

0.0    1455
1.0     534
Name: napr_МНЛЗ, dtype: int64

In [18]:
dataset_train['napr_МНЛС'].value_counts()

0.0    1082
1.0     907
Name: napr_МНЛС, dtype: int64

In [19]:
dataset_train['type_цилиндрическая'].fillna(1, inplace=True)
dataset_train['napr_МНЛЗ'].fillna(0, inplace=True)
dataset_train['napr_МНЛС'].fillna(0, inplace=True)

# Model training

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(dataset_train, y, test_size=0.2, random_state=0)

### RandomForest

In [22]:
from sklearn.ensemble import RandomForestRegressor

###### Temperature

In [23]:
regressor_t = RandomForestRegressor(max_depth=8, max_leaf_nodes=12, n_estimators=100, n_jobs=-1)
regressor_t.fit(X_train, y_train.loc[:, 'TST'])

RandomForestRegressor(max_depth=8, max_leaf_nodes=12, n_jobs=-1)

###### C

In [24]:
regressor_с = RandomForestRegressor(max_depth=8, max_leaf_nodes=12, n_estimators=100, n_jobs=-1)
regressor_с.fit(X_train, y_train.loc[:, 'C'])

RandomForestRegressor(max_depth=8, max_leaf_nodes=12, n_jobs=-1)

###### Metrics

In [25]:
tr = pd.DataFrame()

tr['TST'] = regressor_t.predict(X_train)
tr['C'] = regressor_с.predict(X_train)

metric(y_train, tr)

0.48424242424242425

In [26]:
te = pd.DataFrame()

te['TST'] = regressor_t.predict(X_test)
te['C'] = regressor_с.predict(X_test)

metric(y_test, te)

0.48062953995157387

### DTree

In [27]:
from sklearn.tree import DecisionTreeRegressor

###### Temperature

In [28]:
reg_t = DecisionTreeRegressor(max_depth=8,
                              max_leaf_nodes=16,
                              min_samples_leaf=5,
                              random_state=0)

reg_t.fit(X_train, y_train.loc[:,'TST'])

DecisionTreeRegressor(max_depth=8, max_leaf_nodes=16, min_samples_leaf=5,
                      random_state=0)

###### C

In [29]:
reg_c = DecisionTreeRegressor(max_depth=8,
                              max_leaf_nodes=16,
                              min_samples_leaf=5,
                              random_state=0)

reg_c.fit(X_train, y_train.loc[:,'C'])

DecisionTreeRegressor(max_depth=8, max_leaf_nodes=16, min_samples_leaf=5,
                      random_state=0)

###### Metrics

In [30]:
tr_dtree = pd.DataFrame()

tr_dtree['TST'] = reg_t.predict(X_train)
tr_dtree['C'] = reg_c.predict(X_train)

metric(y_train, tr_dtree)

0.4951515151515152

In [31]:
te_dtree = pd.DataFrame()

te_dtree['TST'] = reg_t.predict(X_test)
te_dtree['C'] = reg_c.predict(X_test)

metric(y_test, te_dtree)

0.45036319612590797

###### Results

In [32]:
# res = pd.DataFrame()

# res['NPLV'] = dataset_test['NPLV']

# res['TST'] = regressor_t.predict(dataset_test)
# res['C'] = regressor_с.predict(dataset_test)

res = pd.DataFrame()

res['NPLV'] = dataset_test['NPLV']

res['TST'] = reg_t.predict(dataset_test)
res['C'] = reg_c.predict(dataset_test)

In [33]:
res.to_csv('result_dtree_4.csv', index=False)