In [1]:
import warnings
# Blanket filter: every warning category is ignored for the rest of the session,
# including any pandas or scikit-learn warnings emitted by later cells.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import make_scorer

from sklearn.model_selection import train_test_split

from random import seed
# Seeds only Python's built-in `random` generator; NumPy's global RNG is not
# touched here. The split and the forests further down pass random_state=0
# explicitly.
seed(777)

In [3]:
def metric(answers, user_csv):
    """Return the mean hit rate over the 'C' and 'TST' predictions.

    A 'C' prediction counts as a hit when its absolute error is below 0.02,
    a 'TST' prediction when its absolute error is below 20. The result is the
    total number of hits divided by two times the number of rows in
    ``answers['C']``, so it lies in [0, 1] and higher is better.

    Parameters
    ----------
    answers : mapping with 'C' and 'TST' entries (ground truth)
    user_csv : mapping with 'C' and 'TST' entries (predictions)

    Both arguments are converted with ``np.array``, so values are compared
    positionally and any pandas index is ignored.
    """
    c_error = np.abs(np.array(answers['C']) - np.array(user_csv['C']))
    t_error = np.abs(np.array(answers['TST']) - np.array(user_csv['TST']))

    # Boolean masks -> 0/1 integers so they can be summed together.
    c_hits = np.int64(c_error < 0.02)
    t_hits = np.int64(t_error < 20)

    n_rows = np.size(answers['C'])
    total_hits = np.sum(c_hits + t_hits)

    return total_hits / 2 / n_rows

# `metric` is a hit rate in [0, 1] where higher is better, so the scorer must
# not sign-flip it (greater_is_better=False is meant for loss functions).
# NOTE(review): make_scorer calls metric(y_true, y_pred), and metric indexes
# both arguments by 'C' and 'TST'; y_pred presumably has to be DataFrame-like
# for this scorer to work — confirm before using it in a model search.
score = make_scorer(metric, greater_is_better=True)

In [43]:
def prepare_plavki(dataset):
    """Reduce the heats table to one row per NPLV and add its duration.

    Steps (the input frame is left untouched):
      1. drop the descriptive 'plavka_*' columns listed below;
      2. keep only the first row for every 'NPLV';
      3. parse 'plavka_VR_NACH' and 'plavka_VR_KON' as datetimes;
      4. add 'duration' = plavka_VR_KON - plavka_VR_NACH, in seconds.

    Returns the resulting DataFrame; the original row index labels of the
    kept rows are preserved.
    """
    unused_columns = [
        'plavka_NMZ', 'plavka_TIPE_FUR', 'plavka_NAPR_ZAD', 'plavka_STFUT',
        'plavka_ST_FURM', 'plavka_TIPE_GOL', 'plavka_ST_GOL',
    ]

    heats = (
        dataset
        .drop(columns=unused_columns)
        .drop_duplicates(subset=['NPLV'], keep='first')
        .assign(
            plavka_VR_NACH=lambda frame: pd.to_datetime(frame['plavka_VR_NACH']),
            plavka_VR_KON=lambda frame: pd.to_datetime(frame['plavka_VR_KON']),
        )
    )

    elapsed = heats['plavka_VR_KON'] - heats['plavka_VR_NACH']
    return heats.assign(duration=elapsed.dt.total_seconds())


def prepare_lom(dataset):
    """Sum, per heat, everything recorded in the scrap table.

    'VES' is renamed to 'VES_loma', the 'VDL' column is dropped, and the
    remaining columns are summed within each 'NPLV'.

    The input frame is not modified: the rename is applied to a derived frame
    rather than in place, so the raw table keeps its original column names
    and the function can be re-run safely.

    Returns a DataFrame with 'NPLV' as a regular column and one row per heat.
    """
    return (
        dataset
        .rename(columns={"VES": "VES_loma"})
        .drop(columns='VDL')
        .groupby('NPLV')
        .sum()
        .reset_index()
    )


def prepare_chugun(dataset):
    """Parse the measurement timestamp and rename the hot-metal columns.

    The LAST column of ``dataset`` is converted with ``pd.to_datetime`` and
    renamed to "zamer_chuguna"; "VES" is renamed to "VES_chuguna".
    NOTE(review): the timestamp column is picked by position, not by name —
    presumably it is 'DATA_ZAMERA'; confirm the CSV column order.

    Works on a copy, so the caller's frame keeps its original column names
    and dtypes (consistent with ``prepare_plavki``).

    Returns the transformed DataFrame.
    """
    df = dataset.copy()

    timestamp_col = df.columns[-1]
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

    return df.rename(columns={timestamp_col: "zamer_chuguna", "VES": "VES_chuguna"})


def prepare_sip(dataset):
    """Sum the 'VSSYP' mass of all additions for every heat.

    Returns a DataFrame with two columns, 'NPLV' and 'VES_sip', one row per
    distinct 'NPLV', in order of first appearance in ``dataset`` and with a
    fresh 0..n-1 index.

    Implemented as a single group-by instead of a per-heat loop with chained
    assignment (``df['col'].loc[mask] = value``): the chained form does not
    write through when pandas Copy-on-Write is active, rescans the table once
    per heat, and leaves 'VES_sip' with object dtype. Here 'VES_sip' is
    numeric.
    """
    return (
        dataset
        .groupby('NPLV', sort=False)['VSSYP']  # sort=False keeps first-seen order
        .sum()
        .rename('VES_sip')
        .reset_index()
    )


def prepare_dataset(plavki, lom, chugun, sip):
    """Assemble the per-heat feature table.

    Starting from a copy of ``plavki``, the ``lom``, ``chugun`` and ``sip``
    tables are joined one after another on 'NPLV' (rows of ``plavki`` are
    kept, unmatched values become NaN). 'zamer_chuguna' is then replaced by
    its offset from 'plavka_VR_NACH' in seconds, and the two raw timestamp
    columns 'plavka_VR_NACH' / 'plavka_VR_KON' are removed.

    Returns the merged DataFrame, indexed like ``plavki``.
    """
    merged = plavki.copy()
    for table in (lom, chugun, sip):
        merged = merged.join(table.set_index('NPLV'), on='NPLV')

    # Seconds between the start of the heat and the hot-metal measurement.
    offset = merged['zamer_chuguna'] - merged['plavka_VR_NACH']
    merged['zamer_chuguna'] = offset.dt.total_seconds()

    return merged.drop(columns=['plavka_VR_NACH', 'plavka_VR_KON'])

In [12]:
# Every source table ships as a train/test pair of CSV files; load each pair
# together so the two splits stay visibly in sync.
plavki_train, plavki_test = (pd.read_csv('train/plavki_train.csv'),
                             pd.read_csv('test/plavki_test.csv'))

lom_train, lom_test = (pd.read_csv('train/lom_train.csv'),
                       pd.read_csv('test/lom_test.csv'))

chugun_train, chugun_test = (pd.read_csv('train/chugun_train.csv'),
                             pd.read_csv('test/chugun_test.csv'))

sip_train, sip_test = (pd.read_csv('train/sip_train.csv'),
                       pd.read_csv('test/sip_test.csv'))

In [44]:
# Apply the same per-table preparation to the train and test splits.
plavki_tr, plavki_te = prepare_plavki(plavki_train), prepare_plavki(plavki_test)
lom_tr, lom_te = prepare_lom(lom_train), prepare_lom(lom_test)
chugun_tr, chugun_te = prepare_chugun(chugun_train), prepare_chugun(chugun_test)
sip_tr, sip_te = prepare_sip(sip_train), prepare_sip(sip_test)

In [45]:
# Merge the prepared tables into one feature frame per split.
dataset_train = prepare_dataset(plavki=plavki_tr, lom=lom_tr,
                                chugun=chugun_tr, sip=sip_tr)
dataset_test = prepare_dataset(plavki=plavki_te, lom=lom_te,
                               chugun=chugun_te, sip=sip_te)

In [46]:
# Sanity check: (rows, columns) of each split; the column counts should match.
dataset_train.shape, dataset_test.shape

((2063, 16), (780, 16))

In [47]:
# Preview the first rows of the merged training features.
dataset_train.head()

Unnamed: 0,NPLV,duration,VES_loma,VES_chuguna,T,SI,MN,S,P,CR,NI,CU,V,TI,zamer_chuguna,VES_sip
0,510008,2579.0,76200,263700.0,1396.0,0.44,0.22,0.023,0.097,0.03,0.01,0.03,0.103,0.084,412.0,20970
1,510009,4004.0,78600,264500.0,1419.0,0.68,0.2,0.017,0.087,0.02,0.01,0.03,0.084,0.096,1384.0,23780
2,510010,2904.0,76300,263800.0,1384.0,0.56,0.26,0.017,0.096,0.03,0.01,0.03,0.115,0.11,551.0,24070
3,510011,3291.0,84100,264000.0,1401.0,0.48,0.27,0.018,0.091,0.03,0.01,0.02,0.112,0.11,429.0,27300
4,510012,2895.0,76100,263300.0,1422.0,0.47,0.23,0.018,0.096,0.02,0.01,0.03,0.083,0.07,558.0,28540


In [48]:
# Preview the first rows of the merged test features.
dataset_test.head()

Unnamed: 0,NPLV,duration,VES_loma,VES_chuguna,T,SI,MN,S,P,CR,NI,CU,V,TI,zamer_chuguna,VES_sip
0,512324,3886.0,45700,240100.0,1355.0,0.46,0.33,0.027,0.079,0.01,0.01,0.02,0.048,0.03,-1107.0,16940
1,512327,3325.0,71000,266400.0,1390.0,0.3,0.33,0.032,0.099,0.01,0.0,0.0,0.05,0.024,-3991.0,15340
2,512328,5780.0,71700,270200.0,1373.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1569.0,21410
3,512331,5252.0,70500,266700.0,1383.0,0.54,0.39,0.028,0.115,0.02,0.01,0.03,0.059,0.042,-1415.0,14990
4,512333,3828.0,104500,267400.0,1387.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,305.0,21060


In [49]:
# Persist the merged training features (the row index is not written).
dataset_train.to_csv('train.csv', index=False)

In [50]:
# Persist the merged test features (the row index is not written).
dataset_test.to_csv('test.csv', index=False)

In [51]:
# Targets for the training heats; missing values are replaced with 0.
# NOTE(review): a 0 fill makes rows with a missing 'TST'/'C' train as real
# zero targets — confirm this is intended rather than dropping those rows.
y = pd.read_csv('train/target_train.csv').fillna(0)

In [52]:
# Hold out 20% of the training rows for local validation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    dataset_train,
    y,
    test_size=0.2,
    random_state=0,
)

In [53]:
# One forest per target, both built from the same hyper-parameters.
forest_params = dict(random_state=0, max_depth=10, max_leaf_nodes=6)

regressor_c = RandomForestRegressor(**forest_params)
regressor_t = RandomForestRegressor(**forest_params)

In [54]:
# Fit the 'TST' model on the training split.
regressor_t.fit(X_train, y_train['TST'])

RandomForestRegressor(max_depth=10, max_leaf_nodes=6, random_state=0)

In [55]:
# Fit the 'C' model on the training split.
regressor_c.fit(X_train, y_train['C'])

RandomForestRegressor(max_depth=10, max_leaf_nodes=6, random_state=0)

In [56]:
# In-sample predictions for both targets, in the layout `metric` expects.
tr = pd.DataFrame({
    'TST': regressor_t.predict(X_train),
    'C': regressor_c.predict(X_train),
})

In [57]:
# Hit rate on the rows the models were fitted on (in-sample).
metric(y_train, tr)

0.4766666666666667

In [58]:
# Predictions for the held-out split, in the layout `metric` expects.
te = pd.DataFrame({
    'TST': regressor_t.predict(X_test),
    'C': regressor_c.predict(X_test),
})

In [59]:
# Hit rate on the held-out 20% split.
metric(y_test, te)

0.4794188861985472

In [60]:
# Submission frame: heat id plus the two predicted targets for the test set.
res = dataset_test[['NPLV']].assign(
    TST=regressor_t.predict(dataset_test),
    C=regressor_c.predict(dataset_test),
)

In [61]:
# Write the submission file (the row index is not written).
res.to_csv('result.csv', index=False)