In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import RandomizedSearchCV

In [6]:
lom_train = pd.read_csv("data/lom_train.csv")
lom_test = pd.read_csv("data/lom_test.csv")

In [7]:
sip_train = pd.read_csv("data/sip_train.csv")
sip_test = pd.read_csv("data/sip_test.csv")

In [8]:
def get_sip_or_lom_features(lom_df, types, type_column='VDL', ves_column='VES'):
    df = lom_df.sort_values('NPLV').copy()
    ids = df['NPLV'].unique()
    num_ids = len(ids)
    dict_to_df = {type_column + '_' + str(lom_type) + '_VES': np.zeros(num_ids) for lom_type in types}
    dict_to_df['NPLV'] = ids
    k = 0
    nplv_last = df.iloc[0]['NPLV']
    for i in range(df.shape[0]):
        nplv_next = df.iloc[i]['NPLV']
        if nplv_next != nplv_last:
            k+=1
        lom_type = df.iloc[i][type_column]
        ves = df.iloc[i][ves_column]
        column_name = type_column+'_'+str(lom_type)+'_VES'
        if column_name in dict_to_df.keys():
            dict_to_df[column_name][k] += ves
        nplv_last = nplv_next
    result = pd.DataFrame.from_dict(dict_to_df)
    return result

In [13]:
%%time

# Подготовим фичи по файлам lom, sip
features_train_lom = get_sip_or_lom_features(lom_train, lom_train['VDL'].unique())
features_train_sip = get_sip_or_lom_features(sip_train, sip_train['VDSYP'].unique(), 
                                         type_column='VDSYP', ves_column='VSSYP')

features_test_lom = get_sip_or_lom_features(lom_test, lom_train['VDL'].unique())
features_test_sip = get_sip_or_lom_features(sip_test, sip_train['VDSYP'].unique(), 
                                         type_column='VDSYP', ves_column='VSSYP')


CPU times: user 15 s, sys: 3.23 ms, total: 15 s
Wall time: 15 s


In [14]:
features_train_lom.to_csv("data/features_train_lom.csv", index=False)
features_test_lom.to_csv("data/features_test_lom.csv", index=False)
features_train_sip.to_csv("data/features_train_sip.csv", index=False)
features_test_sip.to_csv("data/features_test_sip.csv", index=False)

In [15]:
produv_train = pd.read_csv("data/produv_train.csv")
produv_test = pd.read_csv("data/produv_test.csv")

In [16]:
chugun_train = pd.read_csv("data/chugun_train.csv")
chugun_test = pd.read_csv("data/chugun_test.csv")

In [17]:
sip_train = pd.read_csv("data/features_train_sip.csv")
sip_test = pd.read_csv("data/features_test_sip.csv")

In [18]:
lom_train = pd.read_csv("data/features_train_lom.csv")
lom_test = pd.read_csv("data/features_test_lom.csv")

In [19]:
plavka_train = pd.read_csv("data/plavki_train.csv")
plavka_test = pd.read_csv("data/plavki_test.csv")

In [20]:
target_train = pd.read_csv("data/target_train.csv")
target_test = pd.read_csv("data/sample_submission.csv")

In [21]:
%%time
gas_train = pd.read_csv("data/gas_train.csv")
gas_test = pd.read_csv("data/gas_test.csv")

CPU times: user 18.4 s, sys: 1.08 s, total: 19.5 s
Wall time: 21.7 s


In [22]:
def gas_param_endofprocess(gas_df, param="O2"):
    
    gas_df['Time'] = pd.to_datetime(gas_df['Time'])
    gas_df.set_index('Time')
    
    df = pd.DataFrame(columns=['NPLV', param + "_endofprocess"])
    i = 0
    for nplv in gas_df['NPLV'].unique():
        gas_df_tmp = gas_df[gas_df['NPLV'] == nplv].copy()
        param_endofprocess_median = gas_df_tmp[-600:][param].median()
        df.loc[i, ['NPLV', param + "_endofprocess"]] = [nplv, param_endofprocess_median]
        i += 1
    return df

In [23]:
def gas_param_startofprocess(gas_df, param="N2"):
    
    gas_df['Time'] = pd.to_datetime(gas_df['Time'])
    gas_df.set_index('Time')
    
    df = pd.DataFrame(columns=['NPLV', param + "_endofprocess"])
    i = 0
    for nplv in gas_df['NPLV'].unique():
        gas_df_tmp = gas_df[gas_df['NPLV'] == nplv].copy()
        param_endofprocess_median = gas_df_tmp[:600][param].median()
        df.loc[i, ['NPLV', param + "_endofprocess"]] = [nplv, param_endofprocess_median]
        i += 1
    return df

In [24]:
def preprocess_df(produv_df, chugun_df, sip_df, lom_df, plavka_df, gas_df, target_df, is_df_train=True, df_train=None):
    
    produv_df['SEC'] = pd.to_datetime(produv_df['SEC'])
    produv_df = produv_df.groupby('NPLV', as_index=False)['SEC'].min()

    chugun_df['DATA_ZAMERA'] = pd.to_datetime(chugun_df['DATA_ZAMERA'])    

    
    plavka_df['plavka_VR_NACH'] = pd.to_datetime(plavka_df['plavka_VR_NACH'])
    plavka_df['plavka_VR_KON'] = pd.to_datetime(plavka_df['plavka_VR_KON'])
    plavka_df['plavka_duration'] = (plavka_df['plavka_VR_KON'] - plavka_df['plavka_VR_NACH']).dt.total_seconds()

    plavka_TIPE_FUR = pd.get_dummies(plavka_df['plavka_TIPE_FUR'], drop_first=True, prefix='plavka_TIPE_FUR')
    plavka_df = pd.concat([plavka_df, plavka_TIPE_FUR], axis=1)

    plavka_df = plavka_df[['NPLV', 'plavka_duration', 'plavka_NMZ', 'plavka_TIPE_FUR_цилиндрическая']]
    
    gas_df_tmp = gas_df.groupby("NPLV", as_index=False)[['O2']].median()
    

    gas_df_tmp['T_q95'] = gas_df.groupby("NPLV", as_index=False)[['T']].quantile(q=0.95)['T']
    
    gas_df_tmp = pd.merge(gas_df_tmp, gas_param_endofprocess(gas_df, param='O2'))


    
    df_tmp = pd.merge(produv_df, chugun_df, on='NPLV')
    df_tmp = pd.merge(df_tmp, sip_df, on='NPLV')
    df_tmp = pd.merge(df_tmp, lom_df, on='NPLV')
    df_tmp = pd.merge(df_tmp, plavka_df, on='NPLV')
    df_tmp = pd.merge(df_tmp, target_df, on='NPLV')
    df_tmp = pd.merge(df_tmp, gas_df_tmp, on='NPLV')

    df_tmp['produv_startdata_chugun_datazamera_delta'] = (df_tmp['SEC'] - df_tmp['DATA_ZAMERA']).dt.total_seconds()

    df_tmp.drop(['SEC', 'DATA_ZAMERA'], axis=1, inplace=True)
    
    if is_df_train:
        
        df_tmp['median_C_per_plavka_NMZ'] = df_tmp['C'].median()
        df_tmp['median_TST_per_plavka_NMZ'] = df_tmp['TST'].median()
        


        median_C_TST_per_plavka_NMZ = df_tmp.groupby('plavka_NMZ')[['C', 'TST']].median().to_dict()
        
        for i in range(len(df_tmp)):
            if df_tmp.loc[i, 'plavka_NMZ'] in median_C_TST_per_plavka_NMZ['C'].keys():
                mark = df_tmp.loc[i, 'plavka_NMZ']
                df_tmp.loc[i, 'median_C_per_plavka_NMZ'] = median_C_TST_per_plavka_NMZ['C'][mark]
                df_tmp.loc[i, 'median_TST_per_plavka_NMZ'] = median_C_TST_per_plavka_NMZ['TST'][mark]

        
    else:
        df_tmp['median_C_per_plavka_NMZ'] = df_train['C'].median()
        df_tmp['median_TST_per_plavka_NMZ'] = df_train['TST'].median()


        median_C_TST_per_plavka_NMZ = df_train.groupby('plavka_NMZ')[['C', 'TST']].median().to_dict()
        
        for i in range(len(df_tmp)):
            if df_tmp.loc[i, 'plavka_NMZ'] in median_C_TST_per_plavka_NMZ['C'].keys():
                mark = df_tmp.loc[i, 'plavka_NMZ']
                df_tmp.loc[i, 'median_C_per_plavka_NMZ'] = median_C_TST_per_plavka_NMZ['C'][mark]
                df_tmp.loc[i, 'median_TST_per_plavka_NMZ'] = median_C_TST_per_plavka_NMZ['TST'][mark]

                
    return df_tmp

In [25]:
#train, validation

In [26]:
%%time
df = preprocess_df(produv_train, chugun_train, sip_train, lom_train, plavka_train, gas_train, target_train)

CPU times: user 46.7 s, sys: 700 ms, total: 47.4 s
Wall time: 15.7 s


In [28]:
df.head().T

Unnamed: 0,0,1,2,3,4
NPLV,510008,510009,510010,510011,510012
VES,263700.0,264500.0,263800.0,264000.0,263300.0
T,1396.0,1419.0,1384.0,1401.0,1422.0
SI,0.44,0.68,0.56,0.48,0.47
MN,0.22,0.2,0.26,0.27,0.23
S,0.023,0.017,0.017,0.018,0.018
P,0.097,0.087,0.096,0.091,0.096
CR,0.03,0.02,0.03,0.03,0.02
NI,0.01,0.01,0.01,0.01,0.01
CU,0.03,0.03,0.03,0.02,0.03


In [29]:
df = df.dropna()
df.index = range(len(df))

In [30]:
X = df.drop(labels=['NPLV', 'C', 'TST', 'plavka_NMZ'], axis=1)
y = df[['C', 'TST']]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.36, random_state=42)

In [32]:
rf_grid = {
    "estimator__n_estimators": np.arange(10, 60, 10),
    "estimator__max_depth": [3, 5, 7, 9],
    "estimator__min_samples_leaf": np.arange(20, 100, 20),
    "estimator__min_samples_split": np.arange(40, 100, 20),
    "estimator__max_features": ["auto", "sqrt", "log2"]
    
}

In [33]:
%%time
rf_reg = RandomForestRegressor(n_jobs=-1, random_state=42)

rs_rf_model = RandomizedSearchCV(
    MultiOutputRegressor(rf_reg),
    rf_grid,
    n_iter=10,
    cv=5,
    verbose=True,
    n_jobs=-1,
    random_state=42

)

rs_rf_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.5s


CPU times: user 223 ms, sys: 145 ms, total: 368 ms
Wall time: 2.97 s


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.8s finished


RandomizedSearchCV(cv=5,
                   estimator=MultiOutputRegressor(estimator=RandomForestRegressor(n_jobs=-1,
                                                                                  random_state=42)),
                   n_jobs=-1,
                   param_distributions={'estimator__max_depth': [3, 5, 7, 9],
                                        'estimator__max_features': ['auto',
                                                                    'sqrt',
                                                                    'log2'],
                                        'estimator__min_samples_leaf': array([20, 40, 60, 80]),
                                        'estimator__min_samples_split': array([40, 60, 80]),
                                        'estimator__n_estimators': array([10, 20, 30, 40, 50])},
                   random_state=42, verbose=True)

In [34]:
rs_rf_model.best_params_

{'estimator__n_estimators': 40,
 'estimator__min_samples_split': 40,
 'estimator__min_samples_leaf': 40,
 'estimator__max_features': 'auto',
 'estimator__max_depth': 5}

In [35]:
y_train_pred = pd.DataFrame(rs_rf_model.predict(X_train), columns=['C', 'TST'])

In [36]:
y_test_pred = pd.DataFrame(rs_rf_model.predict(X_test), columns=['C', 'TST'])

In [37]:
def metric(answers, user_csv):

    delta_c = np.abs(np.array(answers['C']) - np.array(user_csv['C']))
    hit_rate_c = np.int64(delta_c < 0.02)

    delta_t = np.abs(np.array(answers['TST']) - np.array(user_csv['TST']))
    hit_rate_t = np.int64(delta_t < 20)

    N = np.size(answers['C'])
    
    return np.sum(hit_rate_c + hit_rate_t) / 2 / N    

In [38]:
metric(y_train, y_train_pred)

0.6625183016105417

In [39]:
metric(y_test, y_test_pred)

0.6241872561768531

In [40]:
rs_rf_model.score(X_train, y_train)

0.432672256564414

In [41]:
rs_rf_model.score(X_test, y_test)

0.39191481321015953

In [42]:
#train, test

In [43]:
%%time
df = preprocess_df(produv_train, chugun_train, sip_train, lom_train, plavka_train, gas_train, target_train)
df = df.dropna()
df.index = range(len(df))

CPU times: user 44.5 s, sys: 405 ms, total: 44.9 s
Wall time: 13 s


In [44]:
X_train = df.drop(labels=['NPLV', 'C', 'TST', 'plavka_NMZ'], axis=1)
y_train = df[['C', 'TST']]

In [45]:
rs_rf_model.best_params_

{'estimator__n_estimators': 40,
 'estimator__min_samples_split': 40,
 'estimator__min_samples_leaf': 40,
 'estimator__max_features': 'auto',
 'estimator__max_depth': 5}

In [46]:
%%time
rf_reg = MultiOutputRegressor(
    RandomForestRegressor(
        n_jobs=-1, 
        random_state=42, 
        n_estimators = rs_rf_model.best_params_['estimator__n_estimators'],
        min_samples_split = rs_rf_model.best_params_['estimator__min_samples_split'],
        min_samples_leaf = rs_rf_model.best_params_['estimator__min_samples_leaf'],
        max_features = rs_rf_model.best_params_['estimator__max_features'],
        max_depth = rs_rf_model.best_params_['estimator__max_depth']
    )
)
rf_reg.fit(X_train, y_train)

CPU times: user 88.1 ms, sys: 8.11 ms, total: 96.2 ms
Wall time: 171 ms


MultiOutputRegressor(estimator=RandomForestRegressor(max_depth=5,
                                                     min_samples_leaf=40,
                                                     min_samples_split=40,
                                                     n_estimators=40, n_jobs=-1,
                                                     random_state=42))

In [47]:
y_train_pred = pd.DataFrame(rf_reg.predict(X_train), columns=['C', 'TST'])
metric(y_train, y_train_pred)

0.6618266978922717

In [48]:
rf_reg.score(X_train, y_train)

0.5231867870502924

In [49]:
%%time
df_test = preprocess_df(
    produv_test, chugun_test, sip_test, lom_test, plavka_test, gas_test, target_test, is_df_train=False, df_train=df)


CPU times: user 8.51 s, sys: 236 ms, total: 8.75 s
Wall time: 3.84 s


In [50]:
X_test = df_test.drop(labels=['NPLV', 'C', 'TST', 'plavka_NMZ'], axis=1)
y_test = df_test[['C', 'TST']]

In [51]:
y_test_pred = pd.DataFrame(rf_reg.predict(X_test), columns=['C', 'TST'])

In [52]:
y_test_pred['NPLV'] = df_test['NPLV']

In [53]:
y_test_pred[['NPLV', 'TST', 'C']].to_csv("predictions.csv", index=False)