In [4]:
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

from torchvision.transforms import transforms
from torch import nn
from torch.autograd import Variable
import torch

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Load

In [5]:
origin_train = pd.read_csv("train.csv")
origin_test = pd.read_csv('test.csv')
print(list(origin_train.columns))

['date_time', 'deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']


#function

In [6]:
def show_corr(df):
    corr_matrix = df.corr()[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']].T
    corr_matrix = corr_matrix.drop(columns =['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1 )
    plt.matshow(np.array(corr_matrix))
    plt.colorbar()
    plt.show()
    print(corr_matrix)

def df_to_x_y(df):
    y_train = df[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']]
    x_train = df.drop(columns= ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1)
    x_train, x_valid , y_train,y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=23)
    return x_train,y_train, x_valid, y_valid

def model_fit_kfold_grid_search (x_train,y_train,x_valid, y_valid):

    y_train_carbon_monoxide = y_train['target_carbon_monoxide']
    y_train_target_benzene = y_train['target_benzene']
    y_train_target_nitrogen_oxides = y_train['target_nitrogen_oxides']

    

    parameter = dict(n_estimators=100,
    objective='reg:squarederror',booster='gbtree',n_jobs=5,
    nthread=None,gamma=0, min_child_weight=1, max_delta_step=0,
    subsample=0.8, colsample_bytree=0.8)

    grid_searchs_parameter = {
              'max_depth': [i for i in range(7)], # 
              'learning_rate': [0.005,0.01, 0.02, 0.05, 0.1]
    }

    model_c = GridSearchCV(xgb.XGBRegressor(**parameter),param_grid= grid_searchs_parameter,cv= 3)
    model_t = GridSearchCV(xgb.XGBRegressor(**parameter),param_grid= grid_searchs_parameter,cv= 3)
    model_n = GridSearchCV(xgb.XGBRegressor(**parameter),param_grid= grid_searchs_parameter,cv= 3)


    model_c.fit(x_train,y_train_carbon_monoxide)
    model_t.fit(x_train,y_train_target_benzene)
    model_n.fit(x_train,y_train_target_nitrogen_oxides)

    result_c = model_c.predict(x_valid)
    result_t = model_t.predict(x_valid)
    result_n = model_n.predict(x_valid)

    max_depth_list = []
    learning_rate_list = []

    for i in [model_c,model_t,model_n]:
        best_parameters = i.best_estimator_.get_params()
        print(f'\n model below ')
        for param_name in sorted(grid_searchs_parameter.keys()):
            print(f"\n {(param_name, best_parameters[param_name])}")
            if param_name =='max_depth':
                max_depth_list.append(best_parameters[param_name])
            elif param_name =='learning_rate':
                learning_rate_list.append(best_parameters[param_name])
    return max_depth_list, learning_rate_list


def model_fit_kfold (x_train,y_train,x_valid, y_valid , max_depth_list = None , learning_rate_list = None ,type = 'train'):

    y_train_carbon_monoxide = y_train['target_carbon_monoxide']
    y_train_target_benzene = y_train['target_benzene']
    y_train_target_nitrogen_oxides = y_train['target_nitrogen_oxides']

    y_valid_carbon_monoxide  = y_valid['target_carbon_monoxide']
    y_valid_target_benzene  = y_valid['target_benzene']
    y_valid_target_nitrogen_oxides  = y_valid['target_nitrogen_oxides']

    parameter = dict(n_estimators=100,
    objective='reg:squarederror',booster='gbtree',n_jobs=10,
    nthread=None,gamma=0, min_child_weight=1, max_delta_step=0,
    subsample=0.8, colsample_bytree=0.8)

    if max_depth_list == None and learning_rate_list == None :
        model_c = xgb.XGBRegressor(**parameter)
        model_t = xgb.XGBRegressor(**parameter)
        model_n = xgb.XGBRegressor(**parameter)
    else :
        model_c = xgb.XGBRegressor(max_depth = max_depth_list[0],learning_rate = learning_rate_list[0] ,**parameter)
        model_t = xgb.XGBRegressor(max_depth = max_depth_list[1],learning_rate = learning_rate_list[1] ,**parameter)
        model_n = xgb.XGBRegressor(max_depth = max_depth_list[2],learning_rate = learning_rate_list[2] ,**parameter)

    fit_parameter = dict(early_stopping_rounds = 15, eval_metric = ['rmsle'])
    model_c.fit(x_train,y_train_carbon_monoxide,eval_set = [(x_valid, y_valid_carbon_monoxide)],**fit_parameter)
    model_t.fit(x_train,y_train_target_benzene,eval_set = [(x_valid, y_valid_target_benzene)],**fit_parameter)
    model_n.fit(x_train,y_train_target_nitrogen_oxides,eval_set = [(x_valid, y_valid_target_nitrogen_oxides)],**fit_parameter)

    result_c = model_c.predict(x_valid)
    result_t = model_t.predict(x_valid)
    result_n = model_n.predict(x_valid)

    mse_c = MSE(y_valid.iloc[:,0],result_c)
    mse_t = MSE(y_valid.iloc[:,1],result_t)
    mse_n = MSE(y_valid.iloc[:,2],result_n)

    if type == 'train':
        plt.figure(figsize=(20,6))
        plt.plot(x_valid.columns,model_c.feature_importances_)
        plt.plot(x_valid.columns,model_t.feature_importances_)
        plt.plot(x_valid.columns,model_n.feature_importances_)
        print(model_n.feature_importances_.sum())
        plt.show()

    if type == 'test':
        return  model_c ,model_t ,model_n

In [11]:
none_train = origin_train.copy()
print(len(none_train))
for i in  range(1,len(none_train.columns)):
    target = none_train.iloc[:,i]
    scope = target.mean()+(target.std()*2)
    column_name = none_train.columns[i]
    none_train = none_train[none_train[column_name]<float(scope)]
    print(f'{none_train.columns[i]}:{len(none_train)}')
clean_df = none_train.copy()

7111
deg_C:6873
relative_humidity:6775
absolute_humidity:6638
sensor_1:6368
sensor_2:6200
sensor_3:5846
sensor_4:5706
sensor_5:5513
target_carbon_monoxide:5276
target_benzene:5082
target_nitrogen_oxides:4809


## data engine

In [12]:
def feature_engine(df):
    df = df.reset_index(drop = True)
    # datetime_processing
    datatime = df['date_time']
    df['hour'] = pd.to_datetime(df['date_time']).dt.hour
    df['hour'] = abs(df['hour']-12)/12

    df['month'] = pd.to_datetime(df['date_time']).dt.month
    df['month'] = abs(df['hour']-6)/6

    df = df.drop(['date_time'],axis = 1)
    # mean of senser
    df['1345_mean'] = round((df['sensor_1'] + df['sensor_3']+ df['sensor_4'] + df['sensor_5'])/4,1)

    # adjust mean_deg_c
    gap = 1

    mean_deg_c = df['deg_C'].copy()
    change = 0
    for i in range(1,len(mean_deg_c)-1):
        #increase too much
        if mean_deg_c[i-1] - mean_deg_c[i] >  gap:
            mean_deg_c[i] = mean_deg_c[i-1] - gap
            change +=1
        #decrease too much
        if mean_deg_c[i-1] - mean_deg_c[i] <  gap*-1:
            mean_deg_c[i] = mean_deg_c[i-1] + gap
            change +=1
    df['mean_deg_c'] = mean_deg_c
    df = df.drop(['deg_C'],axis = 1)
    df = pd.DataFrame(df)
    return df ,datatime

In [13]:
df = clean_df.copy()
df ,datatime = feature_engine(df)
x_train,y_train,x_valid, y_valid = df_to_x_y(df)
x_train.head()

Unnamed: 0,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,hour,month,1345_mean,mean_deg_c
1737,32.2,1.1605,1071.4,1066.8,775.7,1734.7,1160.9,0.833333,0.861111,1185.7,25.0
2542,48.2,1.4337,1096.9,947.0,724.0,1775.7,1168.0,0.416667,0.930556,1191.2,24.8
2937,28.6,1.277,819.1,613.8,1108.8,1402.8,427.2,0.083333,0.986111,939.5,30.0
952,41.2,1.0324,1056.6,1073.8,891.4,1558.7,762.3,0.5,0.916667,1067.2,21.9
337,46.9,0.628,898.6,547.0,1496.0,1176.1,486.9,0.666667,0.888889,1014.4,10.7


# LSTM

In [73]:
class RNN(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size),
                            torch.zeros(1,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
        return predictions[-1]

class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))


In [74]:
model = RNN()
loss_function = RMSLELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
epochs = 50
print(model)

RNN(
  (lstm): LSTM(1, 100)
  (linear): Linear(in_features=100, out_features=1, bias=True)
)


In [75]:
print(torch.tensor(x_train.values).shape)
print(torch.tensor(y_train.values).shape)
x_train = torch.tensor(x_train.values).to(torch.float32)
y_train = torch.tensor(y_train.values).to(torch.float32)
print(torch.tensor(x_valid.values).shape)
print(torch.tensor(y_valid.values).shape)
x_valid = torch.tensor(x_valid.values).to(torch.float32)
y_valid = torch.tensor(y_valid.values).to(torch.float32)

torch.Size([3847, 11])
torch.Size([3847, 3])


In [76]:
for i in range(epochs):
    train_loss_sum = 0.0
    valid_loss_sum = 0.0
    for seq, labels in zip(x_train,y_train):
        #train 
        model.train()
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size),torch.zeros(1, 1, model.hidden_layer_size))
        y_pred = model(seq)

        train_loss = loss_function(y_pred, labels)
        train_loss.backward()
        optimizer.step()
        #valid 
    for seq, labels in zip(x_valid,y_valid):
        model.eval()
        y_pred = model(seq)
        valid_loss = loss_function(y_pred, labels)

    if i%5 == 1:
        print(f'epoch: {i:3} train_loss: {train_loss.item():10.8f}:valid_loss: {valid_loss.item():10.8f}')

print(f'epoch: {i:3} train_loss: {train_loss.item():10.8f}:valid_loss: {valid_loss.item():10.8f}')

epoch:   1 loss: 1.47198522
epoch:   6 loss: 1.49908423
epoch:  11 loss: 1.47494829
epoch:  16 loss: 1.47419512
epoch:  21 loss: 1.47258222
epoch:  26 loss: 1.47168720
epoch:  31 loss: 1.47164392
epoch:  36 loss: 1.47173715
epoch:  41 loss: 1.47226799


KeyboardInterrupt: 

In [16]:
test_x ,datatime= feature_engine(origin_test.copy())


In [17]:
test_x

Unnamed: 0,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,hour,month,1345_mean,mean_deg_c
0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1,1.000000,0.833333,1014.8,8.0
1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0,0.916667,0.847222,1156.0,7.0
2,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8,0.833333,0.861111,1009.8,6.0
3,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0,0.750000,0.875000,1039.8,5.0
4,57.5,0.4650,1022.4,838.5,871.5,967.0,1142.3,0.666667,0.888889,1000.8,4.5
...,...,...,...,...,...,...,...,...,...,...,...
2242,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8,0.166667,0.972222,1224.2,12.1
2243,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0,0.083333,0.986111,1086.0,13.1
2244,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1,0.000000,1.000000,1023.8,14.1
2245,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5,0.083333,0.986111,893.4,15.1


# export result

In [23]:
result = pd.DataFrame([datatime,xgb_reg0_pred_test,xgb_reg1_pred_test,xgb_reg2_pred_test]).T
result.columns=['date_time','target_carbon_monoxide','target_benzene','target_nitrogen_oxides']

In [24]:
result.to_csv('sub_0809_04.csv',index = False)