In [4]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from torchvision.transforms import transforms
from torch import nn
from torch.autograd import Variable
import torch

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Load

In [5]:
origin_train = pd.read_csv("train.csv")
origin_test = pd.read_csv('test.csv')
print(list(origin_train.columns))

['date_time', 'deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']


#function

In [6]:
def df_to_x_y(df):
    df = df.reset_index(drop=True)
    y_train = df[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']]
    x_train = df.drop(columns= ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1)
    x_train, x_valid , y_train,y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=23)
    return x_train,y_train, x_valid, y_valid


In [7]:
none_train = origin_train.copy()
print(len(none_train))
for i in  range(1,len(none_train.columns)):
    target = none_train.iloc[:,i]
    scope = target.mean()+(target.std()*2)
    column_name = none_train.columns[i]
    none_train = none_train[none_train[column_name]<float(scope)]
    print(f'{none_train.columns[i]}:{len(none_train)}')
clean_df = none_train.copy()

7111
deg_C:6873
relative_humidity:6775
absolute_humidity:6638
sensor_1:6368
sensor_2:6200
sensor_3:5846
sensor_4:5706
sensor_5:5513
target_carbon_monoxide:5276
target_benzene:5082
target_nitrogen_oxides:4809


## data engine

In [8]:
def feature_engine(df):
    df = df.reset_index(drop = True)
    # datetime_processing
    datatime = df['date_time']
    df['hour'] = pd.to_datetime(df['date_time']).dt.hour
    df['hour'] = abs(df['hour']-12)/12

    df['month'] = pd.to_datetime(df['date_time']).dt.month
    df['month'] = abs(df['hour']-6)/6

    df = df.drop(['date_time'],axis = 1)
    # mean of senser
    df['1345_mean'] = round((df['sensor_1'] + df['sensor_3']+ df['sensor_4'] + df['sensor_5'])/4,1)

    # adjust mean_deg_c
    gap = 1

    mean_deg_c = df['deg_C'].copy()
    change = 0
    for i in range(1,len(mean_deg_c)-1):
        #increase too much
        if mean_deg_c[i-1] - mean_deg_c[i] >  gap:
            mean_deg_c[i] = mean_deg_c[i-1] - gap
            change +=1
        #decrease too much
        if mean_deg_c[i-1] - mean_deg_c[i] <  gap*-1:
            mean_deg_c[i] = mean_deg_c[i-1] + gap
            change +=1
    df['mean_deg_c'] = mean_deg_c
    df = df.drop(['deg_C'],axis = 1)
    df = pd.DataFrame(df)
    return df ,datatime

In [9]:
df = clean_df.copy()
df ,datatime = feature_engine(df)

x_train,y_train,x_valid, y_valid = df_to_x_y(df)
min_max_scaler = preprocessing.MinMaxScaler()
x_train = pd.DataFrame(min_max_scaler.fit_transform(x_train))
x_valid = pd.DataFrame(min_max_scaler.fit_transform(x_valid))
print(x_train)


            0         1         2         3         4         5         6   \
0     0.317872  0.562694  0.488849  0.723723  0.272981  0.772650  0.659181   
1     0.536153  0.722544  0.517427  0.588904  0.223217  0.802924  0.664698   
2     0.268759  0.630858  0.206097  0.213932  0.593609  0.527579  0.089051   
3     0.440655  0.487742  0.472263  0.731600  0.384349  0.642694  0.349444   
4     0.518417  0.251126  0.295192  0.138758  0.966311  0.360186  0.135442   
...        ...       ...       ...       ...       ...       ...       ...   
3842  0.368349  0.400737  0.242071  0.230587  0.876600  0.365281  0.199394   
3843  0.508868  0.528524  0.107363  0.260072  0.543748  0.382633  0.230943   
3844  0.609823  0.656954  0.371064  0.532185  0.313697  0.714391  0.531743   
3845  0.271487  0.260781  0.346632  0.580239  0.352969  0.539688  0.305385   
3846  0.237381  0.244222  0.273114  0.256696  0.851670  0.347781  0.069936   

            7         8         9         10  
0     0.833333  

In [10]:
def df_to_series(df,step = 12):
    for idx in range(step,len(df)):
        if idx == step:
            series_df = [df[idx-step:idx].values]
        else :
            series_df = np.concatenate([series_df,[df[idx-step:idx].values]],axis = 0)
    return series_df


In [11]:
x_train_series = torch.Tensor(df_to_series(x_train).astype('float32'))
y_train_series = torch.Tensor(y_train[12:].values)
x_valid_series = torch.Tensor(df_to_series(x_valid).astype('float32'))
y_valid_series = torch.Tensor(y_valid[12:].values)

In [12]:
print(x_train_series.shape)
print(y_train_series.shape)
print(x_valid_series.shape)
print(y_valid_series.shape)

torch.Size([3835, 12, 11])
torch.Size([3835, 3])
torch.Size([950, 12, 11])
torch.Size([950, 3])


# LSTM

In [13]:
class RNN(nn.Module):
    def __init__(self):
        super(RNN,self).__init__()
        self.rnn=nn.LSTM(
            input_size=11,      # picture length
            hidden_size=10,     # rnn hidden unit
            num_layers=5,       # rnn layers
            batch_first=True,   # (batch,time_step,input_size)
        )
        self.out=nn.Linear(10,1)

    def forward(self,x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n\h_c shape (n_layers, batch, hidden_size)
        r_out,(h_n,h_c)=self.rnn(x,None)
        out=self.out(r_out[:,-1,:])
        return out

rnn=RNN()
print(rnn)

RNN(
  (rnn): LSTM(11, 10, num_layers=5, batch_first=True)
  (out): Linear(in_features=10, out_features=1, bias=True)
)


In [40]:
class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()
        
    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))

In [41]:
model1 = RNN().cuda()
model2 = RNN().cuda()
model3 = RNN().cuda()

model1_hist = dict()
model2_hist = dict()
model3_hist = dict()

loss_function = RMSLELoss
optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.000001)
optimizer2 = torch.optim.Adam(model2.parameters(), lr=0.000001)
optimizer3 = torch.optim.Adam(model3.parameters(), lr=0.000001)

In [42]:
def train_model(model ,x_train ,y_train,x_valid,y_valid,best_loss, epochs ,file_path ,optimizer):
    hist_train = []
    hist_valid = []
    best_loss = best_loss
    for epoch in range(epochs):
        for seq, labels in zip(x_train.cuda(),y_train.cuda()):
            #train 
            optimizer.zero_grad()
            model.train()
            seq = torch.unsqueeze(seq,0)
            y_pred = model(seq.cuda())

            train_loss = loss_function(y_pred.cuda(), labels.cuda())
            
            train_loss.backward()
            optimizer.step()

            #valid 
        for seq, labels in zip(x_valid_series,y_valid_series):
            model.eval()
            seq = torch.unsqueeze(seq,0)
            y_pred = model(seq.cuda())
            valid_loss = loss_function(y_pred.cuda(), labels.cuda())

        if epoch%5 == 1:
            
            print(f'epoch: {epoch:3} train_loss: {train_loss.item():10.4f}| valid_loss: {valid_loss.item():10.8f}')
            if valid_loss < best_loss:
                best_loss = valid_loss
                torch.save(model,file_path)
                torch.save(model.state_dict(),file_path)
                print(f'Current best is {best_loss:10.4f}')
            hist_train.append(train_loss.item())
            hist_valid.append(valid_loss.item())
    print(f'epoch: {epoch:3} train_loss: {train_loss.item():10.4f}| valid_loss: {valid_loss.item():10.4f} \n ----------------train_end---------------------')

    return model ,hist_train ,hist_valid

In [43]:
model1 ,hist_train_1_2 ,hist_valid_1_2 = train_model(
    model1 ,
    x_train_series ,
    y_train_series[:,0],
    x_valid_series,
    y_valid_series[:,0],
    best_loss=0.25, 
    epochs=150 ,
    optimizer = optimizer1,
    file_path='model_1_0819_01.pt')

TypeError: __init__() takes 1 positional argument but 3 were given

In [None]:
model2 ,hist_train_2_2 ,hist_valid_2_2 = train_model(
    model2 ,
    x_train_series ,
    y_train_series[:,1] ,
    x_valid_series ,
    y_valid_series[:,1] ,
    best_loss = 0.25 ,
    epochs=150 ,
    optimizer = optimizer2,
    file_path = 'model_2_0819_01.pt')

epoch:   1 train_loss:     1.3287| valid_loss: 2.77278280
epoch:   6 train_loss:     1.1595| valid_loss: 2.63359308
epoch:  11 train_loss:     0.9065| valid_loss: 2.43281603
epoch:  16 train_loss:     0.6866| valid_loss: 2.26687169
epoch:  21 train_loss:     0.5307| valid_loss: 2.15497708
epoch:  26 train_loss:     0.4140| valid_loss: 2.07495379
epoch:  31 train_loss:     0.3190| valid_loss: 2.01244950
epoch:  36 train_loss:     0.2375| valid_loss: 1.96092772
epoch:  41 train_loss:     0.1653| valid_loss: 1.91699564
epoch:  46 train_loss:     0.1016| valid_loss: 1.87966490
epoch:  49 train_loss:     0.0680| valid_loss:     1.8606 
 ----------------train_end---------------------


In [None]:
model3 ,hist_train_3_2 ,hist_valid_3_2 = train_model(
    model3 ,
    x_train_series ,
    y_train_series[:,2] ,
    x_valid_series,y_valid_series[:,2] ,
    best_loss=0.25 ,
    epochs=150 ,
    optimizer = optimizer3,
    file_path='model_3_0819_01.pt')

epoch:   1 train_loss:     3.8901| valid_loss: 2.63512731
epoch:   6 train_loss:     3.7565| valid_loss: 2.52778625
epoch:  11 train_loss:     3.5567| valid_loss: 2.37237859
epoch:  16 train_loss:     3.3898| valid_loss: 2.24797201
epoch:  21 train_loss:     3.2652| valid_loss: 2.15885115
epoch:  26 train_loss:     3.1581| valid_loss: 2.08511591
epoch:  31 train_loss:     3.0616| valid_loss: 2.02123642
epoch:  36 train_loss:     2.9757| valid_loss: 1.96666873
epoch:  41 train_loss:     2.8993| valid_loss: 1.92000639
epoch:  46 train_loss:     2.8299| valid_loss: 1.87932634
epoch:  49 train_loss:     2.7910| valid_loss:     1.8573 
 ----------------train_end---------------------


In [None]:
for i in range(epochs):
    for seq, labels in zip(x_train_series.cuda(),y_train_series.cuda()):
        #train 
        model.train()
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size).cuda(),torch.zeros(1, 1, model.hidden_layer_size).cuda())
        y_pred = model(seq.cuda())

        train_loss = loss_function(y_pred.cuda(), labels.cuda())
        train_loss.backward()
        optimizer.step()
        #valid 
    for seq, labels in zip(x_valid_series,y_valid_series):
        model.eval()
        y_pred = model(seq.cuda())
        valid_loss = loss_function(y_pred.cuda(), labels.cuda())

    if i%5 == 1:
        print(f'epoch: {i:3} train_loss: {train_loss.item():10.4f}:valid_loss: {valid_loss.item():10.8f}')
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model,'0818_01_model.pt')
            torch.save(model.state_dict(),'0818_01_dict.pt')
            print(f'Current best is {best_loss:10.4f}')
print(f'epoch: {i:3} train_loss: {train_loss.item():10.4f}:valid_loss: {valid_loss.item():10.4f}')

In [None]:
before_x_test = clean_df.copy().tail(12).reset_index(drop = True)
before_x_test = before_x_test.drop(['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1)
test_df = pd.concat([before_x_test,origin_test],axis = 0)
test_df = test_df.reset_index(drop=True)
x_test , datatime = feature_engine(test_df)
x_test_series = torch.Tensor(df_to_series(x_test.astype('float32')))

In [None]:
pred_model = torch.load('0818_01_model.pt')
print( pred_model)

RNN(
  (lstm): LSTM(11, 32)
  (linear_h_h): Linear(in_features=32, out_features=32, bias=True)
  (linear_output): Linear(in_features=32, out_features=3, bias=True)
)


In [None]:
model.eval()
result = []
for sqe in x_test_series:
    pred = pred_model(sqe.cuda())
    print (pred) 

tensor([ 1.2948,  6.3110, 94.1298], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 1.2948,  6.3109, 94.1275], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 1.2948,  6.3108, 94.1272], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 1.2948,  6.3108, 94.1267], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 1.1414,  5.5043, 81.3213], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 1.0235,  4.6397, 66.0414], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.9278,  4.1882, 59.7230], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7692,  3.3183, 44.0627], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7776,  3.3723, 44.8081], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7783,  3.3762, 44.8624], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7782,  3.3761, 44.8608], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7783,  3.3762, 44.8621], device='cuda:0', grad_fn=<SelectBackward>)
tensor([ 0.7783,  3.3762, 44.8624], device='cuda:0',

# export result

In [None]:
result = pd.DataFrame([datatime,xgb_reg0_pred_test,xgb_reg1_pred_test,xgb_reg2_pred_test]).T
result.columns=['date_time','target_carbon_monoxide','target_benzene','target_nitrogen_oxides']

In [None]:
result.to_csv('sub_0809_04.csv',index = False)