In [103]:
import pandas as pd
import numpy as np
import xgboost as xgb
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

from torchvision.transforms import transforms
from torch import nn
from torch.autograd import Variable
import torch

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Data Load

In [104]:
origin_train = pd.read_csv("train.csv")
origin_test = pd.read_csv('test.csv')
print(list(origin_train.columns))

['date_time', 'deg_C', 'relative_humidity', 'absolute_humidity', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5', 'target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']


#function

In [105]:
def df_to_x_y(df):
    df = df.reset_index(drop=True)
    y_train = df[['target_carbon_monoxide','target_benzene','target_nitrogen_oxides']]
    x_train = df.drop(columns= ['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1)
    x_train, x_valid , y_train,y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=23)
    return x_train,y_train, x_valid, y_valid


In [206]:
none_train = origin_train.copy()
print(len(none_train))
for i in  range(1,len(none_train.columns)):
    target = none_train.iloc[:,i]
    scope = target.mean()+(target.std()*2)
    column_name = none_train.columns[i]
    none_train = none_train[none_train[column_name]<float(scope)]
    print(f'{none_train.columns[i]}:{len(none_train)}')
clean_df = none_train.copy()

7111
deg_C:6873
relative_humidity:6775
absolute_humidity:6638
sensor_1:6368
sensor_2:6200
sensor_3:5846
sensor_4:5706
sensor_5:5513
target_carbon_monoxide:5276
target_benzene:5082
target_nitrogen_oxides:4809


## data engine

In [207]:
def feature_engine(df):
    df = df.reset_index(drop = True)
    # datetime_processing
    datatime = df['date_time']
    df['hour'] = pd.to_datetime(df['date_time']).dt.hour
    df['hour'] = abs(df['hour']-12)/12

    df['month'] = pd.to_datetime(df['date_time']).dt.month
    df['month'] = abs(df['hour']-6)/6

    df = df.drop(['date_time'],axis = 1)
    # mean of senser
    df['1345_mean'] = round((df['sensor_1'] + df['sensor_3']+ df['sensor_4'] + df['sensor_5'])/4,1)

    # adjust mean_deg_c
    gap = 1

    mean_deg_c = df['deg_C'].copy()
    change = 0
    for i in range(1,len(mean_deg_c)-1):
        #increase too much
        if mean_deg_c[i-1] - mean_deg_c[i] >  gap:
            mean_deg_c[i] = mean_deg_c[i-1] - gap
            change +=1
        #decrease too much
        if mean_deg_c[i-1] - mean_deg_c[i] <  gap*-1:
            mean_deg_c[i] = mean_deg_c[i-1] + gap
            change +=1
    df['mean_deg_c'] = mean_deg_c
    df = df.drop(['deg_C'],axis = 1)
    df = pd.DataFrame(df)
    return df ,datatime

In [240]:
df = clean_df.copy()
df ,datatime = feature_engine(df)
x_train,y_train,x_valid, y_valid = df_to_x_y(df)
print(x_train.head())

      relative_humidity  absolute_humidity  sensor_1  sensor_2  sensor_3  \
1737               32.2             1.1605    1071.4    1066.8     775.7   
2542               48.2             1.4337    1096.9     947.0     724.0   
2937               28.6             1.2770     819.1     613.8    1108.8   
952                41.2             1.0324    1056.6    1073.8     891.4   
337                46.9             0.6280     898.6     547.0    1496.0   

      sensor_4  sensor_5      hour     month  1345_mean  mean_deg_c  
1737    1734.7    1160.9  0.833333  0.861111     1185.7        25.0  
2542    1775.7    1168.0  0.416667  0.930556     1191.2        24.8  
2937    1402.8     427.2  0.083333  0.986111      939.5        30.0  
952     1558.7     762.3  0.500000  0.916667     1067.2        21.9  
337     1176.1     486.9  0.666667  0.888889     1014.4        10.7  


In [231]:
def df_to_series(df,step = 12):
    for idx in range(step,len(df)):
        if idx == step:
            series_df = [df[idx-step:idx].values]
        else :
            series_df = np.concatenate([series_df,[df[idx-step:idx].values]],axis = 0)
    return series_df


In [232]:
x_train_series = torch.Tensor(df_to_series(x_train).astype('float32'))
y_train_series = torch.Tensor(y_train[12:].values)
x_valid_series = torch.Tensor(df_to_series(x_valid).astype('float32'))
y_valid_series = torch.Tensor(y_valid[12:].values)

In [233]:
print(x_train_series.shape)
print(y_train_series.shape)
print(x_valid_series.shape)
print(y_valid_series.shape)

torch.Size([3835, 12, 11])
torch.Size([3835, 3])
torch.Size([950, 12, 11])
torch.Size([950, 3])


# LSTM

In [288]:
class RNN(nn.Module):
    def __init__(self, input_size=11, hidden_layer_size=100, output_size=3):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear1 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.linear2 = nn.Linear(hidden_layer_size, output_size)
        self.bn1 = nn.BatchNorm2d(1024)
        
        self.hidden_cell = (torch.zeros(1,1,self.hidden_layer_size).cuda(),
                            torch.zeros(1,1,self.hidden_layer_size).cuda())

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1).cuda(), self.hidden_cell)
        x = self.linear1(lstm_out.view(len(input_seq), -1))
        predictions = self.linear2(x)
        return predictions[-1]

class RMSLELoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        return torch.sqrt(self.mse(torch.log(pred + 1), torch.log(actual + 1)))


In [289]:
model = RNN().cuda()
loss_function = RMSLELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00001)
epochs = 150
print(model)

RNN(
  (lstm): LSTM(11, 100)
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (linear2): Linear(in_features=100, out_features=3, bias=True)
  (bn1): BatchNorm2d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)


In [291]:
for i in range(epochs):
    best_loss = 0.3
    train_loss_sum = 0.0
    valid_loss_sum = 0.0
    for seq, labels in zip(x_train_series.cuda(),y_train_series.cuda()):
        #train 
        model.train()
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(1, 1, model.hidden_layer_size).cuda(),torch.zeros(1, 1, model.hidden_layer_size).cuda())
        y_pred = model(seq.cuda())

        train_loss = loss_function(y_pred.cuda(), labels.cuda())
        train_loss.backward()
        optimizer.step()
        #valid 
    for seq, labels in zip(x_valid_series,y_valid_series):
        model.eval()
        y_pred = model(seq.cuda())
        valid_loss = loss_function(y_pred.cuda(), labels.cuda())

    if i%5 == 1:
        print(f'epoch: {i:3} train_loss: {train_loss.item():10.8f}:valid_loss: {valid_loss.item():10.8f}')

    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model,'/0816_01_model.pt')
        torch.save(model.state_dict(),'/0816_01_dict.pt')
        print(f'Current best is {best_loss:10.8f}')
print(f'epoch: {i:3} train_loss: {train_loss.item():10.8f}:valid_loss: {valid_loss.item():10.8f}')

epoch:   1 train_loss: 0.33404896:valid_loss: 0.31100070
Current best is 0.20284870
Current best is 0.15981987
Current best is 0.15374243
Current best is 0.16280587
epoch:   6 train_loss: 0.48448703:valid_loss: 0.17505269
Current best is 0.17505269
Current best is 0.18572873
Current best is 0.19414829
Current best is 0.20051369
Current best is 0.20525654
epoch:  11 train_loss: 0.52740455:valid_loss: 0.20876364
Current best is 0.20876364
Current best is 0.21133794
Current best is 0.21322104
Current best is 0.21460883
Current best is 0.21561192
epoch:  16 train_loss: 0.53658223:valid_loss: 0.21632990
Current best is 0.21632990
Current best is 0.21684374
Current best is 0.21720785
Current best is 0.21747276
Current best is 0.21764776
epoch:  21 train_loss: 0.53888059:valid_loss: 0.21775483
Current best is 0.21775483
Current best is 0.21781777
Current best is 0.21794477
Current best is 0.21802364
Current best is 0.21803035
epoch:  26 train_loss: 0.53986382:valid_loss: 0.21801166
Current be

In [381]:
before_x_test = clean_df.copy().tail(12).reset_index(drop = True)
before_x_test = before_x_test.drop(['target_carbon_monoxide','target_benzene','target_nitrogen_oxides'],axis = 1)
test_df = pd.concat([before_x_test,origin_test],axis = 0)
test_df = test_df.reset_index(drop=True)
x_test , datatime = feature_engine(test_df)
x_test_series = torch.Tensor(df_to_series(x_test.astype('float32')))

In [382]:
model.eval()
result = []
for sqe in x_test_series:
    pred = model(sqe.cuda())

    break
pred

tensor([nan, nan, nan], device='cuda:0', grad_fn=<SelectBackward>)

# export result

In [None]:
result = pd.DataFrame([datatime,xgb_reg0_pred_test,xgb_reg1_pred_test,xgb_reg2_pred_test]).T
result.columns=['date_time','target_carbon_monoxide','target_benzene','target_nitrogen_oxides']

In [None]:
result.to_csv('sub_0809_04.csv',index = False)