# Import

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import glob

from sklearn.preprocessing import MinMaxScaler

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import SGD, Adam, RMSprop
from keras.losses import mse # mean squared error 평가 지표

# 데이터셋 불러오기

In [2]:
# 모든 train 파일 불러오기

all_input_list = glob.glob('data/train_input/*.csv')
all_input_list.sort()

all_target_list = glob.glob('data/train_target/*.csv')
all_target_list.sort()

all_input_list

['data/train_input/CASE_01.csv',
 'data/train_input/CASE_02.csv',
 'data/train_input/CASE_03.csv',
 'data/train_input/CASE_04.csv',
 'data/train_input/CASE_05.csv',
 'data/train_input/CASE_06.csv',
 'data/train_input/CASE_07.csv',
 'data/train_input/CASE_08.csv',
 'data/train_input/CASE_09.csv',
 'data/train_input/CASE_10.csv',
 'data/train_input/CASE_11.csv',
 'data/train_input/CASE_12.csv',
 'data/train_input/CASE_13.csv',
 'data/train_input/CASE_14.csv',
 'data/train_input/CASE_15.csv',
 'data/train_input/CASE_16.csv',
 'data/train_input/CASE_17.csv',
 'data/train_input/CASE_18.csv',
 'data/train_input/CASE_19.csv',
 'data/train_input/CASE_20.csv',
 'data/train_input/CASE_21.csv',
 'data/train_input/CASE_22.csv',
 'data/train_input/CASE_23.csv',
 'data/train_input/CASE_24.csv',
 'data/train_input/CASE_25.csv',
 'data/train_input/CASE_26.csv',
 'data/train_input/CASE_27.csv',
 'data/train_input/CASE_28.csv']

In [3]:
# 데이터 확인

## 총 파일 개수
print('=== 총 파일 개수 ===')
print(f'train 파일 개수 : {len(all_input_list)}')
print(f'target 파일 개수 : {len(all_target_list)}\n')


## 파일별 row 개수
### input 파일
print('=== input row 개수 ===')
for i in range(len(all_input_list)):
    print(f'{i+1}일차 input row 개수 : {pd.read_csv(all_input_list[i])["DAT"].count()}')
print('\n')
### target 파일
print('=== target row 개수 ===')
for i in range(len(all_target_list)):
    print(f'{i+1}일차 target row 개수 : {pd.read_csv(all_target_list[i])["DAT"].count()}')
print('\n')


## columns 확인
### input columns 확인
print('=== input columns 확인 ===')
for i in range(len(all_input_list)):
    if i == 0:
        print(f'input column : {pd.read_csv(all_input_list[i]).columns}')
    else:
        if list(pd.read_csv(all_input_list[0]).columns) == list(pd.read_csv(all_input_list[i]).columns):
            print(f'{i+1}일차 동일')
        else:
            print(f'{i+1}일차 상이함. 확인 필요')
print('\n')
### target columns 확인
print('=== target columns 확인 ===')
for i in range(len(all_target_list)):
    if i == 0:
        print(f'input column : {pd.read_csv(all_target_list[i]).columns}')
    else:
        if list(pd.read_csv(all_target_list[0]).columns) == list(pd.read_csv(all_target_list[i]).columns):
            print(f'{i+1}일차 동일')
        else:
            print(f'{i+1}일차 상이함. 확인 필요')
print('\n')

=== 총 파일 개수 ===
train 파일 개수 : 28
target 파일 개수 : 28

=== input row 개수 ===
1일차 input row 개수 : 672
2일차 input row 개수 : 672
3일차 input row 개수 : 672
4일차 input row 개수 : 672
5일차 input row 개수 : 672
6일차 input row 개수 : 672
7일차 input row 개수 : 672
8일차 input row 개수 : 672
9일차 input row 개수 : 672
10일차 input row 개수 : 672
11일차 input row 개수 : 672
12일차 input row 개수 : 672
13일차 input row 개수 : 672
14일차 input row 개수 : 672
15일차 input row 개수 : 672
16일차 input row 개수 : 672
17일차 input row 개수 : 672
18일차 input row 개수 : 672
19일차 input row 개수 : 672
20일차 input row 개수 : 672
21일차 input row 개수 : 672
22일차 input row 개수 : 672
23일차 input row 개수 : 672
24일차 input row 개수 : 672
25일차 input row 개수 : 672
26일차 input row 개수 : 672
27일차 input row 개수 : 672
28일차 input row 개수 : 672


=== target row 개수 ===
1일차 target row 개수 : 28
2일차 target row 개수 : 28
3일차 target row 개수 : 28
4일차 target row 개수 : 28
5일차 target row 개수 : 28
6일차 target row 개수 : 28
7일차 target row 개수 : 28
8일차 target row 개수 : 28
9일차 target row 개수 : 28
10일차 target row 개수 : 28
11일차 targ

In [4]:
# train 파일 하나로 통합

## 초기 데이터프레임 만들기 (1일차 파일)
a = all_input_list.copy()
input_all = pd.read_csv(a[0])
a.pop(0)

b = all_target_list.copy()
target_all = pd.read_csv(b[0])
b.pop(0)


## 통합 데이터프레임 만들기 : input
for i in range(len(a)):
    input_all = pd.concat([input_all, pd.read_csv(a[i])])

    
## 통합 데이터프레임 만들기 : target
for i in range(len(b)):
    target_all = pd.concat([target_all, pd.read_csv(b[i])])

# EDA

In [5]:
# input과 target info 확인

print(input_all.info(), target_all.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18816 entries, 0 to 671
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DAT       18816 non-null  int64  
 1   obs_time  18816 non-null  object 
 2   내부온도관측치   18816 non-null  float64
 3   내부습도관측치   18816 non-null  float64
 4   co2관측치    18816 non-null  float64
 5   ec관측치     18816 non-null  float64
 6   시간당분무량    18816 non-null  float64
 7   일간누적분무량   18816 non-null  float64
 8   시간당백색광량   18816 non-null  float64
 9   일간누적백색광량  18816 non-null  float64
 10  시간당적색광량   18816 non-null  float64
 11  일간누적적색광량  18816 non-null  float64
 12  시간당청색광량   18816 non-null  float64
 13  일간누적청색광량  18816 non-null  float64
 14  시간당총광량    18816 non-null  float64
 15  일간누적총광량   18816 non-null  float64
dtypes: float64(14), int64(1), object(1)
memory usage: 2.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 784 entries, 0 to 27
Data columns (total 2 columns):
 #   Column              

In [6]:
# obs_time 을 int 형으로 변환

input_all['obs_time'] = input_all['obs_time'].str.replace(':00:00.','').str.replace(':59:59.','.9999').str.replace(':00','')
input_all['obs_time'] = input_all['obs_time'].astype(float)
input_all['obs_time'] = [round(x) for x in input_all['obs_time']]

  input_all['obs_time'] = input_all['obs_time'].str.replace(':00:00.','').str.replace(':59:59.','.9999').str.replace(':00','')


In [7]:
# input 전체 describe

input_all.describe()

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,시간당적색광량,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량
count,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0,18816.0
mean,13.5,11.5,26.027719,63.93509,498.860695,1.507566,523.627918,6365.916673,6315.56293,72910.240149,1571.862912,18201.358998,1241.905981,14139.184959,9129.331824,105250.784106
std,8.077962,6.922371,3.948988,11.657905,120.421248,1.04201,544.323281,6903.680044,8188.067575,86040.447105,2937.252667,34386.384982,2474.426437,27743.227265,9175.427411,97397.224913
min,0.0,0.0,0.0,0.0,60.4,0.0,-653.26,0.0,-138371.2461,0.0,-8793.6072,0.0,-18570.8575,0.0,-165735.7108,0.0
25%,6.75,5.75,23.97125,57.494583,420.245833,0.929581,0.0,1234.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7280.96285
50%,13.5,11.5,26.291667,62.118333,473.983333,1.308517,252.0,3360.485,0.0,23205.75,0.0,0.0,0.0,0.0,9242.35,92172.36
75%,20.25,17.25,28.106667,73.157166,551.105952,2.291322,769.0,9990.57,18236.6254,145422.7,1494.6981,17529.2964,1306.461,18484.7,18255.19,177491.49945
max,27.0,23.0,42.643334,93.350002,1714.85,5.462324,3144.13,42855.23,18564.6,292014.9698,9928.8,200561.76,9399.0,190016.45,37892.4,559488.881


In [8]:
# input 일자별 describe

## 통계 항목 리스트 생성
stat_list = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']

## 일자별 통계값
for i in stat_list:
    temp = []
    for j in input_all['DAT'].unique():
        desc = input_all[input_all['DAT'] == j].describe()
        desc['DAT'] = j
        temp.append(np.array(desc.loc[i]))
    globals()[f'{i}_list'] = pd.DataFrame(temp, columns = input_all.columns)

## mean_list, std_list 등으로 뽑아볼 수 있음
mean_list

Unnamed: 0,DAT,obs_time,내부온도관측치,내부습도관측치,co2관측치,ec관측치,시간당분무량,일간누적분무량,시간당백색광량,일간누적백색광량,시간당적색광량,일간누적적색광량,시간당청색광량,일간누적청색광량,시간당총광량,일간누적총광량
0,0.0,11.5,25.781004,54.91236,534.71848,1.27397,430.609479,5194.759955,6380.927437,73232.291462,1283.393438,13888.711775,801.58178,8039.732565,8465.902654,95160.735801
1,1.0,11.5,26.529081,55.476487,524.807603,1.34969,463.040372,5580.083393,6700.812255,76016.976857,1599.930575,17718.68235,931.843714,9834.615211,9232.586544,103570.274418
2,2.0,11.5,26.913901,53.793927,521.02143,1.387045,481.23628,5782.935387,6434.134907,73101.450629,1644.3393,18563.704,1048.321847,11915.652183,9126.796054,103580.806812
3,3.0,11.5,26.17578,55.261367,518.505765,1.404867,461.618125,5542.942143,6532.731718,74304.166896,1645.853738,18900.8892,1035.088186,11858.63811,9213.673642,105063.694206
4,4.0,11.5,26.05291,57.392406,504.098784,1.428326,471.997857,5754.999717,6674.259168,76458.632006,1586.80545,17880.781338,1010.81676,11556.816452,9271.881378,105896.229796
5,5.0,11.5,26.46065,58.141788,503.541781,1.425469,464.295045,5778.467083,6781.001013,78299.888557,1637.092162,18927.19855,866.1696,10910.858988,9284.262776,108137.946095
6,6.0,11.5,26.365303,58.045569,521.691202,1.425745,444.630878,5401.074955,6653.816007,76324.185994,1538.380387,16560.443012,1007.585854,11177.681493,9199.782249,104062.3105
7,7.0,11.5,26.222896,58.472996,525.249251,1.474636,482.351503,5848.971801,6780.660294,78198.432465,1690.48655,19234.417588,1050.342912,11854.071483,9521.489756,109286.921536
8,8.0,11.5,26.136378,59.792714,516.875023,1.540175,491.712902,5949.714613,6466.830151,74962.749764,1702.32625,19395.071087,1062.753695,12042.578312,9231.910096,106400.399164
9,9.0,11.5,25.966916,60.785721,524.486606,1.557607,513.493363,6245.695357,6570.675882,76592.691256,1658.102212,18964.0671,1074.490789,12205.319479,9303.268884,107762.077835


# 전처리

## 이상치 처리 : 추가 예정

## 스케일링

In [24]:
# input -> reshape
input_all_scaled = np.array(MinMaxScaler().fit_transform(input_all))
input_all_scaled = input_all_scaled.reshape(784, 24, 16)

# target -> minmax 스케일링
target_all_scaled = pd.DataFrame(MinMaxScaler().fit_transform(target_all),
                                   columns = target_all.columns)['predicted_weight_g']

## input 모양 설정

In [10]:
input_all_scaled

array([[[0.        , 0.        , 0.59329321, ..., 0.        ,
         0.81391371, 0.        ],
        [0.        , 0.04347826, 0.6022127 , ..., 0.        ,
         0.81391371, 0.        ],
        [0.        , 0.08695652, 0.59266786, ..., 0.        ,
         0.81391371, 0.        ],
        ...,
        [0.        , 0.91304348, 0.61635269, ..., 0.        ,
         0.81391371, 0.48146854],
        [0.        , 0.95652174, 0.60908308, ..., 0.        ,
         0.81391371, 0.48146854],
        [0.        , 1.        , 0.60673804, ..., 0.        ,
         0.81391371, 0.48146854]],

       [[0.03703704, 0.        , 0.60517469, ..., 0.        ,
         0.81391371, 0.        ],
        [0.03703704, 0.04347826, 0.60926368, ..., 0.        ,
         0.81391371, 0.        ],
        [0.03703704, 0.08695652, 0.59993747, ..., 0.        ,
         0.81391371, 0.        ],
        ...,
        [0.03703704, 0.91304348, 0.62389588, ..., 0.        ,
         0.81391371, 0.4826421 ],
        [0.0

# 딥러닝 모델 제작

In [25]:
# 모델 생성
model = Sequential()
# model.add(Dense(784, activation='relu'))
model.add(Dense(4, input_shape=(784,24), activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

#컴파일
model.compile(optimizer = Adam(),
              loss = 'mse',
              metrics = ['mae'])

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 18816)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 37634     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 3         
Total params: 37,637
Trainable params: 37,637
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 모델 학습
model.fit(input_all_scaled, target_all_scaled,
          epochs = 1, batch_size = 7)