https://www.kaggle.com/vaibhavsxn/time-series-multivariate-lstm

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# load_dtypes()
# Loads a CSV into a DataFrame with column dtypes already downcast at read time.

def load_dtypes(file_path):
    """Read a CSV into a DataFrame using a compact dtype for every column.

    The file is scanned one column at a time (so only a single column is
    held in memory during the scan) to choose a dtype, and is then read once
    more with the chosen dtypes applied.

    Dtype selection:
      * integer columns -> the narrowest of int8/uint8/int16/uint16/int32/
        uint32/int64/uint64 whose range contains the column's min and max
        (bounds are inclusive);
      * float columns   -> 'float32' when min and max fit the float32 range,
        otherwise 'float64'. Only the range is checked, so values may lose
        precision when stored as float32;
      * object columns  -> 'category' when unique values make up at most 70%
        of the rows, otherwise 'object';
      * any other dtype (e.g. bool) is left to pandas' own inference.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; it is printed before loading.

    Returns
    -------
    pandas.DataFrame
        The file contents with the selected dtypes.
    """
    print(file_path)

    # Narrowest first; at each width the signed type is tried before the unsigned one.
    int_candidates = (np.int8, np.uint8, np.int16, np.uint16,
                      np.int32, np.uint32, np.int64, np.uint64)
    float32_info = np.finfo(np.float32)

    header = pd.read_csv(file_path, nrows=0)  # column names only, no data rows
    col_dtypes = {}
    for col in header.columns:
        series = pd.read_csv(file_path, usecols=[col])[col]
        dtype = series.dtype

        if pd.api.types.is_integer_dtype(dtype):
            # Plain Python ints avoid mixed signed/unsigned NumPy comparisons.
            c_min, c_max = int(series.min()), int(series.max())
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    col_dtypes[col] = np.dtype(candidate).name
                    break

        elif pd.api.types.is_float_dtype(dtype):
            c_min, c_max = series.min(), series.max()
            # NaN min/max (all-missing column) compares False and falls back to float64.
            fits_float32 = float32_info.min <= c_min and c_max <= float32_info.max
            col_dtypes[col] = 'float32' if fits_float32 else 'float64'

        elif pd.api.types.is_object_dtype(dtype):
            n_rows = len(series)
            # A column without rows has no cardinality to measure; keep it as object.
            unique_ratio = series.nunique() / n_rows if n_rows else 1.0
            col_dtypes[col] = 'object' if unique_ratio > 0.7 else 'category'

        # Columns of any other dtype get no entry, so the final read infers
        # them instead of inheriting the dtype chosen for a previous column.

    return pd.read_csv(file_path, dtype=col_dtypes)

In [2]:
import os

# Root folder of the solar-panel dataset. Set the SOLARPANEL_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset, the original author's local folder is used.
DATA_DIR = os.environ.get('SOLARPANEL_DATA_DIR', r'C:\Users\Wyatt\wyatt37\Data\solarpanel')

TRAIN_PATH = os.path.join(DATA_DIR, 'train', 'train.csv')
TEST_PATH = os.path.join(DATA_DIR, 'test')  # presumably a folder of test files - confirm
SUBMISSION_PATH = os.path.join(DATA_DIR, 'sample_submission.csv')

In [4]:
# Load the training CSV with downcast dtypes, then inspect the resulting
# column dtypes and memory usage.
train = load_dtypes(TRAIN_PATH)
train.info()

C:\Users\Wyatt\wyatt37\Data\solarpanel\train\train.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52560 entries, 0 to 52559
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Day     52560 non-null  int16  
 1   Hour    52560 non-null  int8   
 2   Minute  52560 non-null  int8   
 3   DHI     52560 non-null  int16  
 4   DNI     52560 non-null  int16  
 5   WS      52560 non-null  float32
 6   RH      52560 non-null  float32
 7   T       52560 non-null  int8   
 8   TARGET  52560 non-null  float32
dtypes: float32(3), int16(3), int8(3)
memory usage: 1.1 MB


In [5]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.080002,-12,0.0
1,0,0,30,0,0,1.5,69.059998,-12,0.0
2,0,1,0,0,0,1.6,71.779999,-12,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0
4,0,2,0,0,0,1.6,75.199997,-12,0.0


In [14]:
# Keep only the first 4800 rows for this experiment.
train = train.iloc[:4800]

In [15]:
# Display the truncated frame to confirm the new row range.
train

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.080002,-12,0.0
1,0,0,30,0,0,1.5,69.059998,-12,0.0
2,0,1,0,0,0,1.6,71.779999,-12,0.0
3,0,1,30,0,0,1.6,71.750000,-12,0.0
4,0,2,0,0,0,1.6,75.199997,-12,0.0
...,...,...,...,...,...,...,...,...,...
4795,99,21,30,0,0,1.5,68.779999,3,0.0
4796,99,22,0,0,0,1.6,64.470001,3,0.0
4797,99,22,30,0,0,1.6,69.290001,2,0.0
4798,99,23,0,0,0,1.7,64.849998,2,0.0


In [19]:
# 480 rows correspond to 10 days of data: hold out the last 480 rows as y
# and keep everything before them as X.
X = train.iloc[:-480]
y = train.iloc[-480:]

In [21]:
# Scale the features before feeding them to the LSTM.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [26]:
# Fit the scaler on the training window only and reuse those statistics for
# the hold-out window. Re-fitting on y would put X and y on different scales
# and leave `scaler` holding the hold-out min/max for any later
# inverse_transform. Hold-out values may therefore fall outside [0, 1].
scaled_X = scaler.fit_transform(X)
scaled_y = scaler.transform(y)

In [28]:
# Use Keras' TimeseriesGenerator to build the windowed samples.

from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [30]:
# Build the windowed sample generator over the scaled training data.
length = 96      # number of consecutive timesteps in each input window
batch_size = 1   # number of windows per batch

# The same scaled array is passed as both inputs and targets.
generator = TimeseriesGenerator(
    data=scaled_X,
    targets=scaled_X,
    length=length,
    batch_size=batch_size,
)

In [31]:
# The generator holds `length` fewer samples than there are rows.

len(scaled_X), len(generator)

(4320, 4224)