# 1. 라이브러리 및 데이터

In [5]:
from platform import python_version
import pandas as pd
import numpy as np 

print(f'python {python_version()}')
print(f'pandas {pd.__version__}')
print(f'numpy {np.__version__}')

python 3.6.10
pandas 1.0.3
numpy 1.18.4


In [3]:
# 데이터 불러오기
train_features = pd.read_csv('./original/train_features.csv')
train_target = pd.read_csv('./original/train_target.csv', index_col = 'id')
test_features = pd.read_csv('./original/test_features.csv')

In [6]:
# 데이터 형태 확인
print(f'train_features {train_features.shape}')
print(f'train_target {train_target.shape}')
print(f'test_features {test_features.shape}')

train_features (1050000, 6)
train_target (2800, 4)
test_features (262500, 6)


In [12]:
display(train_features.head())
display(train_target.head())

Unnamed: 0,id,Time,S1,S2,S3,S4
0,0,0.0,0.0,0.0,0.0,0.0
1,0,4e-06,0.0,0.0,0.0,0.0
2,0,8e-06,0.0,0.0,0.0,0.0
3,0,1.2e-05,0.0,0.0,0.0,0.0
4,0,1.6e-05,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,X,Y,M,V
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,-400.0,50.0,0.4
1,400.0,0.0,100.0,1.0
2,-300.0,-200.0,25.0,0.4
3,200.0,-100.0,150.0,0.4
4,-300.0,-100.0,150.0,0.4


# 2. 데이터 전처리

In [13]:
def preprocessing_KAERI(data) :
    '''
    data: train_features.csv or test_features.csv
    
    return: Random Forest 모델 입력용 데이터
    '''
    
    # 충돌체 별로 0.000116 초 까지의 가속도 데이터만 활용해보기 
    _data = data.groupby('id').head(30)
    
    # string 형태로 변환
    _data['Time'] = _data['Time'].astype('str')
    
    # Random Forest 모델에 입력 할 수 있는 1차원 형태로 가속도 데이터 변환
    _data = _data.pivot_table(index = 'id', columns = 'Time', values = ['S1', 'S2', 'S3', 'S4'])
    
    # column 명 변환
    _data.columns = ['_'.join(col) for col in _data.columns.values]
    
    return _data

In [32]:
train_features = preprocessing_KAERI(train_features)
test_features = preprocessing_KAERI(test_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [14]:
print(f'train_features {train_features.shape}')
print(f'test_features {test_features.shape}')

train_features (1050000, 6)
test_features (262500, 6)


# 3. 탐색적 자료분석

# 4. 변수 선택 및 모델 구축

In [24]:
import sklearn
from sklearn.ensemble import RandomForestRegressor
print(f'sklearn {sklearn.__version__}')

sklearn 0.23.1


In [30]:
# 재생산성을 위한 시드 고정 및 multi cpu 활용을 위한 n_jobs = -1 설정
model = RandomForestRegressor(n_jobs=-1, random_state=0)

# 5. 모델 학습 및 검증

In [33]:
# 모델 학습 (fit)
model.fit(train_features, train_target)

RandomForestRegressor(n_jobs=-1, random_state=0)

In [34]:
# 예측 (predict)
y_pred = model.predict(test_features)

In [35]:
y_pred

array([[-2.5200e+02, -4.8000e+01,  7.6250e+01,  4.7400e-01],
       [ 3.0300e+02, -3.0500e+02,  4.0750e+01,  5.3800e-01],
       [-3.0500e+02,  1.2900e+02,  1.1075e+02,  3.6400e-01],
       ...,
       [ 3.9000e+02, -3.3400e+02,  1.1625e+02,  3.7600e-01],
       [ 1.9700e+02, -3.6300e+02,  6.1750e+01,  4.2800e-01],
       [ 6.5000e+01,  1.9900e+02,  1.2225e+02,  4.3200e-01]])

In [37]:
# 답안지 불러오기
submit = pd.read_csv('./original/sample_submission.csv')

submit.head()

Unnamed: 0,id,X,Y,M,V
0,2800,0,0,0,0
1,2801,0,0,0,0
2,2802,0,0,0,0
3,2803,0,0,0,0
4,2804,0,0,0,0


In [38]:
# 답안지에 옮겨 적기
for i in range(4):
    submit.iloc[:,i+1] = y_pred[:,i]

In [39]:
submit.head()

Unnamed: 0,id,X,Y,M,V
0,2800,-252.0,-48.0,76.25,0.474
1,2801,303.0,-305.0,40.75,0.538
2,2802,-305.0,129.0,110.75,0.364
3,2803,108.0,77.0,106.75,0.434
4,2804,-147.0,159.0,90.75,0.474


In [40]:
submit.to_csv('./result/Dacon_baseline.csv', index = False)

# 6. 결과 및 결언