# 로켓 발사 모델 구축
로켓 발사를 결정하는 의사결정 나무(Decision Tree) 모델

In [194]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt

In [195]:
# 파일 불러오기
rocketLaunch_df = pd.read_csv(r"data/RocketLaunchDataCSV.csv") 

## 데이터 전처리:
 ‘Launched?’ 열을 타겟 변수로 설정하고,
  온도, 강수량, 풍향, 풍속 등의 날씨 데이터를 특성 변수로 사용합니다. 
  이때, 범주형 데이터는 숫자로 변환해야 합니다.
   또한, 결측치는 적절한 방법으로 처리해야 합니다.

In [196]:
rocketLaunch_df.info()
# 결측치 처리해주자 - 어떻게 처리 할까

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          60 non-null     object 
 1   Date                          300 non-null    object 
 2   Time (East Coast)             59 non-null     object 
 3   Location                      300 non-null    object 
 4   Crewed or Uncrewed            60 non-null     object 
 5   Launched?                     60 non-null     object 
 6   High Temp                     299 non-null    float64
 7   Low Temp                      299 non-null    float64
 8   Ave Temp                      299 non-null    float64
 9   Temp at Launch Time           59 non-null     float64
 10  Hist High Temp                299 non-null    float64
 11  Hist Low Temp                 299 non-null    float64
 12  Hist Ave Temp                 299 non-null    float64
 13  Perci

In [197]:
print(rocketLaunch_df.columns)

Index(['Name', 'Date', 'Time (East Coast)', 'Location', 'Crewed or Uncrewed',
       'Launched?', 'High Temp', 'Low Temp', 'Ave Temp', 'Temp at Launch Time',
       'Hist High Temp', 'Hist Low Temp', 'Hist Ave Temp',
       'Percipitation at Launch Time', 'Hist Ave Percipitation',
       'Wind Direction', 'Max Wind Speed', 'Visibility',
       'Wind Speed at Launch Time', 'Hist Ave Max Wind Speed',
       'Hist Ave Visibility', 'Sea Level Pressure',
       'Hist Ave Sea Level Pressure', 'Day Length', 'Condition', 'Notes'],
      dtype='object')


In [198]:
# rocketLaunch_df 에서 결측치 많은 열들 삭제
rocketLaunch_df.drop(['Name',  'Date'   ,  'Location',                
                      'Time (East Coast)',         
                      'Crewed or Uncrewed',         
                      'Temp at Launch Time',      
                      'Wind Speed at Launch Time',  
                      'Hist Ave Max Wind Speed',  
                      'Hist Ave Visibility',        
                      'Hist Ave Sea Level Pressure','Notes',
                      'Hist Ave Temp',
                      'Hist High Temp',                   
                      'Hist Low Temp',         
                      'High Temp',         
                      'Low Temp',      
                      'Hist Ave Percipitation'], axis=1, inplace=True)

# rocketLaunch_df 에서 Hist 열들도 삭제


In [199]:
rocketLaunch_df['Launched?'].fillna('N', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  rocketLaunch_df['Launched?'].fillna('N', inplace=True)


In [200]:
rocketLaunch_df['Launched?'].value_counts()

Launched?
N    241
Y     59
Name: count, dtype: int64

In [205]:
# rocketLaunch_df.describe()
rocketLaunch_df.head()
#rocketLaunch_df.info()

# Date: 로켓 발사 날짜
# Location: 로켓 발사 위치
# Launched?: 로켓이 성공적으로 발사되었는지 여부
# Ave Temp: 발사 당일의 평균 기온
# Percipitation at Launch Time: 발사 시각의 강수량
# Wind Direction: 발사 당일의 풍향
# Max Wind Speed: 발사 당일의 최대 풍속
# Visibility: 발사 당일의 가시성
# Sea Level Pressure: 발사 당일의 해수면 기압
# Day Length: 발사 당일의 낮의 길이
# Condition: 발사 당일의 날씨 상태

Unnamed: 0,Launched?,Ave Temp,Percipitation at Launch Time,Wind Direction,Max Wind Speed,Visibility,Sea Level Pressure,Day Length,Condition
0,N,71.0,0.0,0,16.0,15.0,38,4,0
1,N,73.39,0.0,0,14.0,10.0,36,4,0
2,Y,60.21,0.0,2,15.0,10.0,41,3,0
3,N,66.04,0.0,1,10.0,10.0,44,3,6
4,N,70.52,0.0,0,12.0,10.0,39,87,6


In [201]:
rocketLaunch_df.isnull().sum()

Launched?                       0
Ave Temp                        1
Percipitation at Launch Time    1
Wind Direction                  1
Max Wind Speed                  1
Visibility                      1
Sea Level Pressure              1
Day Length                      2
Condition                       2
dtype: int64

In [202]:
# 범주형 데이터 인코딩.. 

from sklearn.preprocessing import LabelEncoder

# Create a label encoder object
label_encoder = LabelEncoder()

rocketLaunch_df['Wind Direction'] = label_encoder.fit_transform(rocketLaunch_df['Wind Direction'])
rocketLaunch_df['Condition'] = label_encoder.fit_transform(rocketLaunch_df['Condition'])
rocketLaunch_df['Sea Level Pressure'] = label_encoder.fit_transform(rocketLaunch_df['Sea Level Pressure'])
rocketLaunch_df['Day Length'] = label_encoder.fit_transform(rocketLaunch_df['Day Length'])

# rocketLaunch_df['Date'] = pd.to_datetime(rocketLaunch_df['Date'])
rocketLaunch_df['Wind Direction'] = label_encoder.fit_transform(rocketLaunch_df['Wind Direction'])



In [203]:
rocketLaunch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Launched?                     300 non-null    object 
 1   Ave Temp                      299 non-null    float64
 2   Percipitation at Launch Time  299 non-null    float64
 3   Wind Direction                300 non-null    int64  
 4   Max Wind Speed                299 non-null    float64
 5   Visibility                    299 non-null    float64
 6   Sea Level Pressure            300 non-null    int64  
 7   Day Length                    300 non-null    int64  
 8   Condition                     300 non-null    int64  
dtypes: float64(4), int64(4), object(1)
memory usage: 21.2+ KB


In [204]:
# 의사결정 나무 

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# 데이터프레임에서 특성과 타겟을 선택
features = rocketLaunch_df.drop('Launched?', axis=1)
target = rocketLaunch_df['Launched?']

# 데이터를 학습 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# 모델을 생성하고 학습
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

# 예측을 수행하고 모델을 평가
y_pred = model.predict(X_test)
print('정확도:', accuracy_score(y_test, y_pred))


정확도: 0.7166666666666667
