In [68]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
# Allows us to split our data into training and testing data
from sklearn.model_selection import train_test_split
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV
# Logistic Regression classification algorithm
from sklearn.linear_model import LogisticRegression
# Support Vector Machine classification algorithm
from sklearn.svm import SVC
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier
# K Nearest Neighbors classification algorithm
from sklearn.neighbors import KNeighborsClassifier


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of each file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [69]:
def plot_confusion_matrix(y,y_predict):
    """Plot the confusion matrix of a binary landing classifier as a heatmap.

    Parameters
    ----------
    y : array-like
        True class labels (0 = did not land, 1 = landed).
    y_predict : array-like
        Predicted class labels, aligned with ``y``.

    Returns
    -------
    None
        The heatmap is drawn on the current matplotlib figure.
    """
    from sklearn.metrics import confusion_matrix

    class_names = ['did not land', 'landed']

    # Pin the label order so the matrix is always 2x2 and matches the two tick
    # labels, even when one class is absent from both y and y_predict.
    cm = confusion_matrix(y, y_predict, labels=[0, 1])

    ax = plt.subplot()
    # fmt='d' prints the counts as plain integers instead of the default '.2g'.
    sns.heatmap(cm, annot=True, fmt='d', ax=ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    # Use the same wording on both axes (the x axis previously said 'land').
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

# Process
1. 데이터셋 확인(null data 확인)
2. EDA(Exploratory Data Analysis)
3. Feature Engineering(one-hot encoding, class 나누기, 구간으로 나누기 등)
4. Model 만들기

# 1. 데이터셋 확인(null data 확인)

In [70]:
# Read dataset_part_1.csv from the course's cloud object storage and preview the first rows.
dataset_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DS0321EN-SkillsNetwork/datasets/dataset_part_1.csv"
df = pd.read_csv(dataset_url)
df.head(10)

In [71]:
# Summary statistics (count, mean, std, quartiles, ...) of the loaded frame.
df.describe()

**1.1 Null data 확인**

In [72]:
# Report, for every column, the percentage of rows whose value is missing.
total_rows = len(df)
for column in df.columns:
    missing_pct = 100 * df[column].isnull().sum() / total_rows
    print('column: {:>10}\t Percent of NaN Value: {:.2f}%'.format(column, missing_pct))

# The same percentages as one Series. The mean of the boolean null mask is
# nulls / total rows; dividing by df.count() (non-null rows only) would
# overstate the share and disagree with the loop above.
df.isnull().mean() * 100

- missingno 라이브러리 통해 더 쉽게 null data 확인

In [73]:
# missingno nullity matrix of the whole frame, drawn in dark grey.
msno.matrix(df=df, figsize=(7, 7), color=(0.3, 0.3, 0.3))

In [74]:
# missingno bar chart of the whole frame, drawn in dark grey.
msno.bar(df=df, figsize=(7, 7), color=(0.3, 0.3, 0.3))

**1.2 Target label 확인**
- Outcome 의미
    - True Ocean : 특정 바다에 성공적으로 착륙
    - False Ocean : 특정 바다에 착륙 실패
    - True RTLS : 특정 Ground Pad에 성공적으로 착륙
    - False RTLS : 특정 Ground Pad에 착륙 실패
    - True ASDS : 드론쉽에 성공적으로 착륙
    - False ASDS : 드론쉽에 착륙 실패
    - None ASDS, None None : 착륙 실패

In [75]:
# Count how many rows fall under each distinct value of the Outcome column.
landing_outcomes = df["Outcome"].value_counts()
print(landing_outcomes)

In [76]:
# Show every outcome label next to its position in the value_counts ordering.
for position, label in enumerate(landing_outcomes.index):
    print(position, label)

In [77]:
# Every outcome that does not start with "True" (the "False ..." and
# "None ..." labels) is an unsuccessful landing. Selecting by label rather
# than by position in value_counts() keeps the set correct when the counts,
# and therefore the ordering, change.
bad_outcomes = {
    outcome
    for outcome in landing_outcomes.index
    if not str(outcome).startswith('True')
}
bad_outcomes

- Outcome을 성공과 실패로만 나누어 작업

In [78]:
# Binary target: 0 when the outcome is one of the bad outcomes, 1 otherwise.
landing_class = [0 if label in bad_outcomes else 1 for label in df['Outcome']]

In [79]:
# Attach the binary landing label to the frame as the 'Class' column and preview it.
df['Class'] = landing_class
df[['Class']].head(10)

# 2. EDA

**2.1 FlightNumber vs Payload**

In [80]:
# Payload mass against flight number, coloured by landing class.
sns.catplot(data=df, x="FlightNumber", y="PayloadMass", hue="Class", aspect=5)
plt.ylabel("Pay load Mass (kg)", fontsize=22)
plt.xlabel("Flight Number", fontsize=22)
plt.show()

**2.2 FlightNumber vs LaunchSite**

In [81]:
# Launch site against flight number, coloured by landing class.
sns.catplot(data=df, x="FlightNumber", y="LaunchSite", hue="Class", aspect=5)
plt.ylabel("Launch Site", fontsize=22)
plt.xlabel("Flight Number", fontsize=22)
plt.show()

**2.3 Payload vs LaunchSite**

In [82]:
# Launch site against payload mass, coloured by landing class.
sns.catplot(data=df, x="PayloadMass", y="LaunchSite", hue="Class", aspect=5)
plt.ylabel("Launch Site", fontsize=22)
plt.xlabel("Payload Mass (KG)", fontsize=22)
plt.show()

- 탑재량이 많은 경우 CCAFS SLC 40 혹은 KSC LC 39A 에서 발사함
- CCAFS SLC 40에서 발사하는 로켓 중, 8000Kg 미만을 탑재하는 경우 더 높은 실패율을 보인다

**2.4 Orbit에 따른 성공률**

In [83]:
# Mean of the 0/1 Class label per orbit type, i.e. the landing success rate.
# Select the column BEFORE aggregating: a frame-wide groupby().mean() tries to
# average the non-numeric columns too, which raises TypeError on pandas >= 2.0.
df.groupby("Orbit")["Class"].mean().plot(kind='bar')
plt.xlabel("Orbit Type",fontsize=20)
plt.ylabel("Success Rate",fontsize=20)
plt.show()

**2.5 FlightNumber vs Orbit Type**

In [84]:
# Orbit type against flight number, coloured by landing class.
sns.catplot(data=df, x="FlightNumber", y="Orbit", hue="Class", aspect=5)
plt.ylabel("Orbit Type", fontsize=20)
plt.xlabel("Flight Number", fontsize=20)
plt.show()

- LEO의 경우 횟수와 성공이 비례
- GTO의 경우 횟수와 성공이 무관

**2.6 Orbit Type vs Payload**

In [85]:
# Orbit type against payload mass, coloured by landing class.
sns.catplot(data=df, x="PayloadMass", y="Orbit", hue="Class", aspect=5)
plt.xlabel("Pay load (Kg)")
plt.ylabel("Orbit type", fontsize=20)
plt.show()

- GTO 궤도의 경우, 적재량이 많을수록 성공률이 떨어짐

**2.7 년도와 성공률과의 관계**

- 현재 갖고있는 날짜 데이터에서 년도에 해당하는 부분만 추출

In [86]:
def Extract_year(date):
    """Return the year part of every date string in ``date``.

    Parameters
    ----------
    date : iterable of str
        Date strings whose first '-'-separated field is the year
        (e.g. ``"2010-06-04"``), such as the ``Date`` column of the frame.

    Returns
    -------
    list of str
        One year string per input element, in input order.

    Notes
    -----
    The result is built from the ``date`` argument in a fresh list on every
    call, so repeated calls do not accumulate entries in shared state.
    """
    return [entry.split("-")[0] for entry in date]

In [87]:
# Pair each launch's year with its 0/1 landing label.
df1 = pd.DataFrame(Extract_year(df['Date']), columns = ['year'])
df1['Class'] = df['Class']

# Mean of Class per year is the yearly success rate. Take x and y from the
# same grouped Series so they are guaranteed to line up, and so the year
# helper is called only once.
yearly_success = df1.groupby('year')['Class'].mean()
sns.lineplot(x = yearly_success.index.to_numpy(), y = yearly_success.to_numpy())
plt.xlabel("Year", fontsize = 20)
plt.ylabel("Success Rate", fontsize = 20)
plt.show()

# 3. Feature Engineering
- 각 feature가 성공률에 얼마나 큰 영향을 끼치는지 확인하고, 후에 모델에 사용

In [88]:
# Columns kept as candidate model inputs.
feature_columns = [
    'FlightNumber', 'PayloadMass', 'Orbit', 'LaunchSite',
    'Flights', 'GridFins', 'Reused', 'Legs',
    'LandingPad', 'Block', 'ReusedCount', 'Serial',
]
features = df[feature_columns]
features.head()

**3.1 One hot encoding**

In [89]:
# Expand the categorical columns into one indicator column per category.
categorical_columns = ['Orbit', 'LaunchSite', 'LandingPad', 'Serial']
features_one_hot = pd.get_dummies(features, columns=categorical_columns)
features_one_hot.head()

In [90]:
# astype returns a new frame, so bind the result; otherwise the cast is
# discarded and features_one_hot keeps its mixed dtypes.
features_one_hot = features_one_hot.astype('float64')
# Preview a few rows instead of dumping the whole frame.
features_one_hot.head()

# **4. Building a machine learning model and predicting with the trained model**

- 기존 df에 있는 Class feature에서 새로운 numpy array를 만든다

In [91]:
# Target vector: the 0/1 Class column as a NumPy array.
Y = df['Class'].to_numpy()
Y

In [92]:
# Standardise every one-hot-encoded feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split in the following cell, so test-set statistics influence the
# scaling. Consider fitting on the training split only and reusing the fitted
# scaler to transform the test split.
transform = preprocessing.StandardScaler()
X = transform.fit_transform(features_one_hot)

- train_test_split 이용하여 트레이닝 데이터와 테스트 데이터 나누기

In [93]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=2, test_size=0.2)
for split_name, inputs, targets in (('Train set: ', X_train, Y_train),
                                    ('Test set: ', X_test, Y_test)):
    print(split_name, inputs.shape, targets.shape)

- GridSearchCV 구현

In [94]:
# Hyper-parameter grid for logistic regression: three values of C with the
# L2 penalty and the lbfgs solver.
parameters ={'C':[0.01,0.1,1],
             'penalty':['l2'],
             'solver':['lbfgs']}
# Unfitted estimator handed to the grid search in the next cell.
lr=LogisticRegression()


In [95]:
# 10-fold cross-validated grid search over the logistic-regression grid.
logreg_cv = GridSearchCV(estimator=lr, param_grid=parameters, cv=10)
logreg_cv.fit(X_train, Y_train)

In [96]:
# Best hyper-parameter combination found by the search and its mean
# cross-validated score.
print("best parameters :  ",logreg_cv.best_params_)
print("accuracy :",logreg_cv.best_score_)

In [97]:
# Score of the refitted best logistic-regression model on the held-out test split.
print('test set accuracy : ',logreg_cv.score(X_test, Y_test))

In [98]:
# Predict the test split with the tuned logistic regression and plot its confusion matrix.
yhat = logreg_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

- 착륙할 것이라고 예측했으나 실제로는 그러지 못한 케이스 3건.

- 서포트 벡터 머신을 만들기 위해 새롭게 파라미터를 정의하여 같은 방식으로 돌린다

In [99]:
# Hyper-parameter grid for the support vector machine. Each kernel appears
# once: the duplicate 'rbf' entry only made the search refit identical
# candidates without changing the best result.
parameters = {'kernel': ('linear', 'rbf', 'poly', 'sigmoid'),
              'C': np.logspace(-3, 3, 5),
              'gamma': np.logspace(-3, 3, 5)}
# Unfitted estimator handed to the grid search in the next cell.
svm = SVC()

In [100]:
# 10-fold cross-validated grid search over the SVM grid.
svm_cv = GridSearchCV(estimator=svm, param_grid=parameters, cv=10)
svm_cv.fit(X_train, Y_train)

In [101]:
# Report the SVM search results. This cell previously read from logreg_cv,
# so it repeated the logistic-regression numbers instead of the SVM ones.
print("best parameters : ",svm_cv.best_params_)
print("accuracy : ",svm_cv.best_score_)
print("test set accuracy : ",svm_cv.score(X_test, Y_test))

In [102]:
# Confusion matrix for the tuned SVM. Predict with svm_cv; the cell
# previously reused logreg_cv and redrew the logistic-regression matrix.
yhat = svm_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

- 결정 트리 classifier를 사용하여 테스트

In [103]:
# Hyper-parameter grid for the decision tree.
# 'auto' is no longer listed under max_features: for DecisionTreeClassifier it
# was an alias of 'sqrt', and scikit-learn >= 1.3 rejects it, which made every
# 'auto' candidate fail to fit.
parameters = {'criterion': ['gini', 'entropy'],
              'splitter': ['best', 'random'],
              'max_depth': [2 * n for n in range(1, 10)],
              'max_features': ['sqrt'],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]}
# Seed the estimator: the random splitter and feature sub-sampling otherwise
# make the selected parameters and scores change from run to run.
tree = DecisionTreeClassifier(random_state=2)

In [104]:
# 10-fold cross-validated grid search over the decision-tree grid.
tree_cv = GridSearchCV(estimator=tree, param_grid=parameters, cv=10)
tree_cv.fit(X_train, Y_train)

In [105]:
# Best decision-tree hyper-parameters, their mean cross-validated score, and
# the refitted model's score on the held-out test split.
print("best parameters : ",tree_cv.best_params_)
print("accuracy : ",tree_cv.best_score_)
print("test set accuracy : ",tree_cv.score(X_test,Y_test))

In [106]:
# Confusion matrix for the tuned decision tree. Predict with tree_cv; the
# cell previously reused svm_cv and therefore plotted the SVM's predictions.
yhat = tree_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

- KNN 모델

In [107]:
# Hyper-parameter grid for k-nearest neighbours: k from 1 to 10, every
# neighbour-search algorithm, and Minkowski power p of 1 or 2.
parameters = {
    'n_neighbors': list(range(1, 11)),
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2],
}

KNN = KNeighborsClassifier()

In [108]:
# 10-fold cross-validated grid search over the KNN grid.
knn_cv = GridSearchCV(estimator=KNN, param_grid=parameters, cv=10)
knn_cv.fit(X_train, Y_train)

In [109]:
# Best KNN hyper-parameters, their mean cross-validated score, and the
# refitted model's score on the held-out test split.
print("best parameters : ",knn_cv.best_params_)
print("accuracy : ",knn_cv.best_score_)
print("test set accuracy : ",knn_cv.score(X_test,Y_test))

In [110]:
# Predict the test split with the tuned KNN model and plot its confusion matrix.
yhat = knn_cv.predict(X_test)
plot_confusion_matrix(Y_test,yhat)

- 위의 모든 방법들로 테스트 한 결과, Decision Tree Classification의 방법이 가장 높은 정확도를 보였다