In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Data Preprocessing
df = pd.read_csv('dataset_space_API_past_launches_raw.csv')


# Data Clean
df = df.drop(['FlightNumber', 'Date', 'Longitude', 'Latitude'], axis=1) # 删掉没用的


# Process missing Value
PayloadMass_mean = df['PayloadMass'].mean()
PayloadMass_mean = round(PayloadMass_mean, 1)
df['PayloadMass'] = df['PayloadMass'].fillna(PayloadMass_mean)
df['LandingPad'] = df['LandingPad'].fillna('missing')
df['Block'] = df['Block'].fillna('0')


# Data Encoding
df = df.replace({'Outcome': {'True Ocean': 1, 
                             'True RTLS': 1,
                             'True ASDS': 1,
                             'False Ocean':0,
                             'False RTLS':0,
                             'False ASDS':0,
                             'None ASDS':0,
                             'None None':0,},
                 'GridFins': {True: 1, False: 0},
                 'Reused': {True: 1, False: 0},
                 'Legs': {True: 1, False: 0}
                })

dummy_BoosterVersion = pd.get_dummies(df['BoosterVersion'], prefix='BoosterVersion_')
dummy_Orbit = pd.get_dummies(df['Orbit'], prefix='Orbit_')
dummy_LaunchSite = pd.get_dummies(df['LaunchSite'], prefix='LaunchSite_')
dummy_LandingPad = pd.get_dummies(df['LandingPad'], prefix='LandingPad_')
dummy_Serial = pd.get_dummies(df['Serial'], prefix='Serial_')

df = pd.concat([df, dummy_BoosterVersion, dummy_Orbit, dummy_LaunchSite, dummy_LandingPad, dummy_Serial], axis=1)
df = df.drop(['BoosterVersion', 'Orbit', 'LaunchSite', 'LandingPad', 'Serial'], axis=1)

# Split into training data and test data

In [3]:
X = df.drop(['Outcome'], axis=1)
# y = df.drop(df.columns.difference(['Outcome']), axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=120, shuffle=True)
print('Size of train set:', X_train.shape[0], '\nSize of test set:', X_test.shape[0])

Size of train set: 65 
Size of test set: 29


# Find best hyperparameters for each Classification methods

In [4]:
from sklearn.model_selection import GridSearchCV

### Decision Tree

In [5]:
from sklearn.tree import DecisionTreeClassifier

param_dt = {
    'max_depth': range(1,5),
    'min_samples_leaf': range(1,5),
    'min_samples_split': range(2,5),
    'max_leaf_nodes': range(2,5)
}

clf = DecisionTreeClassifier(random_state=120)
grid_search = GridSearchCV(estimator=clf, param_grid=param_dt, cv=5)
grid_search.fit(X_train, y_train)

print('Test set score: {:.3f}'.format(grid_search.score(X_test, y_test)))
print('Best score: {:.3f}'.format(grid_search.best_score_))
print('Best params:', grid_search.best_params_)

best_params_dt = grid_search.best_params_

Test set score: 0.862
Best score: 0.877
Best params: {'max_depth': 2, 'max_leaf_nodes': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


### Logistic Regression

In [6]:
import numpy as np

from sklearn.linear_model import LogisticRegression

param_lr = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
}

clf_lr = LogisticRegression(solver = 'liblinear')

grid_search = GridSearchCV(estimator = clf_lr, param_grid = param_lr, cv = 5)

grid_search.fit(X_train, y_train)

best_params_lr = grid_search.best_params_

print('Test set score: {:.3f}'.format(grid_search.score(X_test, y_test)))
print('Best score on train set: {:.3f}'.format(grid_search.best_score_))
print('Best params:', grid_search.best_params_)

best_params_lr = grid_search.best_params_

Test set score: 0.862
Best score on train set: 0.846
Best params: {'C': 100, 'penalty': 'l2'}


### Support Vector Machine

In [7]:
from sklearn.svm import SVC

param_svm = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1, 10, 100],
}


svc = SVC(random_state=140)

grid_search = GridSearchCV(estimator=svc, param_grid=param_svm, cv=5)
grid_search.fit(X_train, y_train)

print('Test set score: {:.3f}'.format(grid_search.score(X_test, y_test)))
print('Best score on train set: {:.3f}'.format(grid_search.best_score_))
print('Best params:', grid_search.best_params_)

best_params_svm = grid_search.best_params_

Test set score: 0.655
Best score on train set: 0.631
Best params: {'C': 10, 'gamma': 0.1}


### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=150)

param_rfc = {'n_estimators': [50, 100, 200],
              'max_depth': [5, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4]}

grid_search = GridSearchCV(estimator=rfc, param_grid=param_rfc, cv=5)
grid_search.fit(X_train, y_train)


print('Test set score: {:.3f}'.format(grid_search.score(X_test, y_test)))
print('Best score on train set: {:.3f}'.format(grid_search.best_score_))
print('Best params:', grid_search.best_params_)

best_params_rf = grid_search.best_params_

Test set score: 0.862
Best score on train set: 0.877
Best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}


# Find the method performs best using test data

### Decision Tree

In [9]:
clf_dt = DecisionTreeClassifier(max_depth = best_params_dt['max_depth'], 
                               max_leaf_nodes = best_params_dt['max_leaf_nodes'], 
                             min_samples_leaf = best_params_dt['min_samples_leaf'], 
                             min_samples_split = best_params_dt['min_samples_split'], 
                             random_state=120)

clf_dt.fit(X_train, y_train)
y_pred = clf_dt.predict(X_test)

print("Accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('Precision: {:.3f}'.format(precision_score(y_test, y_pred)), 
      '\nRecall: {:.3f}'.format(recall_score(y_test, y_pred)), 
      '\nF1_score: {:.3f}'.format(f1_score(y_test, y_pred)))

Accuracy: 0.862
Confusion Matrix:
 [[ 8  1]
 [ 3 17]]
Precision: 0.944 
Recall: 0.850 
F1_score: 0.895


### Logistic Regression

In [10]:
clf_lr = LogisticRegression(
                         max_iter = 100, 
                         solver = 'liblinear', 
                         penalty = best_params_lr['penalty'], 
                         C = best_params_lr['C'])

clf_lr.fit(X_train, y_train)
y_pred = clf_lr.predict(X_test)

print("Accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('Precision: {:.3f}'.format(precision_score(y_test, y_pred)), 
      '\nRecall: {:.3f}'.format(recall_score(y_test, y_pred)), 
      '\nF1_score: {:.3f}'.format(f1_score(y_test, y_pred)))

Accuracy: 0.862
Confusion Matrix:
 [[ 9  0]
 [ 4 16]]
Precision: 1.000 
Recall: 0.800 
F1_score: 0.889


### Support Vector Machine

In [11]:
svc = SVC(random_state = 140,
          kernel = 'linear',
          C = best_params_svm['C'],  
          gamma = best_params_svm['gamma'])
svc.fit(X_train, y_train)

y_pred = svc.predict(X_test)

print("Accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('Precision: {:.3f}'.format(precision_score(y_test, y_pred)), 
      '\nRecall: {:.3f}'.format(recall_score(y_test, y_pred)), 
      '\nF1_score: {:.3f}'.format(f1_score(y_test, y_pred)))

Accuracy: 0.828
Confusion Matrix:
 [[ 9  0]
 [ 5 15]]
Precision: 1.000 
Recall: 0.750 
F1_score: 0.857


### Random Forest

In [12]:
rfc = RandomForestClassifier(n_estimators = best_params_rf['n_estimators'],
                             max_depth = best_params_rf['max_depth'],
                             min_samples_split = best_params_rf['min_samples_split'],
                             min_samples_leaf = best_params_rf['min_samples_leaf'])

rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

print("Accuracy: {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print('Precision: {:.3f}'.format(precision_score(y_test, y_pred)), 
      '\nRecall: {:.3f}'.format(recall_score(y_test, y_pred)), 
      '\nF1_score: {:.3f}'.format(f1_score(y_test, y_pred)))

Accuracy: 0.862
Confusion Matrix:
 [[ 9  0]
 [ 4 16]]
Precision: 1.000 
Recall: 0.800 
F1_score: 0.889
