In [59]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the CSV and Perform Basic Data Cleaning

In [60]:
# Load the raw table, discard columns that hold no data at all,
# then discard every row that still has a missing value.
df = (
    pd.read_csv("exoplanet_data.csv")
    .dropna(axis='columns', how='all')
    .dropna()
)


# Select your features (columns)

In [61]:
# Feature set 1: every column of the cleaned table except the target
# column `koi_disposition`. Used as the first X matrix.
selected_features_1 = df.drop(columns='koi_disposition', errors='ignore')

In [62]:
# Feature set 2: a hand-picked five-column subset. Used as the second X matrix.
selected_features_2 = df.loc[:, ["koi_impact", "koi_duration", "koi_depth", "ra", "koi_kepmag"]]

# Create a Train Test Split

Use `koi_disposition` for the y values

In [63]:
# Short aliases for the two feature matrices.
X1, X2 = selected_features_1, selected_features_2

In [64]:
# Encode the string class labels in `koi_disposition` as integers for y.
# Only the target column is rewritten: assigning through a row-only mask
# (`df.loc[mask] = 0`) overwrites every column of the matching rows.
DISPOSITION_CODES = {"CONFIRMED": 0, "FALSE POSITIVE": 1, "CANDIDATE": 2}
# Values that are not one of the known labels (including codes produced by
# an earlier run of this cell) pass through unchanged, so re-running is safe.
df["koi_disposition"] = df["koi_disposition"].map(
    lambda label: DISPOSITION_CODES.get(label, label)
)

In [65]:
# Target values as a single-column 2-D array of shape (n_samples, 1).
y = np.reshape(df["koi_disposition"].values, (-1, 1))


In [66]:
# Sanity check: each feature matrix must have as many rows as the target.
print(f"{X1.shape} {y.shape}")
print(f"{X2.shape} {y.shape}")

(6991, 40) (6991, 1)
(6991, 5) (6991, 1)


In [67]:
from sklearn.model_selection import train_test_split

#X1 = pd.get_dummies(X1)
#X2 = pd.get_dummies(X2)
#y = pd.get_dummies(y)



In [68]:
# Feature set 1: train/test split with a fixed seed so it is repeatable.
X1_train, X1_test, y_train, y_test = train_test_split(X1, y, random_state=42)

# Report the shapes of the resulting train and test partitions.
print(f"{X1_train.shape} {y_train.shape}")
print(f"{X1_test.shape} {y_test.shape}")

(5243, 40) (5243, 1)
(1748, 40) (1748, 1)


In [69]:
# Feature set 2: train/test split using the same seed as feature set 1.
X2_train, X2_test, y_train, y_test = train_test_split(X2, y, random_state=42)

# Preview the first five rows of the training features.
X2_train.head(5)

Unnamed: 0,koi_impact,koi_duration,koi_depth,ra,koi_kepmag
6122,0.15,3.616,123.1,294.40472,14.725
6370,0.291,2.309,114.6,284.50391,15.77
2879,0.97,79.8969,641.1,295.50211,13.099
107,0.3,2.6312,875.4,291.15878,15.66
29,0.831,2.22739,9802.0,292.16705,15.263


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [70]:
#MinMaxScaler- Feature 1
from sklearn.preprocessing import MinMaxScaler

# Learn the scaling ranges from the training split only. Fitting on the full
# X1 / y lets the held-out test rows influence the preprocessing (data
# leakage) and differs from how the Feature 2 scalers are fitted.
X1_minmax = MinMaxScaler().fit(X1_train)
y_minmax = MinMaxScaler().fit(y_train)

# Apply the train-fitted scalers to both partitions.
X1_train_minmax = X1_minmax.transform(X1_train)
X1_test_minmax = X1_minmax.transform(X1_test)
y_train_minmax = y_minmax.transform(y_train)
y_test_minmax = y_minmax.transform(y_test)


In [71]:
#MinMaxScaler- Feature 2
from sklearn.preprocessing import MinMaxScaler

# Features: fit the scaler on the training split while scaling it, then
# reuse the fitted scaler for the test split.
X2_minmax = MinMaxScaler()
X2_train_minmax = X2_minmax.fit_transform(X2_train)
X2_test_minmax = X2_minmax.transform(X2_test)

# Target: same procedure with its own scaler.
y_minmax = MinMaxScaler()
y_train_minmax = y_minmax.fit_transform(y_train)
y_test_minmax = y_minmax.transform(y_test)

# Train the Model



In [72]:
from sklearn.linear_model import LinearRegression
# Single estimator instance: the Feature 1 and Feature 2 cells below each call
# model.fit() on it, so it always reflects whichever fit ran most recently.
model = LinearRegression()

In [73]:
### BEGIN SOLUTION- Feature 1
# fit() returns the estimator itself, so the training score is taken
# directly from the freshly fitted model; the test score follows.
training_score_1 = model.fit(X1_train_minmax, y_train_minmax).score(
    X1_train_minmax, y_train_minmax
)
testing_score_1 = model.score(X1_test_minmax, y_test_minmax)
### END SOLUTION


In [74]:
# Show the train and test scores of the Feature 1 fit, one per line.
print(
    f"Training Score: {training_score_1}",
    f"Testing Score: {testing_score_1}",
    sep="\n",
)

Training Score: 0.10138681286706097
Testing Score: 0.08165028294908339


In [75]:
### BEGIN SOLUTION- Feature 2
# Refit the same estimator on feature set 2; fit() returns the estimator,
# so the training score is taken directly from it.
training_score_2 = model.fit(X2_train_minmax, y_train_minmax).score(
    X2_train_minmax, y_train_minmax
)
testing_score_2 = model.score(X2_test_minmax, y_test_minmax)
### END SOLUTION

In [76]:
# Show the train and test scores of the Feature 2 fit, one per line.
print(
    f"Training Score: {training_score_2}",
    f"Testing Score: {testing_score_2}",
    sep="\n",
)

Training Score: 0.0035717383547523296
Testing Score: 0.005986191897429438


# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [77]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [78]:
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

# Preprocessing used by the grid-search pipelines: numeric columns are
# standardised and object-typed columns are one-hot encoded. The step names
# match the ones make_column_transformer derives from the class names.
ct = ColumnTransformer(
    transformers=[
        ("standardscaler", StandardScaler(), make_column_selector(dtype_include=np.number)),
        ("onehotencoder", OneHotEncoder(), make_column_selector(dtype_include=object)),
    ]
)

In [79]:
# Split the data into training and testing

# Feature set 1: re-create the train/test partitions (same seed as the earlier
# split) so that X1_train / y_train are current before the grid search runs.
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X1_train, X1_test, y_train, y_test = train_test_split(X1, y, random_state=42)
### END SOLUTION

In [80]:
# Train the model with GridSearch for Feature 1
from sklearn.model_selection import GridSearchCV


def run_grid_search(name, model, alpha_params=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0),
                    verbose=3, X=None, y=None):
    """Grid-search the ``alpha`` parameter of one pipeline step.

    Parameters
    ----------
    name : str
        Class name of the estimator to tune; the searched parameter is
        ``"<name.lower()>__alpha"``.
    model : estimator
        Pipeline (or other estimator) that exposes that parameter.
    alpha_params : sequence of float
        Candidate alpha values.
    verbose : int
        Forwarded to ``GridSearchCV`` (0 = quiet, 3 = verbose).
    X, y : array-like, optional
        Training data. When omitted, the notebook-level ``X1_train`` and
        ``y_train`` are used, which is what the helper always did before
        these parameters existed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if X is None:
        X = X1_train
    if y is None:
        y = y_train
    param_grid = {f"{name.lower()}__alpha": list(alpha_params)}
    grid = GridSearchCV(model, param_grid, verbose=verbose)
    grid.fit(X, y)
    return grid


models_1 = [LinearRegression(), ElasticNet(alpha=.01), Ridge(alpha=.01), Lasso(alpha=.01)]
grids_1 = {}  # every fitted search, keyed by estimator class name
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` estimator is not overwritten by this cell.
for estimator in models_1:
    name = type(estimator).__name__
    if name.lower() == "linearregression":
        # No alpha to tune for plain linear regression.
        continue
    # Pipeline does not clone its steps, so this fit also fits the estimator
    # object stored in models_1 in place.
    pipeline = make_pipeline(ct, estimator).fit(X1_train, y_train)
    grid_1 = run_grid_search(name, pipeline, verbose=0, X=X1_train, y=y_train)  # 0 - quiet, 3 - verbose
    grids_1[name] = grid_1
    print(grid_1.best_params_, grid_1.best_score_)

{'elasticnet__alpha': 0.01} 0.09121280475209761
{'ridge__alpha': 10.0} 0.0462417767937777
{'lasso__alpha': 0.01} 0.08957093186817253


In [81]:
# Feature set 2: re-create the train/test partitions (same seed as the earlier
# split) so that X2_train / y_train are current before the grid search runs.
### BEGIN SOLUTION
from sklearn.model_selection import train_test_split

X2_train, X2_test, y_train, y_test = train_test_split(X2, y, random_state=42)
### END SOLUTION

In [82]:
# Train the model with GridSearch for Feature 2
from sklearn.model_selection import GridSearchCV


def run_grid_search(name, model, alpha_params=(0.0001, 0.001, 0.01, 0.1, 1.0, 10.0),
                    verbose=3, X=None, y=None):
    """Grid-search the ``alpha`` parameter of one pipeline step.

    Parameters
    ----------
    name : str
        Class name of the estimator to tune; the searched parameter is
        ``"<name.lower()>__alpha"``.
    model : estimator
        Pipeline (or other estimator) that exposes that parameter.
    alpha_params : sequence of float
        Candidate alpha values.
    verbose : int
        Forwarded to ``GridSearchCV`` (0 = quiet, 3 = verbose).
    X, y : array-like, optional
        Training data. When omitted, the notebook-level ``X2_train`` and
        ``y_train`` are used, which is what the helper always did before
        these parameters existed.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if X is None:
        X = X2_train
    if y is None:
        y = y_train
    param_grid = {f"{name.lower()}__alpha": list(alpha_params)}
    grid = GridSearchCV(model, param_grid, verbose=verbose)
    grid.fit(X, y)
    return grid


models = [LinearRegression(), ElasticNet(alpha=.01), Ridge(alpha=.01), Lasso(alpha=.01)]
grids_2 = {}  # every fitted search, keyed by estimator class name
# The loop variable is deliberately not called `model`, so the notebook-level
# `model` estimator is not overwritten by this cell.
for estimator in models:
    name = type(estimator).__name__
    if name.lower() == "linearregression":
        # No alpha to tune for plain linear regression.
        continue
    # Pipeline does not clone its steps, so this fit also fits the estimator
    # object stored in models in place.
    pipeline = make_pipeline(ct, estimator).fit(X2_train, y_train)
    grid_2 = run_grid_search(name, pipeline, verbose=0, X=X2_train, y=y_train)  # 0 - quiet, 3 - verbose
    grids_2[name] = grid_2
    print(grid_2.best_params_, grid_2.best_score_)

{'elasticnet__alpha': 0.01} 0.0017687182021199678
{'ridge__alpha': 10.0} 0.001732727107303611
{'lasso__alpha': 0.001} 0.00173354136581505


# Save the Model

In [83]:
# Save the trained models with joblib.
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib

filename_1 = "yin_cai_1.sav"
filename = 'yin_cai.sav'

# Persist the refit best pipeline of each grid search (preprocessing plus
# regressor) rather than the raw `models_1` / `models` lists: each list starts
# with a LinearRegression that is never fitted, and the remaining entries are
# bare regressors without the column transformer they were trained behind.
# NOTE(review): grid_1 / grid_2 hold the last search run in their loops;
# confirm that this is the model intended for submission.
joblib.dump(grid_1.best_estimator_, filename_1)
joblib.dump(grid_2.best_estimator_, filename)

['yin_cai.sav']