# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [1]:
import pandas as pd

# Read the CSV and Perform Basic Data Cleaning

In [2]:
df = pd.read_csv("dataset.csv")
#df.head()

In [3]:
# Drop the null columns where all values are null
df = df.dropna(axis='columns', how='all')
# Drop the null rows
df = df.dropna()

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2269 entries, 0 to 9540
Data columns (total 48 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   rowid              2269 non-null   int64  
 1   kepid              2269 non-null   int64  
 2   kepoi_name         2269 non-null   object 
 3   kepler_name        2269 non-null   object 
 4   koi_disposition    2269 non-null   object 
 5   koi_pdisposition   2269 non-null   object 
 6   koi_score          2269 non-null   float64
 7   koi_fpflag_nt      2269 non-null   int64  
 8   koi_fpflag_ss      2269 non-null   int64  
 9   koi_fpflag_co      2269 non-null   int64  
 10  koi_fpflag_ec      2269 non-null   int64  
 11  koi_period         2269 non-null   float64
 12  koi_period_err1    2269 non-null   float64
 13  koi_period_err2    2269 non-null   float64
 14  koi_time0bk        2269 non-null   float64
 15  koi_time0bk_err1   2269 non-null   float64
 16  koi_time0bk_err2   2269 

# Select your features (columns)

In [5]:
# Set features. This will also be used as your x values.
selected_features = df[['kepid', 'koi_slogg', 'koi_steff','koi_srad','koi_kepmag',#'koi_time0bk',
                         'koi_duration', 'koi_depth', 'koi_prad',
                        #'koi_teq', 'koi_insol', 'koi_model_snr', 'ra', 'dec'
                       ]]

selected_features.to_csv("output.csv")

selected_features.head()

Unnamed: 0,kepid,koi_slogg,koi_steff,koi_srad,koi_kepmag,koi_duration,koi_depth,koi_prad
0,10797460,4.467,5455.0,0.927,15.347,2.9575,615.8,2.26
1,10797460,4.467,5455.0,0.927,15.347,4.507,874.8,2.83
4,10854555,4.438,6031.0,1.046,15.509,1.6545,603.3,2.75
5,10872983,4.486,6046.0,0.972,15.714,4.5945,1517.5,3.9
6,10872983,4.486,6046.0,0.972,15.714,3.1402,686.0,2.77


# Create a Train Test Split

Use `koi_disposition` for the y values

In [6]:
df['koiDisposition'] = df['koi_disposition'].apply(lambda x: 3 if x == 'CONFIRMED' else 2 if x == 'CANDIDATE' else 1 )

In [7]:
target = df['koiDisposition']
target_names = ['CONFIRMED','FALSE POSITIVE','CANDIDATE']

In [8]:
print(selected_features.shape, target.shape)

(2269, 8) (2269,)


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(selected_features, target, random_state=42)

In [None]:
y_train.head()

# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

X_scaler = MinMaxScaler().fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the Model



In [10]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
#clf.score(X_test, y_test)

In [11]:
print(f"Training Data Score: {clf.score(X_train, y_train)}")
print(f"Testing Data Score: {clf.score(X_test, y_test)}")

Training Data Score: 1.0
Testing Data Score: 1.0


In [13]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

1.0

In [14]:
sorted(zip(rf.feature_importances_, selected_features), reverse=True)

[(0.40486675659984656, 'koi_depth'),
 (0.3475040829052464, 'koi_prad'),
 (0.20839040616964732, 'koi_duration'),
 (0.03923875432525964, 'koi_steff'),
 (0.0, 'koi_srad'),
 (0.0, 'koi_slogg'),
 (0.0, 'koi_kepmag'),
 (0.0, 'kepid')]

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [15]:
# Create the GridSearchCV model
# Create the SVC Model
from sklearn.svm import SVC 
model = SVC(kernel='linear')
model

SVC(kernel='linear')

In [16]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 50],
              'gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
grid.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ..................... C=1, gamma=0.0001, score=nan, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "C:\Users\jegan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\jegan\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 164, in fit
    y = self._validate_targets(y)
  File "C:\Users\jegan\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 549, in _validate_targets
    raise ValueError(
ValueError: The number of classes has to be greater than one; got 1 class

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s


[CV] ................... C=1, gamma=0.0001, score=0.997, total=   0.7s
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s


In [None]:
print(grid2.best_params_)
print(grid2.best_score_)

# Save the Model

In [None]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'your_name.sav'
joblib.dump(your_model, filename)