In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from astropy.table import Table
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## get data

In [None]:
# Load the feature table (CSV) and its matching label vector (.npy).
features_csv = 'data/RF_data/mydata.csv'
labels_npy = 'data/RF_data/mydata_labels.npy'
data_labels = np.load(labels_npy)  # label encoding: 0 = invalid, 1 = valid
data_df = pd.read_csv(features_csv)

In [None]:
# Show the loaded feature table as this cell's output.
data_df

# FIRST RF classifier-------------------------------------------------------
## uses all features

In [None]:
#format data

# Luckily this Kepler data has no NaNs and is all numeric,
# because we cleaned it beforehand, so no extra formatting is needed here.

In [None]:
# Partition the features and labels into train/test subsets (fixed seed for repeatability).
split_parts = train_test_split(data_df, data_labels, random_state=123456)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Configure a 100-tree random forest (out-of-bag scoring enabled, fixed seed).
rf_settings = dict(n_estimators=100, oob_score=True, random_state=123456)
forest = RandomForestClassifier(**rf_settings)
# Train on the training split; fit() stays the last expression so its result is the cell output.
forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split, then report the accuracy score against the true labels.
predicted = forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print(accuracy)

In [None]:
# Rank the forest's features by importance, print the ranking, and plot it.

# Forest-level importances, plus the standard deviation of each feature's
# importance across the individual trees (drawn as error bars below).
importances = forest.feature_importances_
per_tree = np.array([est.feature_importances_ for est in forest.estimators_])
std = per_tree.std(axis=0)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = data_df.shape[1]
positions = range(n_features)

# Text ranking: rank, column position, importance value.
print("Feature ranking:")
for rank, pos in zip(range(1, n_features + 1), indices):
    print("%d. feature %d (%f)" % (rank, pos, importances[pos]))

# Bar chart: one bar per feature in ranked order, labelled with the column name.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(data_df.columns[indices])
ax.set_xlim([-1, n_features])
plt.show()

# SECOND RF classifier-------------------------------------------------------
## Remove weight feature & test again

In [None]:
# Second experiment: remove the weight column 'w' so the forest is trained without it.
# `axis` is passed by keyword: the positional form drop('w', 1) was removed in
# pandas 2.0 and raises TypeError there; the keyword form works on old and new pandas.
# NOTE: re-running this cell after 'w' is gone raises KeyError; reload the data first.
data_df = data_df.drop('w', axis=1)
data_df

In [None]:
# Partition the reduced feature table and labels into train/test subsets (fixed seed).
split_parts = train_test_split(data_df, data_labels, random_state=123456)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Configure a 100-tree random forest (out-of-bag scoring enabled, fixed seed).
rf_settings = dict(n_estimators=100, oob_score=True, random_state=123456)
forest = RandomForestClassifier(**rf_settings)
# Train on the training split; fit() stays the last expression so its result is the cell output.
forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split, then report the accuracy score against the true labels.
predicted = forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print(accuracy)

In [None]:
# Rank the forest's features by importance, print the ranking, and plot it.

# Forest-level importances, plus the standard deviation of each feature's
# importance across the individual trees (drawn as error bars below).
importances = forest.feature_importances_
per_tree = np.array([est.feature_importances_ for est in forest.estimators_])
std = per_tree.std(axis=0)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = data_df.shape[1]
positions = range(n_features)

# Text ranking: rank, column position, importance value.
print("Feature ranking:")
for rank, pos in zip(range(1, n_features + 1), indices):
    print("%d. feature %d (%f)" % (rank, pos, importances[pos]))

# Bar chart: one bar per feature in ranked order, labelled with the column name.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(data_df.columns[indices])
ax.set_xlim([-1, n_features])
plt.show()

# THIRD RF classifier-------------------------------------------------------
## Inject Random Number & test again

In [None]:
# Third experiment: add a column of uniform random numbers to the feature table as a test.
np.random.seed(42)  # seeds NumPy's global generator so the column is repeatable
noise_values = np.random.random(len(data_df))
data_df['random'] = noise_values

In [None]:
# Partition the feature table (now including 'random') and labels into train/test subsets.
split_parts = train_test_split(data_df, data_labels, random_state=123456)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Configure a 100-tree random forest (out-of-bag scoring enabled, fixed seed).
rf_settings = dict(n_estimators=100, oob_score=True, random_state=123456)
forest = RandomForestClassifier(**rf_settings)
# Train on the training split; fit() stays the last expression so its result is the cell output.
forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split, then report the accuracy score against the true labels.
predicted = forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print(accuracy)

In [None]:
# Rank the forest's features by importance, print the ranking, and plot it.

# Forest-level importances, plus the standard deviation of each feature's
# importance across the individual trees (drawn as error bars below).
importances = forest.feature_importances_
per_tree = np.array([est.feature_importances_ for est in forest.estimators_])
std = per_tree.std(axis=0)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = data_df.shape[1]
positions = range(n_features)

# Text ranking: rank, column position, importance value.
print("Feature ranking:")
for rank, pos in zip(range(1, n_features + 1), indices):
    print("%d. feature %d (%f)" % (rank, pos, importances[pos]))

# Bar chart: one bar per feature in ranked order, labelled with the column name.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(data_df.columns[indices])
ax.set_xlim([-1, n_features])
plt.show()

# FOURTH RF classifier-------------------------------------------------------
## trying to extend to our TESS data - not entirely apples to apples but let's see what happens

In [None]:
# Fourth experiment: remove all features we don't have in our own (TESS) table.
# The original cell stopped at `data_df = `, which is a SyntaxError.
# The set of available features is taken from the header of the CSV that is
# loaded further down in this notebook (nrows=0 reads the column names only).
tess_header = pd.read_csv('data/RF_data/intern_data_for_RF.csv', nrows=0).columns
# Mirror how that table is prepared later: 'tic' is dropped and 'random' is injected.
tess_available = (set(tess_header) - {'tic'}) | {'random'}
# NOTE(review): assumes both tables use identical column names for shared
# features - confirm, or replace this with an explicit column list.
shared_features = [col for col in data_df.columns if col in tess_available]
data_df = data_df[shared_features]
data_df

In [None]:
# Partition the restricted feature table and labels into train/test subsets (fixed seed).
split_parts = train_test_split(data_df, data_labels, random_state=123456)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Configure a 100-tree random forest (out-of-bag scoring enabled, fixed seed).
rf_settings = dict(n_estimators=100, oob_score=True, random_state=123456)
forest = RandomForestClassifier(**rf_settings)
# Train on the training split; fit() stays the last expression so its result is the cell output.
forest.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out split, then report the accuracy score against the true labels.
predicted = forest.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print(accuracy)

In [None]:
# Rank the forest's features by importance, print the ranking, and plot it.

# Forest-level importances, plus the standard deviation of each feature's
# importance across the individual trees (drawn as error bars below).
importances = forest.feature_importances_
per_tree = np.array([est.feature_importances_ for est in forest.estimators_])
std = per_tree.std(axis=0)
# Column positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = data_df.shape[1]
positions = range(n_features)

# Text ranking: rank, column position, importance value.
print("Feature ranking:")
for rank, pos in zip(range(1, n_features + 1), indices):
    print("%d. feature %d (%f)" % (rank, pos, importances[pos]))

# Bar chart: one bar per feature in ranked order, labelled with the column name.
fig, ax = plt.subplots()
ax.set_title("Feature importances")
ax.bar(positions, importances[indices],
       color="r", yerr=std[indices], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(data_df.columns[indices])
ax.set_xlim([-1, n_features])
plt.show()

# test with our data-------------------------------------------------------
## format data

In [None]:
# Read our own data table from disk.
ourdata_csv = 'data/RF_data/intern_data_for_RF.csv'
ourdata = pd.read_csv(ourdata_csv)

In [None]:
# Prepare our table for the forest: drop the 'tic' column, drop rows with NaNs,
# and add the same kind of random column that was injected into the training table.

# `axis` is passed by keyword: the positional form drop('tic', 1) was removed in
# pandas 2.0 and raises TypeError there; the keyword form works on old and new pandas.
# NOTE: re-running this cell after 'tic' is gone raises KeyError; reload the CSV first.
ourdata = ourdata.drop('tic', axis=1)
# remove rows containing NaNs in our data
ourdata = ourdata.dropna()
# inject random value (seed set first so the column is repeatable)
np.random.seed(seed = 42)
ourdata['random'] = np.random.random(size = len(ourdata))

ourdata

## test our data with trained RF

In [None]:
# Classify our own rows with the most recently trained forest.
# Select the columns the forest was fitted on, in the same order as X_train.
# Passing `ourdata` as-is risks mis-assigned features or a feature-name error
# if its columns differ; a missing column now fails here with a clear KeyError.
ourdata_aligned = ourdata[X_train.columns]
predicted = forest.predict(ourdata_aligned)
# Despite its name this is not an accuracy (there are no true labels here).
# It is np.histogram's (counts, bin_edges) tuple with edges [0, 1, 2]:
# counts[0] = rows predicted 0 (invalid), counts[1] = rows predicted 1 (valid).
our_accuracy = np.histogram(predicted, bins=np.arange(0, 3, 1))
our_accuracy