In [None]:
#This is a notebook to run a simple binary classification algorithm, using Decision Trees.

#Author: Viviana Acquaviva
#License: BSD but really should be TBD - just be nice.

import pandas as pd
import numpy as np
import sklearn.tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
try:
    from sklearn.externals.six import StringIO
except ImportError:
    #sklearn.externals.six is no longer shipped with recent scikit-learn releases;
    #the standard-library StringIO is a drop-in replacement for the use made of it here.
    from io import StringIO
from IPython.display import Image
import pydotplus
from sklearn.tree import export_graphviz

#Notes: 

#Data come from here
#from astroML.datasets import fetch_rrlyrae_combined
#X, y = fetch_rrlyrae_combined()

#As I was browsing around, I found some useful examples here:
#https://towardsdatascience.com/scikit-learn-decision-trees-explained-803f3812290d
#https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176

In [None]:
#Read the training and test sets into two data frames so we can take a look at them.
#The first column of each CSV file is used as the row index.

Train, Test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('RRLTrainSet.csv', 'RRLTestSet.csv')
)

In [None]:
#Split each data frame into the two pieces every sklearn ML model needs:
#the features (everything except the 'label' column) and the labels ('label' itself).
#This gives four objects in total: features/labels x train/test.

y_train = Train['label']
X_train = Train.drop(columns=['label'])

y_test = Test['label']
X_test = Test.drop(columns=['label'])

### First model alert!

This is how we build a model in sklearn. For reproducibility purposes, we will fix the random seed in the Decision Tree. But what is randomized in Decision Trees?


From the docs: The features are always randomly permuted at each split. 
Therefore, the best found split may vary, even with the same training data 
and max_features=n_features, if the improvement of the criterion is identical 
for several splits enumerated during the search of the best split. 
To obtain a deterministic behaviour during fitting, random_state has to be fixed.

In [None]:
#Fill



You might recognize a few familiar attributes up there.

And this is how we fit a model! Fitting a model means that we build the architecture to make decisions.

In [None]:
#Fill



At this point, the model has built a set of questions (splits) that would inform the classification.

This bit below is to visualize the tree.

In [None]:
#Export the fitted tree as Graphviz DOT source and render it inline as a PNG.
#With out_file=None, export_graphviz returns the DOT source as a string, so no
#intermediate StringIO buffer is needed.
dot_data = export_graphviz(
    model,
    out_file=None,
    feature_names=list(X_train.columns),
    class_names=['Not var', 'Var'],  #assumes label 0 = not variable, 1 = variable - confirm against the data
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

Questions: 

What is the accuracy score (% of correct classifications) on the training set, based on the tree?
    
How can we visualize it?

How can we figure out the accuracy on the test set?

In [None]:
#Fill!



This cell below shows us the splits made by the decision tree above.

In [None]:
#Fill splits

#Plots training data
plt.scatter(X_train['u-g'], X_train['r-i'], \
            c = y_train.iloc[:,0].values, marker = '*', s =20, label = None, cmap = 'brg')
plt.axvline(x=   , linewidth =1, label = '1st split')
plt.axvline(x=   , linewidth =1, ls = '--', label = '2nd split')
plt.axhline(y=   , linewidth =1, ls = '-.', xmin = 0.32, xmax=0.48, label = '3rd split')

#Plots test data
plt.scatter(X_test['u-g'], X_test['r-i'], \
            c = y_test.iloc[:,0].values, marker = 'o', s =20, label = None, cmap = 'brg')

plt.legend();

### Ok, now time to get real!

In [None]:
#Read the full data set. The features file has no header row, so the four
#column names are supplied here; the labels file is read without a header
#and its values are cast to integers.
feature_names = ['u-g', 'g-r', 'r-i', 'i-z']
Xbig = pd.read_csv('RRLyrae_features.txt', names=feature_names)

raw_labels = pd.read_csv('RRLyrae_labels.txt', header=None)
ybig = raw_labels.astype(int)

Let's plot ALL the data, ahem!

In [None]:
#Scatter plot of every object in the (u-g, r-i) plane; the marker colour
#encodes the label stored in the first (only) column of ybig.
ug_values = Xbig['u-g']
ri_values = Xbig['r-i']
class_labels = ybig.iloc[:, 0].values
plt.scatter(
    ug_values,
    ri_values,
    c=class_labels,
    marker='*',
    s=20,
    label=None,
    cmap='brg',
)

### Let's do some data thinking.

<br>
What can we say about this data set?

Do you expect a decision tree to be an optimal classifier, based on the shape of the data?

How would a classifier that puts everything in the "non-RR Lyrae" box fare on this data set?

In [None]:
#Answers go here :) 



Let's take a look at how our previous algorithm would fare on this data set.

In [None]:
#Same scatter plot of the full data set as before...
ug_values = Xbig['u-g']
ri_values = Xbig['r-i']
class_labels = ybig.iloc[:, 0].values
plt.scatter(
    ug_values,
    ri_values,
    c=class_labels,
    marker='*',
    s=20,
    label=None,
    cmap='brg',
)

#...with three hard-coded split thresholds drawn on top
#(presumably the splits of the small tree fitted earlier - confirm against that tree).
plt.axvline(x=0.218, linewidth=1, label='1st split')
plt.axvline(x=0.147, linewidth=1, ls='--', label='2nd split')
#xmin/xmax are fractions of the axes width, so this line only spans part of the plot
plt.axhline(y=0.035, linewidth=1, ls='-.', xmin=0.53, xmax=0.65, label='3rd split')

plt.legend();

Question: How is our old tree doing? What is it getting right and wrong?

In [None]:
#Answers go here.

Let's do our training process again! Here we don't have separate train and test sets, so we need to create them ourselves; we'll call them X_trainb, X_testb, etc. (the "b" is for "big"). Note: we are not doing cross validation yet, which is bad!

In [None]:
#Fill


Define and fit the model:

In [None]:
#Fill



We can use the same plotting routine as above to visualize the new tree:

In [None]:
#Export the tree fitted on the big data set as Graphviz DOT source and render it
#inline as a PNG. With out_file=None, export_graphviz returns the DOT source as a
#string, so no intermediate StringIO buffer is needed.
dot_data = export_graphviz(
    modelbig,  #note name change
    out_file=None,
    feature_names=list(X_trainb.columns),  #here too
    class_names=['Not var', 'Var'],  #assumes label 0 = not variable, 1 = variable - confirm against the data
    filled=True,
    rounded=True,
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

Now let's take a look at those colors and then evaluate how the tree is doing on the training set.

In [None]:
#Fill













The accuracy is high, as expected! But is it meaningful? 

In an unbalanced data set, other metrics are needed.

In [None]:
#Define recall, estimate



In [None]:
#Define precision, estimate



### Recap: what have we seen so far?

Let's talk about what we should be doing to optimize this classifier, going back to the tools we mentioned.

In [None]:
#Ideas here.



In [None]:
# We can customize this cell as we try new models

modelX = DecisionTreeClassifier(......)
modelX.fit(X_trainb,y_trainb)
dot_data = StringIO()
export_graphviz(
            modelX,
            out_file =  dot_data,
            feature_names = list(X_trainb.columns),
            class_names = ['Not var','Var'],
            filled = True,
rounded = True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [None]:
#And then look at some of these

#Training set: predict once and reuse the predictions for all three metrics.
#Printed in the order accuracy, precision, recall.
y_pred_trainb = modelX.predict(X_trainb)
print(metrics.accuracy_score(y_trainb, y_pred_trainb))
print(metrics.precision_score(y_trainb, y_pred_trainb))
print(metrics.recall_score(y_trainb, y_pred_trainb))

#Test set: same three metrics, same order.
#(The original first line called modelXpredict - a missing dot, which raised a NameError.)
y_pred_testb = modelX.predict(X_testb)
print(metrics.accuracy_score(y_testb, y_pred_testb))
print(metrics.precision_score(y_testb, y_pred_testb))
print(metrics.recall_score(y_testb, y_pred_testb))