In [None]:
# This is a notebook to show some simple regression algorithms and metrics. 

#Author: Viviana Acquaviva

#License: BSD but really should be TBD - just be nice.

In [None]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from scipy import stats

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import scale
from sklearn.utils import shuffle

#Just to make our life easier!
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 

%matplotlib inline

### Regression problems: in which we'd like to predict a continuous quantity, not a class.

To make our life simple, we will recycle one of our previous problems (the LAE vs OII classification) and re-cast it as a regression problem. Obviously it's not the best example; we will see another one later. 

In [None]:
# Load the LAE vs OII catalogue (tab-separated, '#' starts a comment line)
# and give the columns explicit names.
data = pd.read_csv('LAE_OII_CCA.txt', sep='\t', comment='#')
data.columns = ['type', 'wavelength', 'ELflux', 'continuum', 'EW']

# Keep only the rows whose numeric features all have |z-score| < 3.
# The explicit .copy() makes `seldata` an independent frame, so the column
# assignment below neither warns (SettingWithCopyWarning) nor touches `data`.
numeric_cols = data.drop(['type'], axis=1)
within_3sigma = (np.abs(stats.zscore(numeric_cols)) < 3).all(axis=1)
seldata = data[within_3sigma].copy()

# Encode the class labels as integers. `.ix` was removed in pandas 1.0;
# plain column assignment replaces the whole column, so it takes the
# integer dtype of the encoded labels.
le = LabelEncoder()
newcol = le.fit_transform(seldata.type.values)
seldata['type'] = newcol

# Feature table and target; features are standardized column by column.
X, y = seldata.drop('type', axis=1), seldata.type
normalized_X = (X - X.mean()) / X.std()

In [None]:
# Quick look at the first rows of the standardized feature table.
normalized_X.head()

In [None]:
# Histogram of the target values; the trailing ';' suppresses the text repr of the return value.
plt.hist(y);

### Q: What is the main implementation difference between classification and regression problems?

......

















In [None]:
# List every scoring-function name accepted by sklearn's `scoring=` arguments.
# `sklearn.metrics.SCORERS` was removed in scikit-learn 1.3; its replacement,
# `get_scorer_names()`, exists since 1.0. Prefer the new function and fall
# back to the old dictionary only on older installations.
if hasattr(sklearn.metrics, 'get_scorer_names'):
    scorer_names = sklearn.metrics.get_scorer_names()
else:
    scorer_names = sklearn.metrics.SCORERS.keys()

sorted(scorer_names)

Let's build a regression model using Random Forests, picking a scoring parameter from the list above.

In [None]:
# Fill fill fill


Another one that is used often is the 'r2' score, also called the coefficient of determination and usually denoted R². It represents the proportion of the variance of y that is explained by the predictions. Despite being written as a square, it can be negative when the model does worse than a constant predictor of the mean of y :/

It's defined as 

$$ R^2(y, \hat{y}) = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2} $$

In [1]:
#### Fill fill fill





Note: Even if 1 is the best value, the R2 score is not a percentage! So how can we understand how good the predictions are?

A good start is to visualize the predictions.

In [None]:
# Exercise placeholder: `ypred` must hold one predicted value per entry of `y`
# (the next cells histogram it and scatter it against `y`).
# NOTE(review): presumably meant to be filled with cross-validated predictions
# of the regressor built above - confirm.
ypred = ....

In [None]:
# Overlay the distribution of the true values and of the predictions,
# using the same binning and transparency for both.
hist_style = dict(bins=20, alpha=0.5)
for values, name in ((y, 'True'), (ypred, 'Pred')):
    plt.hist(values, label=name, **hist_style)
plt.legend();

We could also take a look at a scatter plot...

In [None]:
# True value vs. predicted value, one point per object. Axis labels make the
# figure readable on its own; the trailing ';' suppresses the text repr.
plt.scatter(y, ypred)
plt.xlabel('True')
plt.ylabel('Pred');

Let's see what this "regressor" is doing, e.g. what the points with intermediate probability look like, in a 2D projection.

In [None]:
# Side-by-side view of the EW / continuum plane, restricted to a narrow window:
# left panel coloured by the true value, right panel by the prediction.
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Both panels share the same data, limits and labels; only the colouring differs,
# so they are drawn in one loop instead of two copies of the same code.
for ax, colours, title in zip(axes, (y, ypred), ('True', 'Pred')):
    ax.scatter(normalized_X['EW'], normalized_X['continuum'], c=colours)
    ax.set_xlim([-0.25, -0.2])
    ax.set_ylim([-0.3, 1])
    ax.set_xlabel('EW')
    ax.set_ylabel('continuum')
    ax.set_title(title)

### Optimization

We already know how to play with parameters using Grid Search CV, so no big surprise here.

In [None]:
# Display the current estimator object.
# NOTE(review): `model` is not assigned in any cell shown above; presumably it is
# created in the fill-in regression cell - confirm, otherwise this raises
# NameError on a fresh kernel.
model

We can do a little CV optimization, note that we have lost our class weight parameter though.

In [None]:
# Shuffled 5-fold cross-validation scheme used by the grid search.
cvmethod = StratifiedKFold(n_splits=5, shuffle=True)

# Hyper-parameter grid explored for the Random Forest regressor.
parameters = {'max_depth': [3, 5, 8],
              'max_features': [2, 4],
              'n_estimators': [10, 20, 50]}

# Number of parameter combinations in the grid.
# np.prod replaces the alias np.product, which was removed in NumPy 2.0.
nmodels = np.prod([len(el) for el in parameters.values()])

# return_train_score=True is needed for the 'mean_train_score' column of
# cv_results_ that is displayed further down; train scores are not computed
# by default.
model = GridSearchCV(RandomForestRegressor(), parameters, cv=cvmethod,
                     scoring='neg_mean_absolute_error',
                     return_train_score=True,
                     verbose=1, n_jobs=4)

# Time the whole search (wall-clock).
start = time.time()
model.fit(normalized_X, y)
stop = time.time()

# Values are printed in the order the label announces them: params, then score.
print('Best params, best score:', model.best_params_, "{:.4f}".format(model.best_score_))
# NOTE(review): the divisor uses 4 while the CV above has 5 splits (and n_jobs is 4) -
# confirm which quantity was intended.
print('Time per model (s):', "{:.4f}".format((stop-start)/float(nmodels*4)))

In [None]:
# Allow up to 100 characters per column when displaying DataFrames.
pd.set_option('display.max_colwidth', 100)

In [None]:
# Collect the grid-search results (the fitted search's cv_results_) into a DataFrame for inspection.
scores_df = pd.DataFrame(model.cv_results_)

In [None]:
# Show the key columns of the grid-search results, highest mean test score first.
shown_columns = ['params', 'mean_test_score', 'mean_train_score', 'mean_fit_time']
scores_df[shown_columns].sort_values(by='mean_test_score', ascending=False)

Let's discuss which parameters are best.

### What is best, classification or regression?

Many classification problems can be turned into regression problems. So, which one should we pick?

In [None]:
# Exercise placeholders: the arguments of both cross_val_predict calls are left
# to be filled in. Judging by the histogram labels below, the first holds the
# predictions of a Random Forest classifier and the second those of a Random
# Forest regressor.
predicted_RFC = cross_val_predict(...)

predicted_RFR = cross_val_predict(....)

# Overlay both sets of predictions and the true values.
plt.hist(predicted_RFC,alpha = 0.5, color='green',label = 'RF Classifier')
plt.hist(predicted_RFR,alpha = 0.5, label = 'RF Regressor')
# The true values are shifted by 0.05 before plotting.
# NOTE(review): presumably so their bars do not sit exactly on top of the classifier's - confirm.
plt.hist(y+0.05,alpha = 0.5, label = 'True')

plt.legend();

Tricks: 

1) Look at input data, if they really look like classes then you should use a classifier.

2) You can't compute the accuracy if the target is a continuous variable, but you can compute other metric scores if the target is a class! However, I am not sure if this helps (see above).

In [None]:
# Mean absolute error against the true values for each set of predictions:
# classifier first, then regressor.
for predictions in (predicted_RFC, predicted_RFR):
    print(metrics.mean_absolute_error(y, predictions))

### Last time, we saw that many classifiers also have a "predict_proba" method.

How does the above compare to the "predict_proba" output of classifiers?

In [None]:
# Single train/test split with a fixed seed (one fold only, no cross-validation).
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, random_state=5)

# Class probabilities predicted by a Random Forest classifier on the test set.
rf_classifier = RandomForestClassifier(n_estimators=20, max_depth=5)
rf_classifier.fit(Xtrain, ytrain)
probas = rf_classifier.predict_proba(Xtest)

# Continuous predictions of a Random Forest regressor with the same settings.
rf_regressor = RandomForestRegressor(n_estimators=20, max_depth=5)
rf_regressor.fit(Xtrain, ytrain)
probas2 = rf_regressor.predict(Xtest)

In [None]:
# Compare the classifier's second probability column with the regressor's predictions.
classifier_scores = probas[:, 1]
plt.hist(classifier_scores, alpha=0.5, color='green', label='RF Classifier')
plt.hist(probas2, alpha=0.5, label='RF Regressor')

plt.legend();

It looks like under the hood, classifiers with a "predict_proba" attribute are using a regression model to fit the probability that the object belongs to a given (positive for a binary classifier) class.

### How to estimate errors in ML regression predictions?

My answer: Bootstrap. This doesn't take into account the "architecture" error. 

Let's have a simple example. We know that EW, continuum and emission line flux are not independent. So let's try to predict one from the other two.

In [None]:
# Regression set-up: predict the ELflux column from the EW and continuum columns
# of the standardized table.
feature_names = ['EW', 'continuum']
features = normalized_X[feature_names]
target = normalized_X['ELflux'].values

In [None]:
# Sanity check: (number of objects, number of features).
features.shape

In [None]:
# Inspect the target array (values of the standardized ELflux column).
target

In [None]:
# Random Forest regressor with default settings. Note that this rebinds the
# names `model` and `ypred` used earlier in the notebook.
model = RandomForestRegressor()

# Cross-validated prediction for every object.
ypred = cross_val_predict(estimator=model, X=features, y=target)

Let's look at a couple of evaluation metrics, and also make a scatter plot.

We can build 10 bootstrap samples where each point is randomly scattered by an amount proportional to its observational error (here 1%).

In [None]:
# One column of cross-validated predictions per perturbed copy of the features.
bootstrap_pred = np.empty((len(target), 10))

for realization in range(bootstrap_pred.shape[1]):
    # Scatter each feature with Gaussian noise whose sigma is 1% of its absolute value.
    ew_sigma = np.abs(0.01 * features.EW)
    newEW = features.EW + np.random.normal(0, ew_sigma)
    cont_sigma = np.abs(0.01 * features.continuum)
    newcont = features.continuum + np.random.normal(0, cont_sigma)

    # Assemble the perturbed (n_objects, 2) feature matrix and predict again.
    feat = np.column_stack((newEW, newcont))
    bootstrap_pred[:, realization] = cross_val_predict(model, feat, target)

In [None]:
# For every object print: true value, prediction, and the standard deviation of
# its perturbed predictions, which can be used as a proxy for the uncertainty.
row_format = '%.3f \t %.3f \t %.3f'
for truth, estimate, draws in zip(target, ypred, bootstrap_pred):
    print(row_format % (truth, estimate, draws.std()))

In [None]:
# Scatter plot of true vs. estimated flux; the error bars are the standard
# deviation of the perturbed predictions for each object.
plt.errorbar(target, ypred, yerr=bootstrap_pred.std(axis=1), fmt='o', markersize=2,
             c='k', ecolor='lightgray', elinewidth=1)

# 1:1 reference line spanning the range actually covered by the data. The flux is
# standardized, so it takes negative values that a fixed 0..9 line would miss.
line_min = min(np.min(target), np.min(ypred))
line_max = max(np.max(target), np.max(ypred))
plt.plot([line_min, line_max], [line_min, line_max], c='r')

plt.xlabel('True flux');
plt.ylabel('Est. flux');

### Summary

- In regression problems, we predict quantities and not classes.

- We can use the same algorithms, but the evaluation metric changes and is typically a tracer of how close we are to the true values.

- We also saw how we can use bootstrap to include individual errors in our predictions.

Further reading: For a complete worked example you can see the "mass-luminosity-color-metallicity" notebook at

https://github.com/vacquaviva/Metallicity_Estimation