Answer all questions and submit them either as an IPython notebook, LaTeX document, or Markdown document. Provide full answers for each question, including interpretation of the results. Each question is worth 25 points.

This homework is due on Friday, December 8, 2017.

## Question 1

The `titanic.xls` spreadsheet in the `data` directory contains data regarding the passengers on the Titanic when it sank in 1912. A recent [Kaggle competition](http://www.kaggle.com/c/titanic-gettingStarted) was based on predicting survival for passengers based on the attributes in the passenger list. 

Use scikit-learn to build both a support vector classifier and a logistic regression model to predict survival on the Titanic. Use cross-validation to assess your models, and try to tune them to improve performance.

Discuss the benefits and drawbacks of both approaches for application to such problems.

In [1]:
# Write your work here
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# for windows
# file = 'C:/Users/Rui/source/BIOS_8366/data/titanic.xls'
# for mac
file = '/Users/ruiwang/source/BIOS_8366/data/titanic.xls'
data = pd.read_excel(file)
data.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/ruiwang/source/BIOS_8366/data/titanic.xls'

In [None]:
# look at the distribution of our interest: isbadbuy
prop = data['survived'].value_counts()/len(data)
print(prop)
prop.plot(kind = 'pie')

### Data cleaning

In [None]:
for i in data:
    data['initial']=data.name.str.extract('([A-Za-z]+)\.') #lets extract the Salutations
# replace intials with spelling error
data['initial'].replace(['Mlle','Mme','Ms','Dr','Major','Lady','Countess',
                         'Jonkheer','Col','Rev','Capt','Sir','Don', 'Dona'],
                        ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other',
                         'Other','Other','Mr','Mr','Mr','Mr'],inplace=True)
data.groupby('initial')['age'].mean() #lets check the average age by Initials

## Assigning the NaN Values with the Ceil values of the mean ages
data.loc[(data.age.isnull())&(data.initial=='Mr'),'age']=33
data.loc[(data.age.isnull())&(data.initial=='Mrs'),'age']=36
data.loc[(data.age.isnull())&(data.initial=='Master'),'age']=5
data.loc[(data.age.isnull())&(data.initial=='Miss'),'age']=22
data.loc[(data.age.isnull())&(data.initial=='Other'),'age']=46

# As we saw that maximum passengers boarded from Port S, we replace NaN with S.
data['embarked'].fillna('S',inplace=True)
data['age_band']=0
data.loc[data['age']<=16,'age_band']=0
data.loc[(data['age']>16)&(data['age']<=32),'age_band']=1
data.loc[(data['age']>32)&(data['age']<=48),'age_band']=2
data.loc[(data['age']>48)&(data['age']<=64),'age_band']=3
data.loc[data['age']>64,'age_band']=4

data['family_size']=0
data['family_size']=data['parch']+data['sibsp'] #family size
data['alone']=0
data.loc[data.family_size==0,'alone']=1 #Alone

#data['Fare_Range']=pd.qcut(data['fare'],4)
#data.groupby(['Fare_Range'])['survived'].mean().to_frame().style.background_gradient(cmap='summer_r')

data['fare_cat']=0
data.loc[data['fare']<=7.91,'fare_cat']=0
data.loc[(data['fare']>7.91)&(data['fare']<=14.454),'fare_cat']=1
data.loc[(data['fare']>14.454)&(data['fare']<=31),'fare_cat']=2
data.loc[(data['fare']>31)&(data['fare']<=513),'fare_cat']=3

data['sex'].replace(['male','female'],[0,1],inplace=True)
data['embarked'].replace(['S','C','Q'],[0,1,2],inplace=True)
data['initial'].replace(['Mr','Mrs','Miss','Master','Other'], [0,1,2,3,4], inplace=True)
data.drop(['name','ticket','cabin', 'age', 'fare', 'home.dest', 'boat', 'body'],axis=1,inplace=True)

# rearrange the columns
data = data[['survived', 'pclass', 'sex', 'sibsp', 'parch', 'embarked', 
             'initial', 'age_band', 'family_size', 'alone', 'fare_cat']]

In [None]:
sns.heatmap(data.corr(),annot=True,cmap='RdYlGn',linewidths=0.2,annot_kws={'size':12})
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

### Modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

y = data['survived'].values
X = data.drop(['survived'], axis = 1).values
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### Support Vector Classifier

In [None]:
tuned_parameters={'kernel':['rbf','linear'],
                  'C': np.logspace(-5, 5, 10),
                  'gamma':np.logspace(-5, 5, 10)}

scores = ['accuracy']

clf = GridSearchCV(SVC(max_iter=1000, tol = 1e-6, random_state = 42), 
                   tuned_parameters, cv=5, n_jobs = -1,
                  scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))

In [None]:
tuned_parameters={'kernel':['rbf','linear'],
                  'C': np.logspace(-5, 5, 10),
                  'gamma':np.logspace(-5, 5, 10)}

scores = ['f1']

clf = GridSearchCV(SVC(max_iter=1000, tol = 1e-6, random_state = 42), 
                   tuned_parameters, cv=5, n_jobs = -1,
                  scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))

### Logistic Regreesion Classifier

In [None]:
tuned_parameters = { 'C': np.logspace(-5, 10, 30),
                   'penalty': ['l1', 'l2'],
                   }

scores = ['accuracy']

clf = GridSearchCV(LogisticRegression(max_iter=1000, tol = 1e-6, random_state = 42), 
                   tuned_parameters, cv=5, n_jobs = -1,
                  scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)
labels = [0, 1]

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))

In [None]:
tuned_parameters = { 'C': np.logspace(-5, 10, 30),
                   'penalty': ['l1', 'l2'],
                   }

scores = ['f1']

clf = GridSearchCV(LogisticRegression(max_iter=1000, tol = 1e-6, random_state = 42), 
                   tuned_parameters, cv=5, n_jobs = -1,
                  scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)
labels = [0, 1]

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))

### Model Comparison

Discuss the benefits and drawbacks of both approaches for application to such problems.

a. Linear SVMs and logistic regression generally perform comparably in practice. Use SVM with a nonlinear kernel if you have reason to believe your data won't be linearly separable (or you need to be more robust to outliers than LR will normally tolerate). Otherwise, just try logistic regression first and see how you do with that simpler model. If logistic regression fails you, try an SVM with a non-linear kernel like a RBF. 

b. Set $p$ as number of features and $n$ as number of training examples. 

    1. If p >> n, apply logistic regression or linear SVMs.
    2. If p is small and n is intermediate, use SVM with a nonlinear kernel.
    3. If n >> p, create or add more features will help and then use logistics regression or linear SVMs.
    
c. Neural network likely to work well for most of these settings but might be slower to train.

## Question 2

The file `TNNASHVI.txt` in your data directory contains daily temperature readings for Nashville, courtesy of the [Average Daily Temperature Archive](http://academic.udayton.edu/kissock/http/Weather/). This data, as one would expect, oscillates annually. Using PyMC3, use a Gaussian process to fit a non-parametric regression model to this data, choosing an appropriate covariance function. Plot 10 regression lines drawn from your process.

In [None]:
%matplotlib inline

import pandas as pd
import pymc3 as pm

daily_temps = pd.read_table("../data/TNNASHVI.txt", sep='\s+', 
                            names=['month','day','year','temp'], na_values=-99)
daily_temps.head()

In [None]:
# daily temperature
daily_temps.loc[daily_temps['year'] > 2010, 'temp'].plot(style='b.', figsize=(10,6), grid=False)

In [None]:
# choose the temperature from 2011-2013
df = daily_temps[daily_temps.year > 2010]
df.isnull().sum()

#### Model Selection

The temperature varies very periodically. From the plot above, there seems no trend of increasing or decreasing gradually anually. Therefore, I choose to use two parts to fit the data: one is the seasonal part and the other is noise part. 

In [None]:
# Write your answer here

y = df.temp.values
t = np.arange(len(df.temp)).reshape(-1, 1)

In [None]:
with pm.Model() as model:
    # yearly periodic component x long term trend
    ls = pm.HalfCauchy('ls', 1)
    η = pm.HalfCauchy('η', 1)
    
    cov_seasonal = η**2 * pm.gp.cov.Periodic(1, period = 365, ls = ls)
    gp_seasonal = pm.gp.Marginal(cov_func=cov_seasonal)
    
    # noise model
    σ = pm.HalfCauchy("σ", 1)
    
    y_ = gp_seasonal.marginal_likelihood("y", X=t, y=y, noise=σ)

In [None]:
with model:
    start = pm.find_MAP(include_transformed=True)
    step = pm.Metropolis()
    trace = pm.sample(1000, step=step, start=start)
    pm.traceplot(trace, varnames=["η", "ls", "σ"])

In [None]:
with model:
    pred = gp_seasonal.conditional("temp_fit", t)
    samples = pm.sample_ppc(trace, vars=[pred], samples=10)

In [None]:
fig = plt.figure(figsize=(12,5)); ax = fig.gca()

plt.scatter(x=t, y=df.temp, c='b', s=50, label="observed data")

for x in samples['temp_fit']:
    plt.plot(t, x, label = "predicted data")
    
plt.xlabel("days from 2011"); plt.ylim([10,90]);
plt.title("Nashville Daily Temperature"); 
plt.legend();
plt.show()

## Question 3

Fit a series of random-forest classifiers to the Wisconsin breast cancer dataset (`wisconsin_breast_cancer.csv`), to explore the sensitivity to the parameters `max_features`, the number of variables considered for splitting at each step, `max_depth`, the maximum depth of the tree, and `n_estimators`, the number of trees in the forest. Use apprpriate metrics of performance, and include plots against a suitably-chosen range of values for these parameters.

Dataset description: Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. Ten real-valued features are computed for each cell nucleus:

- `radius` (mean of distances from center to points on the perimeter) 
- `texture` (standard deviation of gray-scale values) 
- `perimeter` 
- `area` 
- `smoothness` (local variation in radius lengths) 
- `compactness` (perimeter^2 / area - 1.0) 
- `concavity` (severity of concave portions of the contour) 
- `concave points` (number of concave portions of the contour) 
- `symmetry` 
- `fractal dimension` ("coastline approximation" - 1)

The outcome to be predicted is tumor type (M = malignant, B = benign).

In [None]:
# Write your work here
file = '/Users/ruiwang/source/BIOS_8366/data/wisconsin_breast_cancer.csv'
df = pd.read_csv(file)
df['diagnosis'].replace(['M','B'],[0,1],inplace=True)
df.drop(['id'], axis = 1, inplace = True)
df.head()

In [None]:
# look at the distribution of our interest: isbadbuy
prop = df['diagnosis'].value_counts()/len(df)
print(prop)
prop.plot(kind = 'pie')

From the above pie figure, the target we predict is almost a balanced dataset (63% vs 37%). Our prediction target is whether the breast cancer is malignant (1) or benign (0). For these similar problems, the analysis is sensitive to the false negative, which can be translated by several metrics like precision ($\frac {tp}{tp+fn}$), accuracy ($\frac {tn + tp} {tn + tp + fn + fp}$) or f1 score ($2*\frac {recall*precision}{recall + precision}$).

For classification, it is better to plot the confusion matrix or precision-recall curve (ROC curve).

#### Use recall as scoring to do parameter optimization.

In [None]:
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


y = df['diagnosis'].values
X = df.drop(['diagnosis'], axis = 1).values
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# specify parameters and distributions to sample from
tuned_parameters = {"n_estimators": [10, 20, 50, 100, 500], 
                    "max_depth": [3, 5, 10, 20], 
                    "max_features": [1, 3, 5, 10]
                    }

scores = ['recall']

clf = GridSearchCV(RandomForestClassifier(), 
                   tuned_parameters,
                   cv=5, n_jobs = -1, scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))   

#### Use accuracy as scoring to do parameter optimization.

In [None]:
# specify parameters and distributions to sample from
tuned_parameters = {"n_estimators": [10, 20, 50, 100, 500], 
                    "max_depth": [3, 5, 10, 20], 
                    "max_features": [1, 3, 5, 10]
                    }

scores = ['accuracy']

clf = GridSearchCV(RandomForestClassifier(), 
                   tuned_parameters,
                   cv=5, n_jobs = -1, scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))  

#### Use f1 score as scoring to do parameter optimization.

In [None]:
# specify parameters and distributions to sample from
tuned_parameters = {"n_estimators": [10, 20, 50, 100, 500], 
                    "max_depth": [3, 5, 10, 20], 
                    "max_features": [1, 3, 5, 10]
                    }

scores = ['f1']

clf = GridSearchCV(RandomForestClassifier(), 
                   tuned_parameters,
                   cv=5, n_jobs = -1, scoring='%s' % score)
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))  

## Question 4

Use a grid search to optimize the number of estimators and max_depth for a Gradient Boosted Decision tree using the Wisconsin breast cancer data. Plug this optimal ``max_depth`` into a *single* decision tree.  Does this single tree over-fit or under-fit the data? Repeat this for the Random Forest.  Construct a single decision tree using the ``max_depth`` which is optimal for the Random Forest.  Does this single tree over-fit or under-fit the data?

For parameters optimization in grid search, I choose to apply recall metric as scoring since it is most sensitive for patient features in cancer research.

### Gradient Boosted Decision tree

In [None]:
# Write your work here
from sklearn.ensemble import GradientBoostingClassifier

# specify parameters and distributions to sample from
tuned_parameters = {"n_estimators": [10, 20, 50, 100, 200, 300, 500, 1000], 
                    "max_depth": [3, 5, 10, 15, 20, 50] 
                    }

clf = GridSearchCV(GradientBoostingClassifier(), 
                   tuned_parameters,
                   cv=5, n_jobs = -1, scoring='recall')
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Optimize based on %s metric" % score)
print()
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"])) 

In [None]:
# check overfit or underfit
clf1 = GradientBoostingClassifier(n_estimators = 1, max_depth = 3)
clf1.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"])) 

From the above confusion matrix, we can conclude a single decision tree does not overfit for test data.

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# specify parameters and distributions to sample from
tuned_parameters = {"n_estimators": [10, 20, 50, 100, 200, 300, 500, 1000], 
                    "max_depth": [3, 5, 10, 15, 20, 50] 
                    }

clf = GridSearchCV(RandomForestClassifier(), 
                   tuned_parameters,
                   cv=5, n_jobs = -1, scoring='recall')
clf.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Optimize based on %s metric" % score)
print()
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Best score set found on development set:")
print()
print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"]))

In [None]:
# check overfit or underfit
clf1 = RandomForestClassifier(n_estimators = 1, max_depth = 5)
clf1.fit(X_train, y_train)
y_true, y_pred = y_test, clf.predict(X_test)

labels = [0, 1]

print("Detailed classification report:")
print()
print(classification_report(y_true, y_pred))
print()
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes = np.array(["0", "1"])) 

From the above confusion matrix, we can conclude a single tree using Random Forest does not overfit for test data.