In [204]:
import pickle
from sklearn.metrics         import confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import sklearn.preprocessing as sk_pp
from sklearn.decomposition import PCA, KernelPCA
import pandas as pd
from sklearn.model_selection import GridSearchCV

# 1. Data Preparation

In assignment two, we used classification models to predict if the median earnings of a school's students exceeds $30.6k, which is the median value of the earnings in the training data. In this assignment, we will use random forest models to predict the same thing. However, the dimensionality of the model will be reduced. We want to know whether reduce dimensionality could increase the model accuracy.   

Before we start, let's import the data and add flag variable as we did in assignment two.

In [2]:
college = pickle.load(open("college.p","rb"))
college_label = pickle.load(open("college_label.p","rb"))
college_test = pickle.load(open("college_test.p","rb"))
college_test_label = pickle.load(open("college_test_label.p","rb"))

In [3]:
quantiles = college_label.quantile(q=0.5)

In [4]:
def median_code(data):
    if data < quantiles:
        return 0
    else:
        return 1

In [82]:
college_label_median = college_label.map(median_code)

In [83]:
college_test_label_median = college_test_label.map(median_code)

In [84]:
college_centered = college - college.mean(axis=0)
college_test_centered = college_test - college.mean(axis=0)

# 2. Choosing the right number of dimensions

We plan to use PCA in Scikit-Learn to implements PCA. There is a very important argument that we need to specify when using the PCA, that is the number of dimensions to reduce down to. Instead of arbitrarily choosing a number, we would like to choose the number of dimensions that add up to a sufficiently large portion of the variance.

In [176]:
pca = PCA()

In [178]:
pca.fit(college_centered)

PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [189]:
pca.explained_variance_ratio_[0:10]

array([ 0.39044101,  0.20941999,  0.07755925,  0.05390012,  0.03532041,
        0.03255564,  0.02523042,  0.02201691,  0.01682056,  0.01401534])

The first two components explanis close to 60% of the variances of the original dataset. Let's take a closer look at these two components.

In [190]:
E1, E2 = pd.DataFrame(pca.components_[0]), pd.DataFrame(pca.components_[1])
eigenvectors = pd.concat([features,E1,E2],axis = 1)
eigenvectors.columns = ['Features','E1','E2']
print(eigenvectors)

              Features        E1            E2
0   MedianFamilyIncome  0.504568  8.814541e-02
1     DebtNonCompleter  0.381319 -1.293074e-01
2             FirstGen -0.486701 -1.324599e-01
3              IndPerc -0.424426 -3.250677e-01
4             LoanPerc  0.127373 -6.345740e-01
5             NetPrice  0.258141 -5.676994e-01
6                   AK -0.132719 -2.402558e-01
7                   AL  0.147133 -1.423529e-02
8                   AR -0.014414  2.544911e-01
9                   AS -0.048196  4.528846e-02
10                  AZ  0.195082  5.741809e-03
11                  CA -0.138619 -3.349732e-02
12                  CO  0.001959 -4.597992e-07
13                  CT -0.010226 -1.753249e-02
14                  DC -0.000237  4.754260e-04
15                  DE  0.000304  3.375011e-03
16                  FL -0.001362  3.846081e-03
17                  FM -0.000044  4.012223e-04
18                  GA -0.003924 -5.350365e-03
19                  GU -0.010737 -3.199662e-03
20           

The first component is strongly correlated with *MedianFamilyIncome*, and *FirstGen*. To be more specific, the first component can be viewed as a measure of family impact. It increases if a campus has high median family income, and low percentage of students who are the first generation to be highly educated in their family.   

The second component is strongly correlated with *NetPrice* and *IndPerc*. This component can be viewed as a measure of the financial burden of students in that campus. If the campus has a low percent of financial independent student and high net price, indicating students have relative heavier financial burden, this componen will increases.

Before applying PCA to predictive models, we need to find out the number of dimensions that add up to a sufficiently large portion of the variance. In our case, we want to know how many dimensions that could explain 90% or 95% of the variance. 

In [93]:
cumsum = np.cumsum(pca.explained_variance_ratio_)

In [94]:
d1 = np.argmax(cumsum >= 0.90) + 1
d2 = np.argmax(cumsum >= 0.95) + 1
print(d1, d2)

13 25


Therefore, in the next steps, we will build random forest models with datasets contain 13 components and 25 components respectivily.

# 3.  Random Forest Model with 13 components

### 3.1 Random forest model with PCA

In [233]:
pca = PCA(n_components=d1, random_state = 42)
college_reduced = pca.fit_transform(college_centered)
rf_clf = RandomForestClassifier(random_state = 42)

#### Performance comparison on the train set

In [234]:
pred_rf_reduced = cross_val_predict(rf_clf, college_reduced, college_label_median, cv=3)
pred_rf = cross_val_predict(rf_clf, college, college_label_median, cv=3)

In [235]:
print('Random Forest with PCA\n' + classification_report(college_label_median, pred_rf_reduced)) 

Random Forest with PCA
             precision    recall  f1-score   support

          0       0.77      0.85      0.81      2403
          1       0.83      0.74      0.79      2404

avg / total       0.80      0.80      0.80      4807



In [236]:
print('Random Forest without PCA\n' + classification_report(college_label_median, pred_rf)) 

Random Forest without PCA
             precision    recall  f1-score   support

          0       0.80      0.87      0.83      2403
          1       0.86      0.78      0.82      2404

avg / total       0.83      0.82      0.82      4807



From the classification report, we can see that the random forest model without PCA outperforms the random forest model with PCA.

#### Performance comparison on the test set

In [237]:
college_test_reduced = pca.fit_transform(college_test_centered)
rf_clf.fit(college_reduced, college_label_median)
test_pred_reduced = rf_clf.predict(college_test_reduced)

In [238]:
print('Random Forest with PCA\n' + classification_report(college_test_label_median,test_pred_reduced)) 

Random Forest with PCA
             precision    recall  f1-score   support

          0       0.69      0.69      0.69       612
          1       0.70      0.70      0.70       631

avg / total       0.70      0.70      0.70      1243



In [239]:
rf_clf.fit(college, college_label_median)
test_pred = rf_clf.predict(college_test)

In [240]:
print('Random Forest without PCA\n' + classification_report(college_test_label_median,test_pred)) 

Random Forest without PCA
             precision    recall  f1-score   support

          0       0.86      0.92      0.89       612
          1       0.91      0.85      0.88       631

avg / total       0.89      0.88      0.88      1243



The random forest model performs even worse on the test set when we apply PCA to it.

##  3.2 Random forest model with Kernel PCA

In this part, the kernel PCA is applied to the random forest model. We will use grid search to select the kernel and hyperparameters that lead to the best performance.  The whole process can be separate into two steps:  

1. Reduce dimensionality to 13 dimensions using KPCA, then applying random forest classifier for classification. 
2. Use GridSearchCV to find the best kernel and gamma value for kPCA in order to get the best classification accuracy.

In [221]:
clf = Pipeline([
        ('kpca', KernelPCA(n_components=13, random_state = 42)),
        ('rf_clf', RandomForestClassifier(random_state = 42))
    ])

param_grid = [{
        "kpca__gamma" : np.linspace(0.05, 0.15, 5 ),
        "kpca__kernel" :['rbf', 'sigmoid','poly']
    }]

grid_search = GridSearchCV(clf, param_grid, cv = 3, scoring = 'accuracy')
grid_search.fit(college_reduced, college_label_median)

GridSearchCV(cv=3, error_score='raise',
       estimator=Pipeline(steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=None, kernel='linear',
     kernel_params=None, max_iter=None, n_components=13, n_jobs=1,
     random_state=42, remove_zero_eig=False, tol=0)), ('rf_clf', RandomForestCla...stimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kpca__kernel': ['rbf', 'sigmoid', 'poly'], 'kpca__gamma': array([ 0.05 ,  0.075,  0.1  ,  0.125,  0.15 ])}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='accuracy', verbose=0)

In [222]:
print(grid_search.best_params_)

{'kpca__kernel': 'rbf', 'kpca__gamma': 0.050000000000000003}


Let's use the above parameters to build a random forest model with kernal PCA applied on the training set.

In [229]:
clf = Pipeline([
        ('kpca', KernelPCA(n_components=13, random_state = 42, gamma=0.05, kernel='rbf')),
        ('rf_clf', RandomForestClassifier(random_state = 42))
    ])
clf.fit(college_centered, college_label_median)

Pipeline(steps=[('kpca', KernelPCA(alpha=1.0, coef0=1, copy_X=True, degree=3, eigen_solver='auto',
     fit_inverse_transform=False, gamma=0.05, kernel='rbf',
     kernel_params=None, max_iter=None, n_components=13, n_jobs=1,
     random_state=42, remove_zero_eig=False, tol=0)), ('rf_clf', RandomForestClassi...stimators=10, n_jobs=1, oob_score=False, random_state=42,
            verbose=0, warm_start=False))])

** Performance on training set **

In [228]:
clf_pred = cross_val_predict(clf, college_centered, college_label_median)
print('Random Forest with Kernel PCA\n' + classification_report(college_label_median,clf_pred)) 

Random Forest with Kernel PCA
             precision    recall  f1-score   support

          0       0.78      0.86      0.82      2403
          1       0.84      0.77      0.80      2404

avg / total       0.81      0.81      0.81      4807



The performance is still worse than the random forest model without kernel PCA.

** Performance on test set **

In [231]:
clf_pred_test = clf.predict(college_test_centered)
print('Random Forest with Kernel PCA\n' + classification_report(college_test_label_median,clf_pred_test)) 

Random Forest with Kernel PCA
             precision    recall  f1-score   support

          0       0.83      0.89      0.86       612
          1       0.88      0.83      0.85       631

avg / total       0.86      0.86      0.86      1243



The kernel PCA works better than linear PCA in the random forest model. But random forest model with complete dimensionality is still the best model.

# 4. Random Forest Model with 25 components 