In [111]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
# from matplotlib.pylab import rcParams

mig = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_mig.txt')
mig_sign = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_mig_signs.txt')
energy_grad = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190421_features_energygrad.txt')
geo_topo = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190421_features_geo_topo.txt')
other = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_otherinfo.txt')

mig.columns = mig.columns.str.strip()
mig_sign.columns = mig_sign.columns.str.strip()
energy_grad.columns = energy_grad.columns.str.strip()
geo_topo.columns = geo_topo.columns.str.strip()
other.columns = other.columns.str.strip()

In [83]:
def plotConfusionMatrix(y_true, y_pred, classes,
                          normalize=True,
                          title=None,
                          rotate_xlabel=False,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
#     if not title:
#         if normalize:
#             title = 'Normalized confusion matrix'
#         else:
#             title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    if rotate_xlabel:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    
    # save figure
    if title:
        plt.savefig(title, dpi=300, bbox_inches="tight")
    
    return ax

def plotCorrelationMatrix(df, title=None):
    # Create Correlation df
    corr = df.corr()
    # Plot figsize
    fig, ax = plt.subplots(figsize=(10, 10))

    # Drop self-correlations
    dropSelf = np.zeros_like(corr)
    dropSelf[np.triu_indices_from(dropSelf)] = True
    # Generate Heat Map, allow annotations and place floats in map
    sns.heatmap(corr, cmap="Blues", annot=True, fmt=".2f", mask=dropSelf, square=True)
    # Apply xticks
    plt.xticks(range(len(corr.columns)), corr.columns, size = 16)
    # Apply yticks
    plt.yticks(range(len(corr.columns)), corr.columns, size = 16)
    
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    
    # save figure
    if title:
        plt.savefig(title, dpi=300, bbox_inches="tight")
    
    # show plot
    plt.show()

    
def plotFeatureImportance(feature_importance, names, title = None):
    ax = plt.gca()
    plt.bar(range(len(feature_importance)), feature_importance)
    plt.yticks(fontsize=16)
    plt.xticks(range(len(feature_importance)), names.values, size = 16)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
         rotation_mode="anchor")
    if title:
        plt.savefig(title, dpi=300, bbox_inches="tight")
    

### EDA
- If using dihedral angles, then need to filter out nans

In [122]:
geo_topo.info()

geo_topo = geo_topo.astype(float)
mask_nonan = ~(np.any(np.isnan(geo_topo), axis=1))
geo_topo = geo_topo[mask_nonan]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7004 entries, 0 to 7003
Data columns (total 22 columns):
da_len_w_an4_l       7004 non-null object
da_len_w_an4_r       7004 non-null object
da_len_w_an4_opp     7004 non-null object
da_num_w_an4_l       7004 non-null object
da_num_w_an4_r       7004 non-null object
da_num_w_an4_opp     7004 non-null object
A_an4                7004 non-null float64
fMs_an4              7004 non-null float64
avg_FabsavgH_an4     7004 non-null float64
C_an4                7004 non-null float64
CnnC_an4             7004 non-null float64
da_len_w_diff_l      7004 non-null object
da_len_w_diff_r      7004 non-null object
da_len_w_diff_opp    7004 non-null object
da_num_w_diff_l      7004 non-null object
da_num_w_diff_r      7004 non-null object
da_num_w_diff_opp    7004 non-null object
A_diff               7004 non-null float64
fMs_diff             7004 non-null float64
avg_FabsavgH_diff    7004 non-null float64
C_diff               7004 non-null float64
Cn

# Linear Regression
- Best Subset, Forward Selection, or Lasso? (Ryan J. Tibshirani)
    - Best subset selection and forward selection usually perform similarly.
    - Lasso is preferred if low signal-to-noise ratio (SNR); Best subset if high SNR.

### Linear Regression, Vanilla

In [109]:
geo_topo.head(5)

Unnamed: 0,da_len_w_an4_l,da_len_w_an4_r,da_len_w_an4_opp,da_num_w_an4_l,da_num_w_an4_r,da_num_w_an4_opp,A_an4,fMs_an4,avg_FabsavgH_an4,C_an4,...,da_len_w_diff_r,da_len_w_diff_opp,da_num_w_diff_l,da_num_w_diff_r,da_num_w_diff_opp,A_diff,fMs_diff,avg_FabsavgH_diff,C_diff,CnnC_diff
0,152.786,121.502,85.712,151.881,115.764,92.356,119.766,2.857,0.024,7.0,...,1.292,13.976,-17.071,7.451,9.62,-9.178,-0.162,0.018,-3.0,-2.1
1,106.867,127.445,120.184,116.21,128.177,115.611,640.941,28.698,0.045,6.0,...,-20.702,8.61,0.116,-9.992,9.878,4.99,44.074,8.832,0.0,3.533
2,89.502,129.292,133.71,100.322,131.027,128.65,314.685,8.437,0.027,3.0,...,10.271,0.598,0.879,-3.353,2.475,-28.952,-4.128,0.143,1.0,1.667
3,120.642,147.915,100.673,111.746,147.624,100.629,622.16,24.919,0.04,4.0,...,-8.631,-11.772,18.854,-13.024,-5.83,19.548,-19.049,-0.974,-2.0,-6.125
4,115.396,106.136,138.583,115.81,107.572,136.618,196.775,32.469,0.165,11.0,...,-25.575,6.739,8.214,-8.287,0.073,-148.225,-23.246,0.157,-8.0,-6.771


### Linear Regression, Lasso

In [125]:
from sklearn.linear_model import Lasso

# y = energy_grad.iloc[:, 0]
# X = energy_grad.iloc[:, 1:]

# X_da = pd.concat([energy_grad.iloc[:, 1], geo_topo.iloc[:, :5], other.iloc[:, 1:]], axis=1)
X_da = geo_topo.iloc[:, :11]
da = geo_topo.iloc[:, 17]
y = da
X = X_da / X_da.max(axis=0) * 10

# X_dfms = pd.concat([energy_grad.iloc[:, 1], geo_topo.iloc[:, :5], other.iloc[:, 1:]], axis=1)
# dfms = geo_topo.iloc[:, 6]
# y = dfms
# X = X_dfms / X_dfms.max(axis=0)


lasso = Lasso(alpha=10)

lasso.fit(X, y)
print lasso.coef_
print 'R2 score: ', lasso.score(X, y)
# print 'accuracy: ', metrics.accuracy_score(lasso.predict(X), y)


[   0.            0.          -59.0662047     0.            0.
   -0.         -126.31933288 -275.99302604   32.13127708    0.
    0.        ]
R2 score:  0.2221902632187216


In [None]:
# plotCorrelationMatrixHalf(pd.concat([dfms, X_dfms, geo_topo.C_diff], axis=1), 'dFMs_corr.png')
plotFeatureImportance(X_dfms.max(axis=0).values, X.columns, 'X_dfms_max_values')

In [19]:
df = pd.concat([X, y], axis=1)

df.to_csv('X_abs_Adiff_y_geo_topo.csv', index=False)

### Linear Regression, Forward Selection
- Refs:
    - https://planspace.org/20150423-forward_selection_with_statsmodels/
    - https://xavierbourretsicotte.github.io/subset_selection.html

In [35]:
#  A_diff = 107.8 + 42.81*A_an4 + 43.38*A_an4^2

A_an4 = X.iloc[:,0]

# A_diff = 107.8 + 42.81*A_an4 + 43.38*(A_an4**2)
A_diff =  85.6 + 116*A_an4 + 44.07*(A_an4**2)

In [36]:
from sklearn.metrics import r2_score

r2_score(y, A_diff)





0.20411978686751153