In [11]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
# from matplotlib.pylab import rcParams

# Only the geometry/topology feature table is read in this notebook.
# The loaders for the other feature tables are kept below, disabled.
# mig = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_mig.txt')
# mig_sign = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_mig_signs.txt')
# energy_grad = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190421_features_energygrad.txt')
# other = pd.read_csv('/Users/xiaotingzhong/Documents/Matlab/Grain Tracking/data/190408_features_otherinfo.txt')
GEO_TOPO_PATH = '/Users/xiaotingzhong/Dropbox/190425_features_geo_topo.txt'
geo_topo = pd.read_csv(GEO_TOPO_PATH)

# Strip leading/trailing whitespace from the header names of each loaded table.
# mig.columns = mig.columns.str.strip()
# mig_sign.columns = mig_sign.columns.str.strip()
# energy_grad.columns = energy_grad.columns.str.strip()
# other.columns = other.columns.str.strip()
geo_topo.columns = geo_topo.columns.str.strip()


In [5]:
def plotConfusionMatrix(y_true, y_pred, classes,
                          normalize=True,
                          title=None,
                          rotate_xlabel=False,
                          cmap=plt.cm.Blues):
    """
    Plot the confusion matrix of `y_pred` against `y_true` as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels; passed unchanged to
        `sklearn.metrics.confusion_matrix`.
    classes : sequence
        Tick labels for both axes. They are applied by position, so they must
        follow the row/column order of the confusion matrix (sorted label
        order, since no explicit `labels` are passed to scikit-learn).
    normalize : bool, default True
        If True, every row is divided by its row sum, so each cell is the
        fraction of that true class. Rows whose sum is zero are left at 0.
    title : str or None, default None
        Despite the name, this is the output file path: when given, the figure
        is saved there at 300 dpi. No title text is drawn on the axes.
    rotate_xlabel : bool, default False
        If True, rotate the x tick labels by 45 degrees.
    cmap : matplotlib colormap, default `plt.cm.Blues`
        Colormap handed to `imshow`.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)
    if normalize:
        # A label that occurs only in y_pred has an all-zero row. Dividing by
        # that zero total would produce NaN cells and a NaN colour threshold,
        # so such rows are left at 0 instead.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    if rotate_xlabel:
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
                 rotation_mode="anchor")

    # Annotate every cell; switch to white text on cells darker than the
    # midpoint of the value range so the numbers stay readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()

    # `title` doubles as the output path (see docstring).
    if title:
        fig.savefig(title, dpi=300, bbox_inches="tight")

    return ax

def plotCorrelationMatrix(df, title=None):
    """
    Draw the lower triangle of the correlation matrix of `df` as an annotated heat map.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose pairwise column correlations (`df.corr()`) are plotted.
    title : str or None, default None
        Despite the name, this is the output file path: when given, the figure
        is saved there at 300 dpi. No title text is drawn on the axes.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    corr = df.corr()
    fig, ax = plt.subplots(figsize=(10, 10))

    # Mask the diagonal (self-correlations) and the redundant upper triangle.
    dropSelf = np.triu(np.ones(corr.shape, dtype=bool))

    # Let seaborn place the labels: heatmap cells are centred at i + 0.5, so
    # setting ticks at integer positions would put the labels on cell edges.
    # xticklabels/yticklabels=True labels every row and column by name.
    sns.heatmap(corr, cmap="Blues", annot=True, fmt=".2f", mask=dropSelf,
                square=True, xticklabels=True, yticklabels=True, ax=ax)
    ax.tick_params(axis="both", labelsize=16)

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    plt.setp(ax.get_yticklabels(), rotation=0)

    # `title` doubles as the output path (see docstring).
    if title:
        fig.savefig(title, dpi=300, bbox_inches="tight")

    # show plot
    plt.show()

    
def plotFeatureImportance(feature_importance, names, title = None):
    """
    Draw one bar per entry of `feature_importance` on the current axes.

    `names` must expose a `.values` attribute (e.g. a pandas Index or Series);
    those values become the x tick labels, rotated by 45 degrees. When `title`
    is truthy it is used as the file path the figure is saved to (300 dpi);
    no title text is drawn. Returns None.
    """
    axes = plt.gca()
    bar_positions = range(len(feature_importance))

    plt.bar(bar_positions, feature_importance)
    plt.yticks(fontsize=16)
    plt.xticks(bar_positions, names.values, size=16)
    plt.setp(axes.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    if not title:
        return
    plt.savefig(title, dpi=300, bbox_inches="tight")
    

### EDA
- If dihedral angles are used as features, rows containing NaNs must be filtered out first.

In [14]:
# Keep the first 19 columns as floats, then drop every row that has a NaN in
# any of them. `mask_nonan` is True for the rows that are kept.
geo_topo = geo_topo.iloc[:, :19].astype(float)
mask_nonan = geo_topo.notnull().all(axis=1)
geo_topo = geo_topo[mask_nonan]
geo_topo.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6891 entries, 0 to 6941
Data columns (total 19 columns):
da_len_w_an4_l       6891 non-null float64
da_len_w_an4_r       6891 non-null float64
da_len_w_an4_opp     6891 non-null float64
da_num_w_an4_l       6891 non-null float64
da_num_w_an4_r       6891 non-null float64
da_num_w_an4_opp     6891 non-null float64
A_an4                6891 non-null float64
fMs_an4              6891 non-null float64
avg_FabsavgH_an4     6891 non-null float64
C_an4                6891 non-null float64
CnnC_an4             6891 non-null float64
da_len_w_diff_l      6891 non-null float64
da_len_w_diff_r      6891 non-null float64
da_len_w_diff_opp    6891 non-null float64
da_num_w_diff_l      6891 non-null float64
da_num_w_diff_r      6891 non-null float64
da_num_w_diff_opp    6891 non-null float64
A_diff               6891 non-null float64
fMs_diff             6891 non-null float64
dtypes: float64(19)
memory usage: 1.1 MB


# Linear Regression
- Best Subset, Forward Selection, or Lasso? (Ryan J. Tibshirani)
    - Best subset selection and forward selection usually perform similarly.
    - Lasso is preferred when the signal-to-noise ratio (SNR) is low; best subset is preferred when the SNR is high.

### Linear Regression, Vanilla

In [34]:
# Preview the first five rows of the feature table (head() defaults to n=5).
geo_topo.head()


Unnamed: 0,da_len_w_an4_l,da_len_w_an4_r,da_len_w_an4_opp,da_num_w_an4_l,da_num_w_an4_r,da_num_w_an4_opp,A_an4,fMs_an4,avg_FabsavgH_an4,C_an4,CnnC_an4,da_len_w_diff_l,da_len_w_diff_r,da_len_w_diff_opp,da_num_w_diff_l,da_num_w_diff_r,da_num_w_diff_opp,A_diff,fMs_diff
0,2.786,85.712,115.764,151.881,92.356,110.083,1.923,0.017,7.0,20.166,-33.323,13.156,31.745,-34.101,2.355,4.766,1.547,0.325,-3.0
1,127.445,106.867,120.184,128.177,116.21,115.611,603.052,20.749,0.034,6.0,-11.067,-5.556,0.907,-4.264,0.314,3.952,65.258,-2.069,-0.032
2,129.292,89.502,133.71,131.027,100.322,128.65,300.189,8.257,0.028,3.0,-38.663,55.015,-8.857,-40.102,37.114,2.988,15.309,4.152,0.271
3,147.915,120.642,100.673,147.624,111.746,100.629,610.399,16.601,0.027,4.0,-14.561,12.422,-7.091,-16.245,21.879,-5.634,-57.55,-2.802,0.049
4,115.396,106.136,138.583,115.81,107.572,136.618,182.763,20.186,0.11,11.0,5.505,22.783,-28.402,-4.09,26.851,-22.761,-115.252,-9.741,0.085


### Linear Regression, Lasso

In [54]:
from sklearn.linear_model import Lasso

# y = energy_grad.iloc[:, 0]
# X = energy_grad.iloc[:, 1:]

# Active configuration: a single feature column and a single target column,
# both selected by position from geo_topo.
# NOTE(review): for the 19-column frame listed by geo_topo.info(), iloc[:, -3:-2]
# is 'da_num_w_diff_opp' and iloc[:, -2] is 'A_diff', while a later cell treats
# X.iloc[:, 0] as 'A_an4'. Confirm which columns are intended; selecting by
# name would make this explicit.
# X_da = pd.concat([energy_grad.iloc[:, 1], geo_topo.iloc[:, :5], other.iloc[:, 1:]], axis=1)
X_da = geo_topo.iloc[:, -3:-2]
# da = pd.concat([geo_topo.iloc[:, :3], geo_topo.iloc[:, 7:11]], axis=1)
da = geo_topo.iloc[:, -2]
y = da
# Rescale each feature column so that its maximum becomes 10.
X = X_da / X_da.max(axis=0) * 10

# X_dfms = pd.concat([energy_grad.iloc[:, 1], geo_topo.iloc[:, :5], other.iloc[:, 1:]], axis=1)
# dfms = geo_topo.iloc[:, 6]
# y = dfms
# X = X_dfms / X_dfms.max(axis=0)


lasso = Lasso(alpha=1)

lasso.fit(X, y)
# Single-argument print(...) calls produce the same text on Python 2 and 3;
# the original print statements were a SyntaxError on Python 3.
print(lasso.coef_)
# The score is R^2 on the training data itself (no held-out split is used here).
print('R2 score:  {}'.format(lasso.score(X, y)))


[27.37249422]
R2 score:  0.3692247907001277


In [None]:
# plotCorrelationMatrixHalf(pd.concat([dfms, X_dfms, geo_topo.C_diff], axis=1), 'dFMs_corr.png')
# Plot the per-feature maxima that X is scaled by. X_da is used because it is
# the frame X is actually built from; X_dfms exists only in commented-out code
# and would raise NameError here.
# NOTE(review): confirm X_da (rather than a re-enabled X_dfms) is the intended input.
plotFeatureImportance(X_da.max(axis=0).values, X_da.columns, 'X_da_max_values')

In [19]:
# Place the scaled feature column(s) X and the target y side by side and
# export them without the index column.
df = pd.concat((X, y), axis='columns')
df.to_csv('X_abs_Adiff_y_geo_topo.csv', index=False)

### Linear Regression, Forward Selection
- Refs:
    - https://planspace.org/20150423-forward_selection_with_statsmodels/
    - https://xavierbourretsicotte.github.io/subset_selection.html

In [35]:
# Evaluate a fixed quadratic in the first column of X (the scaled feature).
# NOTE(review): the coefficients are hard-coded and their origin is not
# recorded in this notebook; presumably they come from an external fit, TODO confirm.
# Alternative (disabled) coefficient set: A_diff = 107.8 + 42.81*A_an4 + 43.38*A_an4^2
A_an4 = X.iloc[:, 0]

c0, c1, c2 = 85.6, 116, 44.07
A_diff = c0 + c1 * A_an4 + c2 * (A_an4 ** 2)

In [36]:
from sklearn.metrics import r2_score

# R^2 of the hard-coded quadratic prediction A_diff against the target y.
r2_score(y_true=y, y_pred=A_diff)





0.20411978686751153