In [1]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from matplotlib import pyplot as plt
from pandas.api.types import CategoricalDtype
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import preprocessing as proc

%matplotlib inline

# Matplotlib renamed its bundled seaborn styles to 'seaborn-v0_8*'; the bare
# 'seaborn' name was deprecated and later removed. Use whichever name this
# installation actually provides so the cell runs on old and new versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Default figure size and font sizes for the plots in this notebook.
plt.rcParams.update({
    'figure.figsize': (15, 5),
    'axes.titlesize': 20,
    'axes.labelsize': 16,
    'xtick.labelsize': 14,
    'ytick.labelsize': 14,
})

# Allow up to 1000 columns when a DataFrame is displayed.
pd.options.display.max_columns = 1000

# Location and file-name suffix handed to proc.cell_data in the loading cell.
DATA_PATH = '../cell-profiler/measurements'
SUFFIX = ''
# `intensity` is passed to proc.clean_data; `texture` and `zernike` are not
# referenced by any cell visible in this notebook.
intensity = True
texture = True
zernike = False

# NOTE(review): presumably silences TensorFlow's C++ logging; no TensorFlow
# import is visible in this notebook — confirm it is still needed.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'



In [4]:
# Build the per-cell measurement table with the project's preprocessing helper.
# NOTE(review): cytoplasm=False / biomarkers=False presumably leave those
# measurement sets out — confirm against preprocessing.cell_data.
measurements = proc.cell_data(data_path=DATA_PATH, suffix=SUFFIX, cytoplasm=False, biomarkers=False)

Total number of cells processed: 612.0

The numbers of cells and nuclei correspond to each other
The are no repeated column names: True
The are no repeated column names: True
Membrane features: (612, 111)
Chromatin features: (612, 184)
Full dataset has shape: (612, 288)


In [5]:
# Clean the table with the project's helper. Per the recorded output it reports
# the zero-variance features and the frame goes from 288 to 254 columns.
# Note that `measurements` is rebound here, so the uncleaned table is no longer
# reachable after this cell.
measurements = proc.clean_data(measurements, intensity)

Initial shape is: (612, 288)
Features with zero variance:
 Index(['centerZ_cell', 'eulernumber_cell', 'children_cytoplasm_count_cell',
       'loc_centermassintensityZ_wga_cell', 'loc_centerZ_cell',
       'loc_maxintensityZ_wga_cell', 'centerZ_nucl', 'eulernumber_nucl',
       'children_cytoplasm_count_nucl', 'loc_centermassintensityZ_dapi',
       'loc_centermassintensityZ_wga_nucl', 'loc_centerZ_nucl',
       'loc_maxintensityZ_dapi', 'loc_maxintensityZ_wga_nucl'],
      dtype='object')

After cleaning the dataset has 612 rows and 254 columns.



## Feature selection

### Explore groups of features

In [None]:
list(measurements.columns)

In [None]:
feature_group = []

###### Cell Shape

In [None]:
cell_shape = measurements.loc[:, 'area_cell' : 'solidity_cell']
print("Number of cell shape features:", cell_shape.shape[1])

In [None]:
# Pairwise correlations between the cell shape features.
correlations = cell_shape.corr()

# seaborn hides cells whose mask entry is truthy: flag every pair with
# |r| < 0.9 so only the strongly correlated pairs stay visible.
mask = np.where(correlations.abs() < 0.9, 1.0, 0.0)

sns.set(rc={'figure.figsize': (12, 10)})

sns.heatmap(
    correlations,
    mask=mask,
    annot=True,
    vmin=-1,
    vmax=1,
    cmap=sns.color_palette("RdBu_r", 100),
);

In [None]:
# Rebuild the frame from `measurements` and drop the redundant columns
# out-of-place. Re-running the cell no longer raises KeyError (the columns used
# to be gone already on the second run) and nothing is edited in place on a
# slice of `measurements`.
cell_shape = measurements.loc[:, 'area_cell' : 'solidity_cell'].drop(
    columns=['majoraxislength_cell', 'maxferetdiameter_cell',
             'maximumradius_cell', 'medianradius_cell',
             'minferetdiameter_cell'])

print("Selected {} cell shape features:".format(cell_shape.shape[1]))
# Group id 1 for every kept cell shape feature. The extend is cumulative, so
# reset `feature_group` to [] before re-running the selection cells.
feature_group.extend([1] * cell_shape.shape[1])

###### Cell Zernike

In [None]:
cell_zern = measurements.loc[:, 'zernike_0_0_cell' : 'zernike_9_9_cell']
print("Number of cell zernike features:", cell_zern.shape[1])

In [None]:
correlations = cell_zern.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=False,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
print("Selected {} cell zernike features:".format(cell_zern.shape[1]))
feature_group.extend([1] * cell_zern.shape[1])

###### Cell Intensity

In [None]:
cell_int = measurements.loc[:, 'integratedintensityedge_wga_cell' : 'upperquartileintensity_wga_cell'] 

print("Number of cell intensity features:", cell_int.shape[1])

In [None]:
correlations = cell_int.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=True,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
# Rebuild from `measurements` and drop out-of-place so the cell can be re-run
# without a KeyError and without editing a slice of `measurements` in place.
cell_int = measurements.loc[
    :, 'integratedintensityedge_wga_cell' : 'upperquartileintensity_wga_cell'
].drop(columns=['medianintensity_wga_cell', 'upperquartileintensity_wga_cell',
                'maxintensityedge_wga_cell', 'minintensity_wga_cell'])

print("Selected cell intensity features:", cell_int.shape[1])
# Group id 2 for every kept cell intensity feature. The extend is cumulative, so
# reset `feature_group` to [] before re-running the selection cells.
feature_group.extend([2] * cell_int.shape[1])

###### Cell Neighbours

In [None]:
cell_neighb = measurements.loc[:, 'anglebetweenneighbors_cell' : 'secondclosestdistance_cell']

print("Number of cell neighbours features:", cell_neighb.shape[1])

In [None]:
correlations = cell_neighb.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=True,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
print("Selected cell neighbours features:", cell_neighb.shape[1])
feature_group.extend([3] * cell_neighb.shape[1])

###### Cell Texture

In [None]:
cell_tex = measurements.loc[:, 'angularsecondmoment_wga_00_cell' : 'variance_wga_03_cell']

print("Number of cell texture features:", cell_tex.shape[1])

In [None]:
correlations = cell_tex.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=False,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
cell_tex = measurements[['angularsecondmoment_wga_00_cell', 'contrast_wga_00_cell', 
                         'correlation_wga_01_cell', 'correlation_wga_02_cell',
                         'differencevariance_wga_00_cell', 'entropy_wga_00_cell', 
                         'infomeas1_wga_00_cell', 'infomeas1_wga_02_cell', 
                         'infomeas2_wga_00_cell', 'sumaverage_wga_00_cell', 
                         'variance_wga_00_cell']]

print("Selected cell texture features:", cell_tex.shape[1])
feature_group.extend([4] * cell_tex.shape[1])

###### Nuclear Shape

In [None]:
nucl_shape = measurements.loc[:,  'area_nucl' : 'solidity_nucl']

print("Number of nuclear shape features:", nucl_shape.shape[1])

In [None]:
correlations = nucl_shape.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=True,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
# Rebuild from `measurements` and drop out-of-place so the cell can be re-run
# without a KeyError and without editing a slice of `measurements` in place.
nucl_shape = measurements.loc[:, 'area_nucl' : 'solidity_nucl'].drop(
    columns=['maxferetdiameter_nucl', 'maximumradius_nucl',
             'medianradius_nucl', 'minferetdiameter_nucl',
             'minoraxislength_nucl', 'perimeter_nucl'])

print("Selected nuclear shape features:", nucl_shape.shape[1])
# Group id 5 for every kept nuclear shape feature. The extend is cumulative, so
# reset `feature_group` to [] before re-running the selection cells.
feature_group.extend([5] * nucl_shape.shape[1])

###### Nuclear Zernike

In [None]:
nucl_zern = measurements.loc[:, 'zernike_0_0_nucl' : 'zernike_9_9_nucl']
print("Number of nuclear zernike features:", nucl_zern.shape[1])

In [None]:
correlations = nucl_zern.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=False,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
# No nuclear Zernike feature is removed; the printed count is the full slice.
print("Selected {} nuclear zernike features:".format(nucl_zern.shape[1]))
# NOTE(review): id 1 is the id given to the cell shape and cell Zernike
# features, while the nuclear shape features use 5 — confirm this is intended
# and not a copy-paste slip.
feature_group.extend([1] * nucl_zern.shape[1])

###### Nuclear Intensity

In [None]:
nucl_int = measurements.loc[:, 'integratedintensityedge_dapi' : 'upperquartileintensity_wga_nucl'] 

print("Number of nuclear intensity features:", nucl_int.shape[1])

In [None]:
dapi_columns = [col for col in nucl_int.columns if 'dapi' in col]
wga_columns = [col for col in nucl_int.columns if 'wga' in col]

In [None]:
correlations = nucl_int.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=False,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
# Rebuild from `measurements` and drop out-of-place so the cell can be re-run
# without a KeyError and without editing a slice of `measurements` in place.
nucl_int = measurements.loc[
    :, 'integratedintensityedge_dapi' : 'upperquartileintensity_wga_nucl'
].drop(columns=['lowerquartileintensity_dapi', 'medianintensity_dapi',
                'upperquartileintensity_dapi', 'minintensityedge_dapi',
                'minintensity_dapi',
                'lowerquartileintensity_wga_nucl', 'medianintensity_wga_nucl',
                'upperquartileintensity_wga_nucl', 'maxintensityedge_wga_nucl',
                'meanintensityedge_wga_nucl'])

print("Seleted nuclear intensity features:", nucl_int.shape[1])
# Group id 6 for every kept nuclear intensity feature. The extend is cumulative,
# so reset `feature_group` to [] before re-running the selection cells.
feature_group.extend([6] * nucl_int.shape[1])

###### Nuclei neighbours

In [None]:
nucl_neighb = measurements.loc[:, 'anglebetweenneighbors_nucl' :'secondclosestdistance_nucl']

print("Number of nuclear neighbours features:", nucl_neighb.shape[1])

In [None]:
correlations = nucl_neighb.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=True,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
# Rebuild from `measurements` and drop out-of-place so the cell can be re-run
# without a KeyError and without editing a slice of `measurements` in place.
nucl_neighb = measurements.loc[
    :, 'anglebetweenneighbors_nucl' : 'secondclosestdistance_nucl'
].drop(columns=['percenttouching_nucl'])

print("Seleted nuclear neighbours features:", nucl_neighb.shape[1])
# Group id 7 for every kept nuclear neighbour feature. The extend is cumulative,
# so reset `feature_group` to [] before re-running the selection cells.
feature_group.extend([7] * nucl_neighb.shape[1])

###### Nuclear Texture

In [None]:
nucl_tex = measurements.loc[:, 'angularsecondmoment_dapi_00' : 'variance_wga_03_nucl']

print("Number of nuclear texture features:", nucl_tex.shape[1])

In [None]:
dapi_columns = [col for col in nucl_tex.columns if 'dapi' in col]
wga_columns = [col for col in nucl_tex.columns if 'wga' in col]

In [None]:
dapi_columns

In [None]:
correlations = nucl_tex.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=False,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
nucl_tex = measurements[['angularsecondmoment_dapi_00',
                         'contrast_dapi_00', 'contrast_dapi_01', 
                         'contrast_dapi_02',
                         'correlation_dapi_00', 'correlation_dapi_01', 
                         'correlation_dapi_02', 'correlation_dapi_03', 
                         'differenceentropy_dapi_00', 'differencevariance_dapi_00',
                         'entropy_dapi_00', 'infomeas1_dapi_00', 
                         'infomeas2_dapi_00', 'sumaverage_dapi_00', 
                         'variance_dapi_00', 
                         'angularsecondmoment_wga_00_nucl', 
                         'contrast_wga_00_nucl', 
                         'correlation_wga_00_nucl', 'correlation_wga_01_nucl', 
                         'correlation_wga_02_nucl', 'correlation_wga_03_nucl', 
                         'differenceentropy_wga_00_nucl', 'differencevariance_wga_00_nucl',
                         'entropy_wga_00_nucl', 'infomeas1_wga_00_nucl', 
                         'infomeas2_wga_00_nucl', 'inversedifferencemoment_wga_00_nucl',
                         'sumaverage_wga_00_nucl', 'variance_wga_00_nucl']]

print("Seleted nuclear texture features:", nucl_tex.shape[1])
feature_group.extend([8] * nucl_tex.shape[1])

###### Distances

In [None]:
dist = measurements.loc[:, 'fartherstpoint_cell' :'nucleusshift']

print("Number of distance measurement features:", dist.shape[1])

In [None]:
correlations = dist.corr();
mask = np.zeros_like(correlations)
mask[abs(correlations) < 0.9] = 1

sns.set(rc={'figure.figsize':(12, 10)})

sns.heatmap(correlations, 
            mask=mask, annot=True,
            vmin=-1, vmax=1,
            cmap=sns.color_palette("RdBu_r", 100));

In [None]:
print("Seleted distance measurement features:", dist.shape[1])
feature_group.extend([9] * dist.shape[1])

### Combine into a new data frame

In [None]:
# Dataset with reduced number of features.
# Each concatenated frame is paired with the group id assigned to it in the
# selection cells, so the colour-bar vector is derived from exactly the frames
# that end up in `measurements_`. The intensity and texture frames are not
# concatenated, so their ids (2, 4, 6, 8) no longer pad `feature_group`.
# NOTE(review): if those frames were meant to be included, add them to this
# list with their ids and the rest follows.
selected_frames = [(cell_shape, 1), (cell_zern, 1), (cell_neighb, 3),
                   (nucl_shape, 5), (nucl_zern, 1), (nucl_neighb, 7),
                   (dist, 9)]

# Single-row 2-D array (1, n_features), the input shape sns.heatmap needs.
# Built from scratch, so re-running the cell does not nest it one level deeper.
# Columns dropped from `measurements_` later on are not removed from it.
feature_group = np.asarray([[group_id
                             for frame, group_id in selected_frames
                             for _ in range(frame.shape[1])]])

measurements_ = pd.concat([measurements.loc[:, 'label' : 'well']]
                          + [frame for frame, _ in selected_frames], axis=1)

# Numeric copy of the stiffness label, used as regression target and for
# correlations further down.
measurements_['stiffness_num'] = pd.to_numeric(measurements_.stiffness)

measurements_.shape

In [None]:
# Numeric columns, and the same set without the numeric stiffness target.
numeric_cols = measurements_.select_dtypes(include=[np.number]).columns
feature_cols = numeric_cols.drop('stiffness_num')

### Correlation between features

In [None]:
# Correlate the numeric columns only. `measurements_` also carries the metadata
# columns ('label' .. 'well'), and recent pandas versions raise on non-numeric
# columns instead of silently skipping them.
p_corr = measurements_.select_dtypes(include=[np.number]).corr()

# Hide every pair with |r| < 0.9 so only strong correlations are drawn.
mask = np.zeros_like(p_corr)
mask[abs(p_corr) < 0.9] = 1

sns.heatmap(p_corr, vmin=-1, vmax=1, 
            mask=mask,
            cmap=sns.color_palette("RdBu_r", 100),
            xticklabels=False, yticklabels=True);

In [None]:
# Use |r| so strongly negative pairs are listed too, matching the |r| < 0.9
# mask used for the heatmap. The "> 1" discounts each column's correlation
# with itself.
[col for col in p_corr.columns if (p_corr[col].abs() >= 0.9).sum() > 1]

In [None]:
p_corr[p_corr.extent_cell > 0.9]

In [None]:
# Out-of-place and tolerant of the column already being gone, so the cell can
# be re-run without a KeyError.
measurements_ = measurements_.drop(columns=['zernike_0_0_cell'], errors='ignore')

In [None]:
# None of the texture frames is concatenated into `measurements_` in the combine
# cell, so a strict drop raises KeyError on these names. Ignore missing labels,
# which also makes the cell safe to re-run.
measurements_ = measurements_.drop(
    columns=['sumaverage_wga_00_cell',
             'variance_wga_00_cell',
             'sumaverage_dapi_00',
             'sumaverage_wga_00_nucl',
             'variance_dapi_00',
             'variance_wga_00_nucl'],
    errors='ignore')

In [None]:
sns.set(rc={'figure.figsize':(15, 12)})

# Numeric columns only: recent pandas versions raise on the non-numeric
# metadata columns instead of skipping them.
p_corr = measurements_.select_dtypes(include=[np.number]).corr()

sns.heatmap(p_corr, cmap=sns.color_palette("RdBu_r", 100), vmin=-1, vmax=1, 
            xticklabels=False, yticklabels=False);

# savefig does not create missing directories; make sure the target exists.
os.makedirs('../results', exist_ok=True)
plt.savefig('../results/Morph feature selection corr heatmap.png', bbox_inches='tight', dpi=300);

In [None]:
measurements_.shape

### Correlation with stiffness

In [None]:
sns.set(rc={'figure.figsize':(15, 2)})

sns.heatmap(p_corr[['stiffness_num']].T, 
            vmin=-1, vmax=1, 
            cmap=sns.color_palette("RdBu_r", 100));

## Other approaches

###### Regress `medianintensity_wga_cell` on `area_cell` and `meanintensity_wga_cell`

In [None]:
initial_set = ['area_cell', 'meanintensity_wga_cell']
Xk = np.array(measurements[initial_set])
x = np.array(measurements.medianintensity_wga_cell)
Xk.shape, x.shape

In [None]:
# Ordinary least-squares fit of the new feature on the initial set.
# LinearRegression is imported in the notebook's import cell.
lin_reg = LinearRegression()

lin_reg.fit(Xk, x)
print(lin_reg.intercept_, lin_reg.coef_)

# Prediction and residual: the part of the new feature that the initial set
# reproduces, and what is left over.
x_hat = lin_reg.predict(Xk)
u = x - x_hat

In [None]:
# `correlations` is reassigned by every per-group heatmap cell and last holds
# the distance-feature matrix, which presumably lacks these labels. Compute the
# coefficient from `measurements` so the cell does not rely on stale state.
print(measurements['area_cell'].corr(measurements['medianintensity_wga_cell']))
plt.plot(Xk[:, 0], label='Initial set: Area cell');
plt.plot(x, label='New feature: Median intensity');
plt.legend();

In [None]:
# `correlations` is reassigned by every per-group heatmap cell and last holds
# the distance-feature matrix, which presumably lacks these labels. Compute the
# coefficient from `measurements` so the cell does not rely on stale state.
print(measurements['meanintensity_wga_cell'].corr(measurements['medianintensity_wga_cell']))
plt.plot(Xk[:, 1], label='Initial set: Mean intensity');
plt.plot(x, label='New feature: Median intensity');
plt.legend();

In [None]:
plt.plot(x, label='New feature: Median intensity');
plt.plot(x_hat, label='Prediction');
plt.plot(u, label='Residual');
plt.legend();

###### Create an artificial response variable and compare $R^2$

In [None]:
def step_wise_regression(X):
    """Plot R^2 of linear fits on a growing prefix of the columns of ``X``.

    The response is artificial: the row-wise sum of all columns of ``X``. A
    linear model is fitted on the first ``k`` columns for every
    ``k = 1 .. n_columns`` and its R^2 on the same data is plotted against ``k``.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table, one row per sample.

    Returns
    -------
    None
        The shapes of ``X`` and the response are printed and the curve is drawn
        on the current matplotlib axes.
    """
    from sklearn.linear_model import LinearRegression

    y = X.sum(axis=1)
    print(X.shape, y.shape)

    lin_reg = LinearRegression()
    # The upper bound is inclusive of the full feature set, so the last point
    # of the curve is the fit on all columns.
    n_features = list(range(1, X.shape[1] + 1))
    r2 = []
    for k in n_features:
        prefix = X.iloc[:, :k]
        lin_reg.fit(prefix, y)
        r2.append(lin_reg.score(prefix, y))

    # Plot against k so the x axis reads as "number of features used".
    plt.plot(n_features, r2)

In [None]:
step_wise_regression(measurements[feature_cols])

###### Delete columns that are highly correlated with others

In [None]:
def select_low_corr(X, y):
    """Return the columns of ``X`` left after greedy removal of correlated ones.

    Columns are visited left to right. Every column that is still kept removes
    all later columns whose absolute Pearson correlation with it is >= 0.9.
    A column that has itself been removed never removes others, so a feature is
    only discarded when a feature it duplicates stays in the selection.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table.
    y : object
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.Index
        The retained column labels, in their original order.
    """
    abs_corr = X.corr().abs().values
    n_cols = abs_corr.shape[0]
    keep = np.full((n_cols,), True, dtype=bool)

    for i in range(n_cols):
        # A removed column must not knock out further columns: its own
        # information is already represented by the column that removed it.
        if not keep[i]:
            continue
        for j in range(i + 1, n_cols):
            # NaN correlations (e.g. constant columns) compare False and are kept.
            if abs_corr[i, j] >= 0.9:
                keep[j] = False

    return X.columns[keep]

In [None]:
# `feature_cols` was computed before the manual drops and can name columns that
# are no longer in `measurements_`; work from the numeric columns that exist now.
current_features = measurements_.select_dtypes(include=[np.number]).columns.drop('stiffness_num')
selected_cols = select_low_corr(measurements_[current_features], measurements_.stiffness)

# Remove the rejected columns in one out-of-place call instead of a list
# comprehension evaluated only for its side effects.
measurements_ = measurements_.drop(columns=current_features.difference(selected_cols))

In [None]:
print("After feature selection the dataset has {} rows and {} columns.".format(measurements_.shape[0], 
                                                                               measurements_.shape[1]))
print(measurements_.dtypes.value_counts())
measurements_.head()

In [None]:
# Redefine numeric columns and feature columns
numeric_cols = measurements_.select_dtypes(include=[np.number]).columns
feature_cols = measurements_.select_dtypes(include=[np.number]).drop(['stiffness_num'], axis=1).columns

## Normalisation

In [None]:
# Work on a copy so the unscaled values stay available in `measurements_`.
measurements_norm = measurements_.copy()
# Column-wise z-score: subtract each feature's mean and divide by its
# standard deviation (pandas' default, ddof=1).
measurements_norm[feature_cols] = measurements_norm[feature_cols].pipe(
    lambda features: (features - features.mean()) / features.std())
measurements_norm.head(3)

### Visualise all the features

In [None]:
sns.set(rc={'figure.figsize':(15, 12)})

sns.heatmap(measurements_norm[feature_cols],
            vmin = -1.5, vmax = 1.5,
            cmap=sns.color_palette("RdBu_r", 100),
            cbar_kws={"aspect": 20},
            xticklabels=False, yticklabels=False);

#plt.savefig('../results/Morph feature selection heatmap.png', bbox_inches='tight', dpi=300);

### Visualise aggregated values

In [None]:
mean_per_stiffness = measurements_norm.groupby('stiffness_num')[feature_cols].mean()
mean_per_stiffness

In [None]:
sns.set(rc={'figure.figsize':(15, 3)})

sns.heatmap(mean_per_stiffness, 
            vmin = -1.5, vmax = 1.5,
            cmap=sns.color_palette("RdBu_r", 100), 
            cbar_kws={"aspect": 5},
            xticklabels=False, yticklabels=False);
plt.ylabel('')

#plt.savefig('../results/Morph feature selection agg heatmap.png', bbox_inches='tight', dpi=300);

#### Colour-code groups of features

In [None]:
# Nine-colour Set3 palette used as the colour map for the feature-group bar.
# The ninth entry is replaced by the tenth colour of the ten-colour Set3 palette.
last_colour = sns.color_palette("Set3", 10)[9]
my_palette = sns.color_palette("Set3", 9)
my_palette[8] = last_colour

In [None]:
sns.set(rc={'figure.figsize':(15, 0.3)})

sns.heatmap(feature_group, 
            cbar=True, cmap=my_palette, 
            xticklabels=False, yticklabels=False);

plt.savefig('../results/Feature groups cbar.png', bbox_inches='tight', dpi=300);

## Clustering

> `sns.clustermap(metric="correlation")` doesn't work. Will getting rid of redundant features help?

In [None]:
sns.clustermap(measurements_norm[feature_cols], 
               metric='euclidean', method='ward', 
               col_cluster=False,
               cmap=sns.color_palette('RdBu_r', 100), robust=True);

In [None]:
# Prepare a vector of colours mapped to the 'stiffness' column.
# The palette is sized from the data: with a fixed 7 colours, zip() would
# silently leave any further stiffness level without a colour (NaN row colour).
stiffness_levels = measurements_norm.stiffness.unique()
my_palette = dict(zip(stiffness_levels,
                      sns.color_palette("Set3", len(stiffness_levels))))
row_colors = measurements_norm.stiffness.map(my_palette)

print(list(my_palette.keys()))
sns.palplot(my_palette.values())

In [None]:
sns.clustermap(measurements_norm[feature_cols], 
               metric='euclidean', method='ward', 
               col_cluster=False,
               cmap=sns.color_palette('RdBu_r', 100), robust=True, 
               row_colors=row_colors);

#plt.savefig('../results/Clustering.png', bbox_inches='tight', dpi=300);

## PCA

In [None]:
from sklearn.preprocessing import StandardScaler

# Separating out the features
X = measurements_.loc[:, feature_cols].values

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_scaled.shape

In [None]:
# PCA cannot extract more components than min(n_samples, n_features); cap the
# requested 50 so the cell still runs after aggressive feature selection.
n_components = min(50, *X_scaled.shape)
# Create a PCA object. random_state pins the result when scikit-learn picks a
# randomized solver for this input size.
pca = PCA(n_components=n_components, random_state=1)

# Apply PCA to the scaled feature matrix
principal_components = pca.fit_transform(X_scaled)

pc_cols = [('pc_' + str(i)) for i in range(1, n_components + 1)]
# concat aligns on index labels. Give the component frame the index of
# `measurements_` so rows stay paired even when that index is not 0..n-1.
pc_df = pd.concat([measurements_.loc[:, 'label' : 'well'],
                   pd.DataFrame(data=principal_components, columns=pc_cols,
                                index=measurements_.index)],
                  axis=1)

pc_df.shape

In [None]:
plt.rcParams['figure.figsize'] = (12, 5)

plt.plot(X_scaled[0]);

In [None]:
# `explained_variance_` holds absolute variances, not percentages. The share of
# total variance is `explained_variance_ratio_`; accumulate it to get the
# percentage explained by the first k components.
cumulative_variance = 100 * np.cumsum(pca.explained_variance_ratio_)

print(pca.explained_variance_.shape)
print("Percentage of explained variance:\n", cumulative_variance)

# Smallest number of components whose cumulative share reaches 98 %. Falls back
# to all fitted components when the threshold is never reached.
n_pc = min(int(np.searchsorted(cumulative_variance, 98.0)) + 1,
           len(cumulative_variance))
print("\nThe first {} principal components explain {} variance".
      format(n_pc, cumulative_variance[n_pc - 1]))

In [None]:
plt.plot(pca.explained_variance_);

#plt.savefig('../results/Explained variance.png', bbox_inches='tight', dpi=300);

In [None]:
pc_df[pc_cols].T.plot(legend=False);

#plt.savefig('../results/Principal components.png', bbox_inches='tight', dpi=300);

In [None]:
plt.plot(pca.components_[0]);

In [None]:
first_component = zip(feature_cols, pca.components_[0])

for col, weight in first_component:
    if abs(weight) > 0.2:
        print(col, weight)

In [None]:
# One colour per stiffness level. The palette is sized from the data so no
# level is left without a colour (a fixed 7 would be truncated by zip()).
stiffness_levels = pc_df.stiffness.unique()
my_palette = dict(zip(stiffness_levels,
                      sns.color_palette("Set3", len(stiffness_levels))))
row_colors = pc_df.stiffness.map(my_palette)

# Hierarchical clustering of the cells on the first n_pc principal components.
sns.clustermap(pc_df.loc[:,'pc_1' : 'pc_' + str(n_pc)], 
               metric='euclidean', method='ward', 
               col_cluster=False,
               cmap=sns.color_palette('RdBu_r', 40), robust=True, 
               row_colors=row_colors);

#plt.savefig('../results/PCA Clustering 50.png', bbox_inches='tight', dpi=300);

In [None]:
print(list(my_palette.keys()))
sns.palplot(my_palette.values())

### Random Forest Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score


X = measurements_[feature_cols]
y = measurements_.stiffness
 
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape, X_test.shape

In [None]:
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [None]:
# Seed the forest so scores and feature importances are reproducible across
# runs (the train/test split is seeded the same way).
rfc = RandomForestClassifier(n_estimators=100, random_state=1)
rfc.fit(X_train, y_train)

In [None]:
y_pred = rfc.predict(X_test)
print("Testing accuraccy:", accuracy_score(y_test, y_pred))
print("Testing f-score:", f1_score(y_test, y_pred, average='weighted'))

In [None]:
np.unique(y_pred)

In [None]:
feature_importance = zip(feature_cols, rfc.feature_importances_)

for col, weight in feature_importance:
    if (weight > 0.02):
        print(col, weight)

### Random Forest Regressor

In [None]:
# Regression variant: same feature columns, numeric stiffness as the target.
# All estimators and metrics used below are imported in the notebook's import
# cell.
X = measurements_[feature_cols]
y = measurements_.stiffness_num

X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train.shape, X_test.shape

In [None]:
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [None]:
# Seed the forest so the error and feature importances are reproducible across
# runs (the train/test split is seeded the same way).
rfr = RandomForestRegressor(random_state=1)
rfr.fit(X_train, y_train)

In [None]:
y_pred = rfr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
np.sqrt(mse)

In [None]:
# Importances are ordered like the columns the model was fitted on
# (`feature_cols`). Pairing them with all columns of `measurements_` would shift
# every name by the leading metadata columns.
feature_importance = zip(feature_cols, rfr.feature_importances_)

for col, weight in feature_importance:
    if (weight > 0.025):
        print(col, weight)

### k-Means

In [None]:
X = measurements_[feature_cols]
#X = measurements_syn.loc[measurements_syn.stiffness != "8.0", feature_cols]

X.shape

In [None]:
# Standardise the features before clustering so that no feature dominates the
# distances through its scale. Every name used here is imported in the
# notebook's import cell.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto',
# so leaving it out gives version-dependent clusterings (and a FutureWarning on
# the versions in between).
km = KMeans(n_clusters=3, n_init=10, random_state=1)
km.fit(X_scaled)

# Store the cluster assignment next to the measurements. The new column is
# numeric, so recompute `feature_cols` with care after this point.
measurements_['cluster'] = km.labels_

In [None]:
sns.scatterplot(data=measurements_, x='stiffness', y='cluster', 
                palette=sns.color_palette("husl", 7), 
                alpha=0.1, s=100);

In [None]:
pd.crosstab(measurements_.stiffness, 
            measurements_.cluster).plot(kind='bar');
plt.xlabel("Stiffness")
plt.ylabel("Count")
plt.title("Number of Cells from Each Cluster");
#plt.savefig('../results/Clustered Cells.png', bbox_inches='tight', dpi=300);

In [None]:
# FIXME(review): `measurements_syn` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh Restart & Run All unless it
# is created elsewhere — confirm, or remove the cell.
# Bar chart of cluster counts per stiffness level, excluding rows whose
# stiffness equals the string "8.0".
pd.crosstab(measurements_syn[measurements_syn.stiffness != "8.0"].stiffness, 
            measurements_syn[measurements_syn.stiffness != "8.0"].cluster).plot(kind='bar');
plt.xlabel("Stiffness")
plt.ylabel("Count")
plt.title("Number of Cells from Each Cluster");

In [None]:
x