In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import glob
import seaborn as sns
import skimage.io as io
from skimage import data_dir
import pandas as pd
import split_folders
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from skimage.io import imread_collection
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
import random

## Flatten Images

In [2]:
#this function reads in images and then flattens them into a dataframe
def flat_images(path):
    images=io.ImageCollection(path)
    flat_images=[]
    for i in range(len(images)):
        img=images[i]
        img1=img.flatten()
        flat_images.append(img1)
    df=pd.DataFrame(flat_images)
    return df

### add in your own paths below - example directory setup

In [20]:
#flattens ditylum images
path= r'C:\Users\vrm8601\Documents\Pre Prac\all_new_dit\*.tif'
df_1=flat_images(path)
df_1=df_1.fillna(0)
df_1['label']=1

In [3]:
#flattens non-ditylum images
#path1= r'C:\Users\vrm8601\Documents\Pre Prac\0_notditylum\0\*.tif'
path1= r'C:\Users\vrm8601\Documents\Pre Prac\Old\notsample\*.tif'
df_0=flat_images(path1)
df_0=df_0.fillna(0)
df_0['label']=0

In [4]:
df_0

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,249991,249992,249993,249994,249995,249996,249997,249998,249999,label
0,0.031428,0.014565,0.004832,0.004424,0.00654,0.001541,0.0,0.0,0.0,0.0,...,0.003222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.005272,0.026451,0.026451,0.037895,0.037895,0.036994,0.036994,0.030659,0.030659,0
2,0.001037,0.001037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.031389,0.035728,0.035728,0.03841,0.03841,0.030251,0.030251,0.015698,0.015698,0
3,0.009627,0.009627,0.009138,0.009138,0.009088,0.009088,0.008865,0.008865,0.01079,0.01079,...,0.038315,0.034506,0.034506,0.039454,0.039454,0.038032,0.038032,0.028364,0.028364,0
4,0.079859,0.079859,0.104034,0.104034,0.114063,0.114063,0.126071,0.126071,0.127322,0.127322,...,0.049456,0.060916,0.060916,0.060675,0.060675,0.055401,0.055401,0.051653,0.051653,0
5,0.036249,0.036249,0.042131,0.042131,0.050882,0.050882,0.054039,0.054039,0.042473,0.042473,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
6,0.20916,0.20916,0.231476,0.231476,0.240242,0.240242,0.243794,0.243794,0.23515,0.23515,...,0.0,0.010645,0.010645,0.024924,0.024924,0.041963,0.041963,0.054108,0.054108,0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [32]:
#puts both sets of images in same dataframe
df=df_1.append(df_0)
df.reset_index(inplace=True,drop=True)
df=df.fillna(0)

MemoryError: Unable to allocate 7.09 GiB for an array with shape (547600, 1738) and data type float64

In [None]:
# create test and train data (80/20 split)
X=df.drop(['label'],axis=1)
y=df.label
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=.2,random_state=42)

In [None]:
# tip: always run PCA after the split only on train data
pca=PCA()
pca.fit(X_train)
csum=np.cumsum(pca.explained_variance_ratio_)
d=np.argmax(csum>=.99)+1


In [None]:
pca=PCA(n_components=d)
X_train_reduced=pca.fit_transform(X_train)
print(X_train.shape, X_train_reduced.shape) 

### training set size is reduced from 547,600 to 482 keeping 99% of variance

In [None]:
# reduce test set too for metrics later - not in the model itself
X_test_reduced=pca.transform(X_test)
print(X_test.shape, X_test_reduced.shape)

## MLP Classifier - multilayer perceptron - feed forward neural net based on logistic regression

In [None]:
# set parameters to perform a grid search and MLP classifier
params={'max_iter':[500],'alpha':[.00001,.001,1],'hidden_layer_sizes':[(100,100,100),(10,10,10)],'random_state':[42]}
grid=GridSearchCV(MLPClassifier(),params,refit=True,verbose=3,cv=5)
grid.fit(X_train_reduced,y_train)

In [None]:
# predicted based off of the model
y_pred=grid.predict(X_test_reduced)

In [None]:
# metrics
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

## MLP Classifier - Scaled First

In [31]:
## MLP Classifier
scaler = StandardScaler()
# Fit on training set only.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train_transform = scaler.transform(X_train)
X_test_transform = scaler.transform(X_test)
# perform PCA
pca = PCA(.99)#.995
pca.fit(X_train_transform)
X_train_transform = pca.transform(X_train_transform)
X_test_transform = pca.transform(X_test_transform)

In [None]:
# set parameters to perform a grid search and MLP classifier
params={'max_iter':[500],'alpha':[.00001,.001,1],'hidden_layer_sizes':[(100,100,100),(10,10,10)],'random_state':[42]}
grid=GridSearchCV(MLPClassifier(),params,refit=True,verbose=3,cv=5)
grid.fit(X_train_transform,y_train)
# predicted based off of the model
y_pred=grid.predict(X_test_transform)

In [None]:
# metrics
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

## Basic Logistic Regression - w/ only PCA not scaling

In [27]:
logReg = LogisticRegression()
logReg.fit(X_train_reduced,y_train)
y_pred=logReg.predict(X_test_reduced)



In [28]:
# metrics
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

0.7052631578947368
[[66  0]
 [56 68]]
              precision    recall  f1-score   support

           0       0.54      1.00      0.70        66
           1       1.00      0.55      0.71       124

    accuracy                           0.71       190
   macro avg       0.77      0.77      0.71       190
weighted avg       0.84      0.71      0.71       190



## Basic Logistic Regression - w/ PCA and Scaling

In [32]:
# Fit on training set only.
scaler.fit(X_train)
# Apply transform to both the training set and the test set.
X_train_transform = scaler.transform(X_train)
X_test_transform = scaler.transform(X_test)

In [33]:
logReg = LogisticRegression()
logReg.fit(X_train_transform,y_train)
y_pred=logReg.predict(X_test_transform)



In [34]:
# metrics
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

1.0
[[ 66   0]
 [  0 124]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        66
           1       1.00      1.00      1.00       124

    accuracy                           1.00       190
   macro avg       1.00      1.00      1.00       190
weighted avg       1.00      1.00      1.00       190

