In [1]:
from preprocess import get_img_paths, prep_data
import numpy as np

In [2]:
fractured, nfractured = get_img_paths('train')

In [3]:
x_1, y_1 = prep_data(fractured, 1)
x_0, y_0 = prep_data(nfractured, 0)
x_train = np.concatenate([x_0,x_1], axis=0)
x_train = np.expand_dims(x_train, axis=-1)
y_train = np.concatenate([y_0,y_1], axis=0)
x_train.shape, y_train.shape

((8538, 224, 224, 1), (8538, 1))

In [4]:
val_fractured, val_nfractured = get_img_paths('val')
x_1, y_1 = prep_data(val_fractured, 1)
x_0, y_0 = prep_data(val_nfractured, 0)
x_val = np.concatenate([x_0,x_1], axis=0)
x_val = np.expand_dims(x_val, axis=-1)
y_val = np.concatenate([y_0,y_1], axis=0)
x_val.shape, y_val.shape

((582, 224, 224, 1), (582, 1))

In [5]:
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error

In [6]:
x_train_flatten = x_train.reshape((x_train.shape[0], -1))
x_val_flatten = x_val.reshape((x_val.shape[0], -1))

### Experiments on different No. components

In [7]:
pca_list = []
mse_loss = []
for k in [5, 9, 17, 33, 65, 129, 257]:
    pca = PCA(n_components=k)
    x_train_pca = pca.fit_transform(x_train_flatten)
    x_val_pca = pca.transform(x_val_flatten)

    x_train_reconstructed = pca.inverse_transform(x_train_pca)
    x_val_reconstructed = pca.inverse_transform(x_val_pca)

    pca_list += [pca]

    mse_train = mean_squared_error(x_train.reshape(x_train.shape[0],-1), x_train_reconstructed)
    mse_val = mean_squared_error(x_val.reshape(x_val.shape[0], -1), x_val_reconstructed)

    print(f"MSE Loss on Training Data   with k = {k}: {mse_train}")
    print(f"MSE Loss on Validation Data with k = {k}: {mse_val}")

    mse_loss += [(mse_train, mse_val)]

MSE Loss on Training Data   with k = 5: 1079.109020476563
MSE Loss on Validation Data with k = 5: 1063.7529060022587
MSE Loss on Training Data   with k = 9: 852.5504921470124
MSE Loss on Validation Data with k = 9: 830.2617627467725
MSE Loss on Training Data   with k = 17: 617.8707684155164
MSE Loss on Validation Data with k = 17: 598.7764947979424
MSE Loss on Training Data   with k = 33: 434.20875186300617
MSE Loss on Validation Data with k = 33: 419.38930272345493
MSE Loss on Training Data   with k = 65: 299.0797401718659
MSE Loss on Validation Data with k = 65: 290.0863275265436
MSE Loss on Training Data   with k = 129: 195.60874227449398
MSE Loss on Validation Data with k = 129: 192.75559782549473
MSE Loss on Training Data   with k = 257: 120.00377971566296
MSE Loss on Validation Data with k = 257: 122.88104357057412


In [8]:
for pca in pca_list:
    print(f'Variance ratio: {sum(pca.explained_variance_ratio_)}')

Variance ratio: 0.4970481152884329
Variance ratio: 0.6026426721484058
Variance ratio: 0.7120223614241171
Variance ratio: 0.7976236821639033
Variance ratio: 0.8606047061565066
Variance ratio: 0.9088305410054139
Variance ratio: 0.9440685546731314


### Building plots

In [16]:
import pandas as pd
import plotly.express as px

# Create DataFrame for plotting
df_loss = pd.DataFrame({
    'Number of Components': [5, 9, 17, 33, 65, 129, 257],
    'Training Loss': [x[0] for x in mse_loss],
    'Validation Loss': [x[1] for x in mse_loss]
})

df_variance = pd.DataFrame({
    'Number of Components': [5, 9, 17, 33, 65, 129, 257],
    'Explained Variance Ratio': [sum(pca.explained_variance_ratio_) for pca in pca_list]
})

In [17]:
# Plot MSE loss
fig_loss = px.line(
    df_loss, 
    x='Number of Components', 
    y=['Training Loss', 'Validation Loss'], 
    title='MSE Loss vs. Number of PCA Components',
    labels={'value': 'MSE Loss', 'variable': 'Data'}
)

fig_loss.show()

In [18]:

# Plot Explained Variance Ratio
fig_variance = px.line(
    df_variance, 
    x='Number of Components', 
    y='Explained Variance Ratio', 
    title='Explained Variance Ratio vs. Number of PCA Components',
    labels={'Explained Variance Ratio': 'Explained Variance Ratio'}
)

fig_variance.show()


### Pipe PCA with Perceptron (completely linear methods) for classification task

In [13]:
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import plotly.express as px
import pandas as pd

In [None]:
x_train_pca = pca_list[-1].fit_transform(x_train_flatten)
x_val_pca = pca_list[-1].transform(x_val_flatten)

In [14]:
perceptron = Perceptron(max_iter=999999, early_stopping=True, validation_fraction=.25)
perceptron.fit(x_train_pca, y_train)

  y = column_or_1d(y, warn=True)


In [15]:
# Predict on the validation set
y_pred = perceptron.predict(x_val_pca)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

# Plot confusion matrix using Plotly
conf_matrix_df = pd.DataFrame(conf_matrix, index=['Actual 0', 'Actual 1'], columns=['Predicted 0', 'Predicted 1'])
fig_conf_matrix = px.imshow(
    conf_matrix_df,
    text_auto=True,
    title='Confusion Matrix',
    labels={'x': 'Predicted Label', 'y': 'Actual Label'}
)
fig_conf_matrix.show()

Accuracy: 0.8092783505154639
Confusion Matrix:
[[341  75]
 [ 36 130]]
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       416
           1       0.63      0.78      0.70       166

    accuracy                           0.81       582
   macro avg       0.77      0.80      0.78       582
weighted avg       0.83      0.81      0.81       582

