In [1]:
import deeplake
from skimage.color import rgb2gray
import numpy as np
from PIL import Image
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn import metrics
import sklearn
import time
import sys
from IPython.display import clear_output
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow as tf
import wandb
from wandb.keras import WandbCallback
import matplotlib.pyplot as plt
from datetime import datetime
from collections import Counter
from itertools import chain
clear_output()

# Dataset for RF

In [2]:
val_ds = deeplake.load('hub://luizapzbn/CODEBRIM_classification_dataset-val')
train_ds = deeplake.load('hub://luizapzbn/CODEBRIM_classification_dataset-train')

hub://luizapzbn/CODEBRIM_classification_dataset-val loaded successfully.
This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/luizapzbn/CODEBRIM_classification_dataset-val
hub://luizapzbn/CODEBRIM_classification_dataset-train loaded successfully.
This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/luizapzbn/CODEBRIM_classification_dataset-train


# Loading in data

In [3]:
# train set
X_train = train_ds.images.numpy()
y_train = train_ds.labels.data()
train_labels = list(chain.from_iterable(train_ds.labels.data()['text']))

# validation set
X_val = val_ds.images.numpy()
y_val = val_ds.labels.data()
val_labels = list(chain.from_iterable(val_ds.labels.data()['text']))

# greying the pictures
X_train=rgb2gray(X_train)
# resampling the values from 0-255
X_train = (X_train*255).astype('uint8')

# greying the pictures
X_val=rgb2gray(X_val)
# resampling the values from 0-255
X_val = (X_val*255).astype('uint8')

# Manipulating training picture format for training Random Forest model

In [4]:
# flattening the picture array
X_train = X_train.reshape((len(X_train),-1))
X_train.shape

# labels turned into array
y_train_text = np.asarray(y_train['text']).ravel()
y_train_value = np.asarray(y_train['value']).ravel()

# Manipulating validation picture format for training Random Forest model

In [5]:
# flattening the picture array
X_val = X_val.reshape((len(X_val),-1))

# labels to array
y_val_text = np.asarray(y_val['text']).ravel()
y_val_value = np.asarray(y_val['value']).ravel()

# Random forest models - 6 classes

In [6]:
n_est = 100
max_d = None
min_s_s = 2
min_s_l = 1

In [7]:
# Create an instance of the RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=n_est,
    max_depth=max_d,
    min_samples_split=min_s_s,
    min_samples_leaf=min_s_l)

In [None]:
date_and_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
wandb.init(project="inlamning1V1", name=f"preprocessing {date_and_time}", config = {
    "training_type": 'Random Forest',
    'n_estimators': n_est,
    'max_depth': max_d,
    'min_samples_split': min_s_s,
    'min_samples_leaf': min_s_l,
    'scoring': 'accuracy'
})

start = time.time()

scores = cross_val_score(clf, X_train, y_train_value, cv=5, scoring='accuracy')

# fit / predict
clf.fit(X_train, y_train_value)

y_pred = clf.predict(X_val)
y_probas = clf.predict_proba(X_val)
labels = ['Background', 'Crack', 'Spallation', 'Efflorescence', 'ExposedBars', 'CorrosionStain']

# visualize model stats
wandb.log({'roc': wandb.plots.ROC(y_val_value, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_val_value, y_probas, labels)})
wandb.sklearn.plot_summary_metrics(clf, X_train, y_train_value, X_val, y_val_value)
wandb.sklearn.plot_confusion_matrix(y_val_value, y_pred, labels)

print(sklearn.metrics.classification_report(y_val_value, y_pred))
print(time.time() - start, 'seconds')

for score in scores:
    wandb.log({'cross_val_score': score})
    
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mmark-eszaros[0m. Use [1m`wandb login --relogin`[0m to force relogin


# RF binary classification

In [6]:
y_train_binary_value = np.where(y_train['value']!=[0], [1], [0])
print('Binary values: ', Counter(list(chain.from_iterable(y_train_binary_value))))

y_train_binary_text = np.where(np.array(y_train['text'])!=['Background'], ['Damaged'], ['Not damaged'])
print('Binary texts: ', Counter(list(chain.from_iterable(y_train_binary_text))))

Binary values:  Counter({1: 4253, 0: 2228})
Binary texts:  Counter({'Damaged': 4253, 'Not damaged': 2228})


In [7]:
y_val_binary_value = np.where(y_val['value']!=[0], [1], [0])
print('Binary values: ', Counter(list(chain.from_iterable(y_val_binary_value))))

y_val_binary_text = np.where(np.array(y_val['text'])!=['Background'], ['Damaged'], ['Not damaged'])
print('Binary texts: ', Counter(list(chain.from_iterable(y_val_binary_text))))

Binary values:  Counter({1: 461, 0: 155})
Binary texts:  Counter({'Damaged': 461, 'Not damaged': 155})


In [8]:
n_est = 50
max_d = None
min_s_s = 2
min_s_l = 1

In [9]:
# Create an instance of the RandomForestClassifier
clf = RandomForestClassifier(
    n_estimators=n_est,
    max_depth=max_d,
    min_samples_split=min_s_s,
    min_samples_leaf=min_s_l)

In [11]:
date_and_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
wandb.init(project="inlamning1V1", name=f"preprocessing {date_and_time}", config = {
    "training_type": 'Random Forest',
    'n_estimators': n_est,
    'max_depth': max_d,
    'min_samples_split': min_s_s,
    'min_samples_leaf': min_s_l,
    'scoring': 'accuracy'
})

# metrics = ['accuracy', 'balanced_accuracy', 'roc_auc', 'f1', 'neg_mean_absolute_error', 'neg_root_mean_squared_error', 'r2']

start = time.time()

scores = cross_val_score(clf, X_train, y_train_binary_text, cv=5, scoring='accuracy')

clf.fit(X_train, y_train_binary_value)

y_pred = clf.predict(X_val)
y_probas = clf.predict_proba(X_val)
labels = ['Not damaged', 'Damaged']

# visualize model
wandb.log({'roc': wandb.plots.ROC(y_val_binary_value, y_probas, labels)})
wandb.log({'pr': wandb.plots.precision_recall(y_val_binary_value, y_probas, labels)})
wandb.sklearn.plot_summary_metrics(clf, X_train, y_train_binary_value, X_val, y_val_binary_value)
wandb.sklearn.plot_confusion_matrix(y_val_binary_value, y_pred, labels)

print(sklearn.metrics.classification_report(y_val_binary_value, y_pred))
print(time.time() - start, 'seconds')

for score in scores:
    wandb.log({'cross_val_score': score})
    
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mmark-eszaros[0m. Use [1m`wandb login --relogin`[0m to force relogin


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  clf.fit(X_train, y_train_binary_value)


              precision    recall  f1-score   support

           0       0.67      0.60      0.63       155
           1       0.25      0.81      0.39       149
           2       0.00      0.00      0.00       124
           3       0.00      0.00      0.00       112
           4       0.00      0.00      0.00        34
           5       0.00      0.00      0.00        42

    accuracy                           0.35       616
   macro avg       0.15      0.24      0.17       616
weighted avg       0.23      0.35      0.25       616

202.35309982299805 seconds


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0,1
cross_val_score,▁█

0,1
cross_val_score,0.69969
