# Scattering Net + XGBoost

In [None]:
import torch, torchvision
import sys, os
import matplotlib.pyplot as plt
import numpy as np

from torchvision import transforms, datasets
from matplotlib import cm
from torch.utils.data.dataset import TensorDataset
from torch.utils.data.dataloader import DataLoader

from kymatio import Scattering2D
from kymatio import HarmonicScattering3D

import time

In [None]:
torch.cuda.is_available()

In [None]:
# Build the image dataset and a small shuffled loader used for visual inspection.
# NOTE(review): `data_transform` and `flip_axes` are defined here but are not
# referenced in the visible cells; the dataset below uses its own inline
# transform (centre crop + ToTensor, without Normalize) — confirm intent.
data_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])
# Moves the first axis of a 3-D tensor to the end (permute(1, 2, 0)).
flip_axes = lambda tens: tens.permute(1, 2, 0)
# ImageFolder reads images from the sub-directories of `root`; every image is
# centre-cropped to 260x260 and converted to a tensor.
semicond_dataset = datasets.ImageFolder(root='/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/train/train_contest', 
                                        transform=transforms.Compose(
                                            [
                                                transforms.CenterCrop((260,260)),
                                                transforms.ToTensor()
                                            ]
                                        )
                                       )
# Shuffled batches of 16 images, loaded with 4 worker processes.
dataset_loader = torch.utils.data.DataLoader(semicond_dataset,
                                             batch_size=16, shuffle=True,
                                             num_workers=4)

In [None]:
def imshow(inp, title=None, normalize=True):
    """Display a channel-first image tensor with matplotlib.

    Args:
        inp: Tensor whose axes are transposed with ``(1, 2, 0)`` before
            plotting, i.e. the first axis becomes the last one.
        title: Optional title handed to ``plt.title``; skipped when ``None``.
        normalize: When true, undo a per-channel mean/std normalisation
            (``std * inp + mean``) with the constants below before the values
            are clipped to ``[0, 1]``.
    """
    # ``Tensor.numpy()`` raises for tensors that live on the GPU or that are
    # part of an autograd graph, so detach and copy to host memory first.
    inp = inp.detach().cpu().numpy().transpose((1, 2, 0))
    if normalize:
        mean = np.array([0.485, 0.456, 0.406])
        std = np.array([0.229, 0.224, 0.225])
        inp = std * inp + mean

    # Keep the values inside the range matplotlib accepts for float images.
    inp = np.clip(inp, 0, 1)
    plt.imshow(inp, interpolation="bilinear", cmap=cm.RdYlGn, aspect="auto")
    if title is not None:
        plt.title(title)
    # Short pause so the figure gets a chance to refresh.
    plt.pause(0.001)

In [None]:
batch_id, (features, labels) = next(enumerate(dataset_loader))
len(dataset_loader)

In [None]:
features.shape

In [None]:
feature_grid = torchvision.utils.make_grid(features, nrow=4)

In [None]:
feature_grid.shape

In [None]:
imshow(feature_grid, title=[x for x in labels], normalize=False)

In [None]:
from sklearn import preprocessing

In [None]:
scaler = preprocessing.StandardScaler()

In [None]:
dataset_loader_new = torch.utils.data.DataLoader(semicond_dataset,
                                                batch_size=100, shuffle=True,
                                                num_workers=10)

In [None]:
batch_id, (features, labels) = next(enumerate(dataset_loader_new))

In [None]:
import time
times = 10

In [None]:
scattering = HarmonicScattering3D(J=2, shape=(features.shape[1], features.shape[2], features.shape[3]), L=2)

In [None]:
# Benchmark: run the scattering transform `times` times on one batch.
scattering.cuda()
features = features.cuda()

# Drain any pending GPU work (including the transfers above) before starting
# the clock, so only the forward passes are measured.
torch.cuda.synchronize()
t_start = time.time()

for _ in range(times):
    scattering(features)

# CUDA kernels are launched asynchronously; wait for them to finish before
# reading the clock.
torch.cuda.synchronize()

t_elapsed = time.time() - t_start

# '{:.2f}' prints two decimals; the previous '{:2f}' was a minimum-width spec.
fmt_str = 'Elapsed time: {:.2f} [s / {:d} evals], avg: {:.2f} (s/batch)'
print(fmt_str.format(t_elapsed, times, t_elapsed/times))

In [None]:
del(scattering)

In [None]:
torch.cuda.empty_cache()

## Feature extraction using Scattering Net

In [None]:
dataloader = DataLoader(semicond_dataset, batch_size=100, shuffle=True, num_workers=10)
scattering = HarmonicScattering3D(J=2, shape=(features.shape[1], features.shape[2], features.shape[3]), L=2)
scattering.cuda()

In [None]:
# Run every batch through the scattering transform, collect the flattened
# coefficients with their labels, and optionally save them as a TensorDataset.
feat_coll = []
label_coll = []
save_to_disk = True
train = True

for batch_id, [features, labels] in enumerate(dataloader):
    # each batch is a pair: the image tensor first, then the labels
    print("Batch {}, features shape: {}, labels shape: {}".format(batch_id, features.shape, labels.shape))
    features = features.cuda()

    t1 = time.time()
    out = scattering(features)
    t2 = time.time()
    print("Output shape: {}, Time taken: {}".format(out.shape, t2 - t1))

    # move output and features back to the CPU so the GPU copies can be released
    out = out.to("cpu")
    features = features.to("cpu")
    # labels never left the CPU, so there is nothing to move back for them

    # keep the batch axis and collapse everything else into one feature vector
    out = torch.flatten(out, start_dim=1)
    print("Flattend output shape: {}".format(out.shape))

    feat_coll.append(out)
    label_coll.append(labels)

# Join the per-batch results along the sample axis. torch.cat only needs the
# trailing dimensions to agree, so a smaller final batch (dataset size not a
# multiple of the batch size) no longer raises the way torch.stack did.
out_features = torch.cat(feat_coll, dim=0)
out_labels = torch.cat(label_coll, dim=0)

print("The final features matrix has shape: {}".format(out_features.shape))

if save_to_disk:
    # save as TensorDataset
    out_dataset = TensorDataset(out_features, out_labels)
    prefix = "train" if train else "test"
    # the file name embeds the class name of the scattering object
    filename = "{}_{}_dataset.pt".format(prefix, scattering.__class__.__name__)
    torch.save(out_dataset, filename)
    print("Saved features at {}/{}".format(os.getcwd(), filename))

In [None]:
def get_stored_dataset(filename, train=True):
    """Load a saved dataset and return its first two components.

    Args:
        filename: Path handed to ``torch.load``.
        train: Accepted for interface compatibility; not used by the body.

    Returns:
        A ``(features, labels)`` tuple: elements 0 and 1 of the full slice
        ``[:]`` of the loaded object.
    """
    # NOTE(review): torch.load unpickles the file; only open trusted files.
    stored = torch.load(filename)
    everything = stored[:]
    return everything[0], everything[1]

In [None]:
filename = "train_HarmonicScatteringTorch3D_dataset.pt"
out_features, out_labels = get_stored_dataset(filename)

### Classification with XGBoost

In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
import xgboost as xgb
import pickle
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, mean_squared_error

from math import sqrt

In [None]:
labels = out_labels.numpy()
features = out_features.numpy()
features.shape, labels.shape

In [None]:
_, counts = np.unique(labels, return_counts=True)

In [None]:
# Count the samples of each class with one vectorised comparison per class
# instead of indexing the label array element by element in a Python loop.
pos_label_counts = int(np.count_nonzero(labels == 1))
neg_label_counts = int(np.count_nonzero(labels == 0))

In [None]:
scale_pos_weights = neg_label_counts/pos_label_counts
scale_pos_weights

In [None]:
"""params = {}
params["objective"] = "binary:logistic"
params["verbosity"] = 2
params["gamma"] = 10
params["max_depth"] = 4
params["subsample"] = 0.5
params["colsample_bytree"] = 0.5
params["colsample_bylevel"] = 0.5
params["colsample_bynode"] = 0.5
params["tree_method"] = "gpu_hist"
params["scale_pos_weight"] = scale_pos_weights
"""

Scikit-learn API

In [None]:
# Gradient-boosted tree classifier configured through XGBoost's scikit-learn
# style wrapper.
# NOTE(review): tree_method="gpu_hist" needs a GPU-enabled XGBoost build, and
# newer XGBoost releases deprecate it in favour of tree_method="hist" with
# device="cuda" — confirm against the installed version.
xgb_model = xgb.XGBClassifier(
    max_depth = 3, 
    learning_rate = 0.1, 
    n_estimators = 100, 
    verbosity = 2, 
    objective = "binary:logistic",
    booster = "gbtree",
    tree_method = "gpu_hist",
    gamma = 10,
    # each of the four sampling ratios below is set to one half
    subsample = 0.5,
    colsample_bytree = 0.5,
    colsample_bylevel = 0.5,
    colsample_bynode = 0.5,
    # negative/positive sample ratio computed in an earlier cell
    scale_pos_weight = scale_pos_weights
)

In [None]:
trained_model = xgb_model.fit(features, labels)

# Transfer Learning

In [None]:
from fastai import *
from fastai.callback import *
from fastai.vision.all import *

In [None]:
data = ImageDataLoaders.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/train/train_contest", item_tfms=Resize(260), valid_pct=0.2, batch_tfms=Normalize.from_stats(*imagenet_stats))

In [None]:
imagenet_stats

In [None]:
data.show_batch(nrows=3)

In [None]:
learner = cnn_learner(
    data, 
    models.resnet34, 
    metrics=[error_rate]
)

In [None]:

# learner.fine_tune(1)

learner.lr_find()
learner.fine_tune(2, 3e-3)

learner.save('test_model')

interp = ClassificationInterpretation.from_learner(learner)

interp.plot_confusion_matrix()

In [None]:
interp.plot_top_losses(10, figsize=(20,20))

In [None]:
data_new = ImageDataLoaders.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/train/train_contest", item_tfms=Resize(260), valid_pct=0.2, batch_tfms=Normalize.from_stats(*imagenet_stats))

data_new

In [None]:
data_new.train_ds

In [None]:
labels = []
for img,target in data_new.train_ds:
    labels.append(target)

In [None]:
print(len(labels))
labels = torch.stack(labels)
plt.hist(labels)

In [None]:
labels

In [None]:
from torch.utils.data.sampler import WeightedRandomSampler

In [None]:
# total number of items in the dataset for each label?
"""
train_labels = data_new.train_dl.dataset.y.items
_, counts    = np.unique(train_labels,return_counts=True)

print(counts, train_labels)
type(counts)

1./counts

class_weights = 1./counts
weights       = class_weights[train_labels]

type(class_weights)

label_counts = np.bincount([data_new.train_dl.dataset.y[i].data for i in range(len(data_new.train_dl.dataset))])

label_counts

total_len_oversample = int(data_new.c*np.max(label_counts)) #WHY????

print(type(total_len_oversample), total_len_oversample)
"""
accuracy

In [None]:
learn = cnn_learner(
    data_new, 
    models.resnet18, 
    metrics=[error_rate]
)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(5)

In [None]:
learn.unfreeze()
learn.lr_find()

In [None]:
learn.fit(25, 1e-6)

In [None]:
learn.export("resnet18_25ep_oversampling_no_transforms.pkl")

In [None]:
learn.path

In [None]:
interp = ClassificationInterpretation.from_learner(learn)

interp.plot_confusion_matrix()

In [None]:
interp.plot_top_losses(9)

In [None]:
learn.lr_find()

In [None]:
learn.fit_one_cycle(10,2.5e-03)

In [None]:
interp = ClassificationInterpretation.from_learner(learn)

interp.plot_confusion_matrix()
interp.plot_top_losses(10, figsize=(20,20))

In [None]:
test_data = ImageDataBunch.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/test/test_contest", test="test")

In [None]:
test_data

In [None]:
learn

In [None]:
tfms = get_transforms(
    do_flip = True, 
    flip_vert = True, 
    max_rotate = None,
    max_warp = None
)

In [None]:
len(tfms)

In [None]:
tfms

In [None]:
def get_ex():
    """Open and return the fixed example image from the training folder."""
    example_path = '/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/train/train_contest/good_all/WEA938001D1A_10-5CW-ITISA49-1_78_2.bmp'
    return open_image(example_path)

def plots_f(rows, cols, width, height, **kwargs):
    """Show the example image under the first transform list, once per axis.

    Args:
        rows: Number of subplot rows.
        cols: Number of subplot columns.
        width: Figure width passed to ``figsize``.
        height: Figure height passed to ``figsize``.
        **kwargs: Forwarded unchanged to ``apply_tfms``.
    """
    # squeeze=False always yields a 2-D array of axes, so a 1x1 grid (where
    # plt.subplots would otherwise return a bare Axes) can be flattened too.
    _, axes = plt.subplots(rows, cols, figsize=(width, height), squeeze=False)
    # A fresh copy of the example image is loaded and transformed for each
    # axis; a plain loop is used because only the drawing side effect matters.
    for ax in axes.flatten():
        get_ex().apply_tfms(tfms[0], **kwargs).show(ax=ax)


In [None]:
os.getcwd()

In [None]:
plots_f(2, 4, 12, 6, size=224)

In [None]:
data_transformed = ImageDataBunch.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/train/train_contest", 
                                              valid_pct=0.2, ds_tfms = tfms).normalize(imagenet_stats)


In [None]:
learn2 = cnn_learner(
    data_transformed, 
    models.resnet18, 
    metrics=[error_rate], 
    opt_func = optim.Adam
)

In [None]:
# Callbacks for the first training run of learn2.
# The classes are referenced by their bare names, as the later cells do.
# Reading them from the `callbacks` module while also assigning the list to
# the name `callbacks` shadows the module, so re-running the cell would fail
# with an AttributeError on the list.
# NOTE(review): assumes the three callback classes are importable by bare
# name in this session, as the later cells rely on — confirm.
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=3, min_delta=0.01),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="improvement", 
        name="transformed_best"
    )
]

In [None]:
learn2.lr_find()
learn2.recorder.plot(suggestion=True)

In [None]:
learn2.fit_one_cycle(5, max_lr=slice(3e-5, 4e-3), callbacks=callbacks)

In [None]:
learn2.unfreeze()
learn2.lr_find()
learn2.recorder.plot(suggestion=True)

In [None]:
learn2.fit(5, slice(3e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=3, min_delta=0.01),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="improvement", 
        name="tsfmd_best_attempt_2"
    )
]

In [None]:
learn2.fit(5, slice(3e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=3, min_delta=0.01),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_3"
    )
]

In [None]:
learn2.fit(5, slice(3e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="error_rate", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_4"
    )
]

In [None]:
learn2.fit(5, slice(3e-6, 6e-5), callbacks=callbacks)

In [None]:
data_transformed.train_dl.dl.num_workers

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="error_rate", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_5"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.load("tsfmd_best_attempt_5_0")

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="error_rate", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_6"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.fit(5, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_7"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.fit(5, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_8"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.fit(5, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_9"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.fit(5, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_attempt_11"
    ),
    ShowGraph(learn2)
]

In [None]:
learn2.fit(10, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
learn2 = cnn_learner(
    data_transformed, 
    models.resnet18, 
    metrics=[error_rate, AUROC()], 
    opt_func = optim.Adam
)

In [None]:
learn2.load("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/project_2/models/tsfmd_best_attempt_9_1")

In [None]:
callbacks = [
    OverSamplingCallback(learn2),
    ReduceLROnPlateauCallback(learn2, monitor="valid_loss", mode="auto", patience=2, min_delta=0.001),
    SaveModelCallback(
        learn2, 
        monitor="error_rate", 
        mode="min", 
        every="epoch", 
        name="tsfmd_best_w_auc_1"
    ),
    ShowGraph(learn2),
    AUROC()
]

In [None]:
learn2.fit(5, slice(1e-6, 6e-5), callbacks=callbacks)

In [None]:
preds = learn2.TTA()
preds[0][:5, :].numpy()

In [None]:
blah = preds[0][:10, :].numpy(); blah

In [None]:
result = np.argmax(blah, axis=1); print(result)

In [None]:
data_transformed.train_ds.x[4001]

In [None]:
learn2.predict(data_transformed.train_ds.x[1001])

In [None]:
data_transformed.train_dl.dataset.items

In [None]:
interp2 = ClassificationInterpretation.from_learner(learn2)

interp2.plot_confusion_matrix()

In [None]:
interp2.plot_top_losses(9, heatmap=True)

## Train on entire dataset

In [None]:
# Data loaders rooted at the parent `semi` folder, naming the `train` and
# `test` sub-folders explicitly.
# NOTE(review): valid_pct=0.0001 presumably keeps virtually every image in the
# training split while still giving a non-empty validation set — confirm.
train_data = ImageDataLoaders.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi", train = "train", test = "test",
                                        item_tfms=Resize(260), valid_pct=0.0001, batch_tfms=Normalize.from_stats(*imagenet_stats))

In [None]:
learn_full = cnn_learner(
    train_data, 
    models.resnet18, 
    metrics=[error_rate]
)

In [None]:
learn_full.lr_find()
learn_full.fit_one_cycle(5)

In [None]:
learn_full.unfreeze()
learn_full.lr_find()

In [None]:
learn_full.fit(25, 1e-6)

In [None]:
learn_full.freeze()

In [None]:
learn_full.summary()

In [None]:
learn_full.fit(5, slice(1e-6, 6e-5))

In [None]:
# Serialise the whole `learn_full` object (not only its weights) to disk.
# NOTE(review): loading this file later will presumably need the same class
# definitions to be importable — confirm before relying on it for deployment.
filename = "final_model.pt"
torch.save(learn_full, filename)

In [None]:
learn_full.save('final')

# Now to test!

In [None]:
test_data = ImageDataLoaders.from_folder("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/semi/test/test_contest/test", 
                                        item_tfms=Resize(260), valid_pct=0.0001, batch_tfms=Normalize.from_stats(*imagenet_stats))


In [None]:
probs, labels = learn_full.tta(ds_type=DatasetType.Test, scale = 1.1)

In [None]:
probs.shape

In [None]:
preds = np.argmax(probs, axis=1)

In [None]:
preds.shape

In [None]:
train_data.test_ds.x[-100]

In [None]:
# Keep only the file name of every test item; os.path.basename replaces the
# hand-rolled split on "/" and follows the platform's path separator.
paths = [os.path.basename(str(item)) for item in train_data.test_ds.x.items]

In [None]:
# Strip only the final extension. Splitting on the first "." would truncate
# any file name that contains additional dots and could make ids collide.
filenames = [os.path.splitext(name)[0] for name in paths]

In [None]:
zip(filenames, preds.numpy())

In [None]:
import pandas as pd

In [None]:
df = pd.DataFrame(list(zip(filenames, preds.numpy())), columns = ["id", "score"])

In [None]:
df.head()

In [None]:
df.to_csv("/home/sutd/Documents/Workplace/DLCourse/MATH6380P/project_2/submission.csv", index=False)