In [None]:
#@title Import Statements
import pandas as pd
import sys, os
import matplotlib.image as mpimg 
import matplotlib.pyplot as plt
import numpy as np
from collections import OrderedDict 
import torch
import torch.nn as nn
import torch.optim as optim
from PIL import Image
import warnings
from torch.utils.data import DataLoader,Dataset
import torch.nn.functional as F
from torchvision import models,transforms
import zipfile

# Data Retrieval


In [None]:
# get the dataset 
!gdown --id 1QmM4n8K24kAAqnmIcDl7tMWSAmBJaLcN

Downloading...
From: https://drive.google.com/uc?id=1QmM4n8K24kAAqnmIcDl7tMWSAmBJaLcN
To: /content/HAMDATA.zip
100% 2.77G/2.77G [00:14<00:00, 185MB/s]


In [None]:
directory = "ham_data"
# create a directory on this vm
new_directory_path = os.path.join(os.getcwd(), directory)
os.mkdir(new_directory_path)
#extract the dataset into this new directory
with zipfile.ZipFile('HAMDATA.zip', 'r') as zip_ref:
    zip_ref.extractall(new_directory_path)

In [None]:
datapath = os.getcwd() + "/ham_data/data"
# a number less than 1 
#change the fraction of data sampled to help memory and run time
# at 1 has a 3-4 hour run time
sample_fraction = 1 

# DATA AND FEATURE MANIPULATION AND FORMATTING SECTION


In [None]:
#underlying condtion possible values will be classification output class
#the first three are not deadly the last 4 are
# dx in dataframe and dataset
outputClasses = np.array(['df','bkl','nv', 'vasc', 'mel', 'akiec', 'bcc'])

In [None]:
#gets a dataframe with a sample of the full dataset
def getSampleOfData(sample_fraction):
    #Get MetaData in dataframe
    fullhamdf = pd.read_csv(datapath + '/HAM10000_metadata.csv')
    #drop the useless lesion id and the diagnosis validation type information after dropping duplicates in dataset
    fullhamdf = fullhamdf.drop_duplicates(subset='lesion_id', keep='first').drop(["lesion_id", "dx_type"], axis=1).dropna()
    #Random Sample for run time
    return fullhamdf.sample(frac=sample_fraction, random_state=1)

In [None]:
# gets a dataframe with a sample of the dataset with all features in numerical or vector format
def getHamDataFrame(sample_fraction):
    #gets a dataframe with a sample of the dataset
    hamdf = getSampleOfData(sample_fraction)
    
    # used to make male be 1 female anything else 0
    def binarySex(sex):
        return int(sex == "male")
    hamdf["sex"] = hamdf["sex"].map(binarySex)
    hamdf.head
    locClasses = hamdf["localization"].unique()

    #used to get diffrent locations as diffrent binary classes
    def binaryLoc(location):
        return int(curloc == location)
    for loc in locClasses:
        hamdf[loc] = np.zeros(len(hamdf))
        curloc = loc 
        hamdf[loc] = hamdf["localization"].map(binaryLoc)

    #used to get an compressed image from data as a vector
    def getHamImageAsVector(image_id):
        img = Image.open(datapath + "/HAM10000_images/" + image_id +".jpg")
        return np.array(img.resize((32,32),Image.ANTIALIAS).getdata())
    hamdf["image_data"] = hamdf["image_id"].map(getHamImageAsVector)

    #get output classes as numerical values
    def getNumericalOutputClass(dx) :
        return np.where(outputClasses == dx)[0][0]
    hamdf["dx"] = hamdf["dx"].map(getNumericalOutputClass)

    #normalize age
    hamdf["age"]=(hamdf["age"]-hamdf["age"].min())/(hamdf["age"].max()-hamdf["age"].min())

    hamdf = hamdf.drop(["localization", "image_id"], axis=1)    
    return hamdf

In [None]:
#Gets a dataframe with ouly the path to the image and the output class dx
def getHamDataFrameWithImagePathOnly(sample_fraction):
    #gets a dataframe with a sample of the dataset
    hamdf = getSampleOfData(sample_fraction)[["dx","image_id"]]
    # get path to each image
    def getImagePath(image_id) :
        return datapath + "/HAM10000_images/" + image_id +".jpg"
    hamdf["image_id"] = hamdf["image_id"].map(getImagePath)
    #get output classes as numerical values
    def getNumericalOutputClass(dx) :
        return np.where(outputClasses == dx)[0][0]
    hamdf["dx"] = hamdf["dx"].map(getNumericalOutputClass)
    return hamdf
    

In [None]:
# mutates and returns a hamdf with more equal amount of data points in each output class
import random 
def getMoreEqualSample(hamdf):
    def setLargestValuesNA(value):
        if(value == largestvalue and random.randint(0, 3) < 1):
            return None
        return value
    while(hamdf["dx"].value_counts().nsmallest(1)[0] * 6 <  hamdf["dx"].value_counts().nlargest(1).reset_index().iloc(0)[0][1]) :
        largestvalue = hamdf["dx"].value_counts().index[0]
        hamdf["dx"] = hamdf["dx"].map(setLargestValuesNA)
        hamdf = hamdf.dropna()
    return hamdf.dropna()

In [None]:
# adds color variance and color mean features to data frame 
# Mutates dataframe does not return a new one
def addColorFeatures(hamdf):
    def getColorValue(image_data):
        return np.array(image_data).T[currentColor]
    currentColor = 0
    hamdf["red_values"] = hamdf["image_data"].map(getColorValue)
    currentColor = 1
    hamdf["green_values"] = hamdf["image_data"].map(getColorValue)
    currentColor = 2
    hamdf["blue_values"] = hamdf["image_data"].map(getColorValue)
    #now creating custom features 
    #get Variance of all colors
    def getColorVariance(color_data):
        return np.var(color_data)
    hamdf["red_var"] = hamdf["red_values"].map(getColorVariance)
    hamdf["green_var"] = hamdf["green_values"].map(getColorVariance)
    hamdf["blue_var"] = hamdf["blue_values"].map(getColorVariance)

    #get Mean of all Colors
    def getColorMean(color_data):
        return np.mean(color_data)
    hamdf["red_mean"] = hamdf["red_values"].map(getColorMean)
    hamdf["green_mean"] = hamdf["green_values"].map(getColorMean)
    hamdf["blue_mean"] = hamdf["blue_values"].map(getColorMean)
    # remove repeated values
    hamdf = hamdf.drop(["red_values", "green_values", "blue_values"], axis=1) 
    return hamdf

In [None]:
# turns dx the output classes into binary classes based on whether the underlying condition is deadly 
# Mutates dataframe does not return a new one
def getBinaryOutputClasses(hamdf):
    # change to binary output
    def binaryOuput(dx):
        return int(dx > 3)
    hamdf["dx"] = hamdf["dx"].map(binaryOuput)
    return hamdf

In [None]:
# get train, test and validation set split (already in random order from sample)
def getTrainTestVal(df) :    
    train_set = df[:int(len(df)*.7)].reset_index()
    test_set = df[int(len(df)*.7):int(len(df)*.9)].reset_index()
    validation_set = df[int(len(df)*.9):].reset_index()
    return train_set, test_set, validation_set

In [None]:
#used to get a input vector [age, sex, location class,imagedata] and output vector [underlying condtion class] from of a row of the above dataframe 
def getInputNOutput(item):
    x = []
    x.append(int(item["age"]))
    # 1 is male 0 is female
    x.append(int(item["sex"] == "male"))
    x = np.array(x)
    #adds the location class
    x = np.concatenate([x,getClassVector(item["localization"], locClasses)])
    #adds image classes
    imgdata = np.concatenate([x,getHamImageAsVector(item["image_id"])])
    #get classification output class which is the underlying condtion 
    y = getClassVector(item["dx"], outputClasses)
    return x, y

In [None]:
# returns data set with input features as a flat vector and with the output as a scalar 
def getDataSets( df):
    x = []
    y = []
    for index, row in df.iterrows():
        row = np.array(row)
        xi = []
        for i in row[2:]:
            xi = np.hstack([xi,np.array([i]).flatten()])
        x.append(xi)
        #print(row)
        y.append([row[1]])
    return np.array(x), np.array(y)

# LOGISTIC REGRESSION AND SUPPORT VECTOR MACHINES SECTION

In [None]:
#change the sample number input to help memory and run time
hamdf = getHamDataFrame(sample_fraction)
hamdf.head()

Unnamed: 0,dx,age,sex,neck,lower extremity,back,trunk,abdomen,face,upper extremity,foot,genital,unknown,scalp,chest,hand,ear,acral,image_data
2763,6,0.529412,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[183, 137, 136], [190, 148, 153], [212, 174, ..."
8103,2,0.294118,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[174, 137, 147], [175, 136, 140], [176, 137, ..."
9388,2,0.588235,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,"[[224, 212, 221], [225, 209, 220], [226, 210, ..."
3148,2,0.588235,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"[[226, 153, 162], [228, 153, 168], [227, 150, ..."
5741,2,0.352941,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[191, 112, 124], [188, 112, 130], [171, 99, 1..."


In [None]:
allFeatures = hamdf.columns
allFeatures

Index(['dx', 'age', 'sex', 'neck', 'lower extremity', 'back', 'trunk',
       'abdomen', 'face', 'upper extremity', 'foot', 'genital', 'unknown',
       'scalp', 'chest', 'hand', 'ear', 'acral', 'image_data'],
      dtype='object')

In [None]:
# counts of the output classes
hamdf["dx"].value_counts()

2    5361
1     718
4     613
6     327
5     228
3      98
0      73
Name: dx, dtype: int64

In [None]:
#use this to det rid of warinings on the fact that it always wants more iterations for convergence 
warnings.filterwarnings("ignore")

In [None]:
# gets a new dataframe that is less skewed towards any one class
less_skewed_hamdf = getMoreEqualSample(hamdf.copy())

In [None]:
less_skewed_hamdf["dx"].value_counts()

1.0    410
2.0    396
4.0    335
6.0    327
5.0    228
3.0     98
0.0     73
Name: dx, dtype: int64

In [None]:
#returns classification error
def getCError(data, predictions):
    correct = 0
    for i in range(len(data[0])):
        correct = correct + int(predictions[i] == data[1][i])
    return (1 - (correct / len(data[0]))) * 100

In [None]:
#Logistic Regression Training Testing
from sklearn.linear_model import LogisticRegression
def LogisticRegressionTrainTest(train_data,test_data,val_data):
    regularization_strengths = [.1,1,10,100]
    best_error = 101
    best_model = ""
    for regularization_strength in regularization_strengths:
        model = LogisticRegression(random_state=0,C=regularization_strength,solver='lbfgs').fit(train_data[0], train_data[1].ravel())
        error = getCError(test_data, model.predict(test_data[0]))
        if(error < best_error) :
            best_error = error
            best_model  = model
    print("classification error for Logistic Regression training set = ",getCError(train_data, model.predict(train_data[0])), "%")
    print("classification error for Logistic Regression test set = ",getCError(test_data, model.predict(test_data[0])), "%")
    print("classification error for Logistic Regression validation set = ",getCError(val_data, model.predict(val_data[0])), "%")

In [None]:
#Linear Support Vector Machine Training and Testing
from sklearn.svm import LinearSVC
def LinearSVMTrainTest(train_data,test_data,val_data):
    regularization_strengths = [.1,1,10,100]
    best_error = 101
    for regularization_strength in regularization_strengths:
    
        model = LinearSVC(random_state=0, C=regularization_strength).fit(train_data[0], train_data[1].ravel())
        error = getCError(test_data, model.predict(test_data[0]))
        if(error < best_error) :
            best_error = error
            best_model  = model
    print("classification error for Linear SVM training set = ",getCError(train_data, model.predict(train_data[0])), "%")
    print("classification error for Linear SVM test set = ",getCError(test_data, model.predict(test_data[0])), "%")
    print("classification error for Linear SVM validation set = ",getCError(val_data, model.predict(val_data[0])), "%")

In [None]:
#General Support Vector Machine Training and Testing
from sklearn.svm import SVC
def GeneralSVMTrainTest(train_data,test_data,val_data):
    kernels = ["poly", "rbf", "sigmoid"]
    best_error = 101
    best_model = ""
    for kernel in kernels:
 
        model = SVC(C=10.0, kernel=kernel, random_state=0, shrinking=True).fit(train_data[0], train_data[1].ravel())
        error = getCError(test_data, model.predict(test_data[0]))
        if(error < best_error) :
            best_error = error
            best_model  = model
    
    print("classification error for General SVM training set = ",getCError(train_data, best_model.predict(train_data[0])), "%")
    print("classification error for General SVM test set = ",getCError(test_data, best_model.predict(test_data[0])), "%")
    print("classification error for General SVM validation_set = ",getCError(val_data, best_model.predict(val_data[0])), "%")

In [None]:
#trains and tests Logistic regression and Support Vector Machines with diffrent hyperparameters 
def trainTestLRSVM(hamdf):
    train_set, test_set, validation_set = getTrainTestVal(hamdf)
    train_data = getDataSets(train_set)
    test_data = getDataSets(test_set)
    val_data = getDataSets(validation_set)
    
    LogisticRegressionTrainTest(train_data,test_data,val_data)
    LinearSVMTrainTest(train_data,test_data,val_data)
    GeneralSVMTrainTest(train_data,test_data,val_data)

In [None]:
# this section is just to show how  the logistic regression and SVM do without any feature manipulation
trainTestLRSVM(hamdf)

classification error for Logistic Regression training set =  23.998459167950692 %
classification error for Logistic Regression test set =  26.07816711590296 %
classification error for Logistic Regression validation set =  28.706199460916437 %
classification error for Linear SVM training set =  8.532357473035434 %
classification error for Linear SVM test set =  28.4366576819407 %
classification error for Linear SVM validation set =  27.49326145552561 %
classification error for General SVM training set =  5.816640986132516 %
classification error for General SVM test set =  21.832884097035045 %
classification error for General SVM validation_set =  23.180592991913752 %


In [None]:
#now do the same but with data less skewed
trainTestLRSVM(less_skewed_hamdf)

classification error for Logistic Regression training set =  39.66309341500766 %
classification error for Logistic Regression test set =  57.2192513368984 %
classification error for Logistic Regression validation set =  45.98930481283422 %
classification error for Linear SVM training set =  14.548238897396626 %
classification error for Linear SVM test set =  63.36898395721925 %
classification error for Linear SVM validation set =  51.33689839572193 %
classification error for General SVM training set =  4.670750382848388 %
classification error for General SVM test set =  45.98930481283422 %
classification error for General SVM validation_set =  45.45454545454546 %


In [None]:
# adds the color features to the dfs
hamdf = addColorFeatures(hamdf)
less_skewed_hamdf = addColorFeatures(less_skewed_hamdf)
hamdf.head()

Unnamed: 0,dx,age,sex,neck,lower extremity,back,trunk,abdomen,face,upper extremity,foot,genital,unknown,scalp,chest,hand,ear,acral,image_data,red_var,green_var,blue_var,red_mean,green_mean,blue_mean
2763,6,0.529412,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[183, 137, 136], [190, 148, 153], [212, 174, ...",313.420837,824.873775,855.493453,212.304688,170.314453,174.678711
8103,2,0.294118,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[174, 137, 147], [175, 136, 140], [176, 137, ...",602.308722,820.310303,1083.954575,170.864258,128.390625,126.183594
9388,2,0.588235,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,"[[224, 212, 221], [225, 209, 220], [226, 210, ...",100.803188,498.742942,949.542502,228.869141,195.668945,215.206055
3148,2,0.588235,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"[[226, 153, 162], [228, 153, 168], [227, 150, ...",663.335934,909.795653,1057.270019,214.498047,140.874023,144.749023
5741,2,0.352941,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[191, 112, 124], [188, 112, 130], [171, 99, 1...",823.85643,1499.055603,1894.978848,182.621094,107.570312,116.275391


In [None]:
trainTestLRSVM(hamdf)

classification error for Logistic Regression training set =  23.497688751926038 %
classification error for Logistic Regression test set =  27.156334231805936 %
classification error for Logistic Regression validation set =  27.223719676549862 %
classification error for Linear SVM training set =  8.26271186440678 %
classification error for Linear SVM test set =  31.940700808625333 %
classification error for Linear SVM validation set =  29.64959568733153 %
classification error for General SVM training set =  8.378274268104779 %
classification error for General SVM test set =  22.169811320754718 %
classification error for General SVM validation_set =  23.045822102425873 %


In [None]:
#now do the same but with data less skewed
trainTestLRSVM(less_skewed_hamdf)

classification error for Logistic Regression training set =  39.43338437978561 %
classification error for Logistic Regression test set =  54.010695187165766 %
classification error for Logistic Regression validation set =  43.85026737967914 %
classification error for Linear SVM training set =  4.287901990811638 %
classification error for Linear SVM test set =  55.88235294117647 %
classification error for Linear SVM validation set =  50.26737967914438 %
classification error for General SVM training set =  9.800918836140893 %
classification error for General SVM test set =  44.11764705882353 %
classification error for General SVM validation_set =  42.24598930481284 %


In [None]:
# changes dx the class lable to binary based on wether the underlying condtion is deadly
hamdf = getBinaryOutputClasses(hamdf)
less_skewed_hamdf = getBinaryOutputClasses(less_skewed_hamdf)
hamdf.head()

Unnamed: 0,dx,age,sex,neck,lower extremity,back,trunk,abdomen,face,upper extremity,foot,genital,unknown,scalp,chest,hand,ear,acral,image_data,red_var,green_var,blue_var,red_mean,green_mean,blue_mean
2763,1,0.529412,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[183, 137, 136], [190, 148, 153], [212, 174, ...",313.420837,824.873775,855.493453,212.304688,170.314453,174.678711
8103,0,0.294118,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[174, 137, 147], [175, 136, 140], [176, 137, ...",602.308722,820.310303,1083.954575,170.864258,128.390625,126.183594
9388,0,0.588235,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,"[[224, 212, 221], [225, 209, 220], [226, 210, ...",100.803188,498.742942,949.542502,228.869141,195.668945,215.206055
3148,0,0.588235,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,"[[226, 153, 162], [228, 153, 168], [227, 150, ...",663.335934,909.795653,1057.270019,214.498047,140.874023,144.749023
5741,0,0.352941,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[[191, 112, 124], [188, 112, 130], [171, 99, 1...",823.85643,1499.055603,1894.978848,182.621094,107.570312,116.275391


In [None]:
trainTestLRSVM(hamdf)

classification error for Logistic Regression training set =  12.268875192604 %
classification error for Logistic Regression test set =  16.307277628032345 %
classification error for Logistic Regression validation set =  17.38544474393531 %
classification error for Linear SVM training set =  8.763482280431434 %
classification error for Linear SVM test set =  17.31805929919138 %
classification error for Linear SVM validation set =  16.442048517520213 %
classification error for General SVM training set =  4.449152542372881 %
classification error for General SVM test set =  13.477088948787063 %
classification error for General SVM validation_set =  14.690026954177892 %


In [None]:
#now do the same but with data less skewed
trainTestLRSVM(less_skewed_hamdf)

classification error for Logistic Regression training set =  21.286370597243486 %
classification error for Logistic Regression test set =  37.700534759358284 %
classification error for Logistic Regression validation set =  36.36363636363637 %
classification error for Linear SVM training set =  12.55742725880551 %
classification error for Linear SVM test set =  39.037433155080215 %
classification error for Linear SVM validation set =  37.96791443850267 %
classification error for General SVM training set =  6.738131699846861 %
classification error for General SVM test set =  29.14438502673797 %
classification error for General SVM validation_set =  26.7379679144385 %


# BINARY SIMPLE NEURAL NETS SECTION


In [None]:
# gest classification of Binary 
def getAccNPredictionsNN(x,y, model):
    predictions = model(x)
    points  = []
    totalRight = 0
    for i in range(len(predictions)):
        if(predictions[i] < 0.5) :
            points.append(np.append(x[i].numpy(), [y[i], -1]))
            if(y[i] < 0.5):
                totalRight = totalRight + 1
        else:
            points.append(np.append(x[i].numpy(), [y[i], 1]))
            if(y[i] > 0.5):
                totalRight = totalRight + 1
    print("classification error = ",(1 - totalRight/len(predictions)) * 100,"%")

In [None]:
def feedForwardNeuralNetwork(numLayers,numNodes,activationFunction,x,y) :
    model = getNeuralNetwork(numLayers,numNodes,activationFunction)
    model = model.double()
    loss_fn = torch.nn.BCEWithLogitsLoss()
    learning_rate = 1e-4
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    ##out.backward(torch.randn(1, 10))
    for i in range(1000) :
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        #if i % 100 == 99:
        #    print(i, loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() 
    return model

In [None]:
def getNeuralNetwork(numLayers,numNodes,activationFunction):
    od = OrderedDict() 
    for i in range(len(numNodes) -1):
        od['linear' + str(i)] = torch.nn.Linear(numNodes[i], numNodes[i+1])
        if(i < len(numNodes) - 2):
            if(activationFunction == "relu"):
                od['rel ru' + str(i)] = torch.nn.ReLU()
            if(activationFunction == "sig"):
                od['sig' + str(i)] = torch.nn.Sigmoid()
            if(activationFunction == "tanh"):
                od['Tanh' + str(i)] = torch.nn.Tanh()
            if(activationFunction == "identity activation"):
                #do nothing here 
                fun = "this class"
    return nn.Sequential(od)

In [None]:
def trainTestPlotNN(test_data,train_data, hiddenLayerNumNodes, numHiddenLayers, activationFunction):
    
    inputNum = len(train_data[0][0])
    outputClassesNum = 1
    numNodes = []
    numNodes.append(inputNum)
    
    for i in range(numHiddenLayers):
        numNodes.append(hiddenLayerNumNodes)
    numNodes.append(outputClassesNum)
    train_y = []
    test_y = []
    for yi in train_data[1]:
        train_y.append(np.array(yi))
    for yi in test_data[1]:
        test_y.append(np.array(yi))    
    test_y = np.array(test_y)
    train_y = np.array(train_y)
    
    train_x = torch.from_numpy(train_data[0].astype(float))
    train_y = torch.from_numpy(train_y.astype(float))
    test_x = torch.from_numpy(test_data[0].astype(float))
    test_y = torch.from_numpy(test_y.astype(float))
    
    model = feedForwardNeuralNetwork(numHiddenLayers + 2, numNodes ,activationFunction,train_x,train_y)
    
    print("Training Dataset -")
    getAccNPredictionsNN(train_x,train_y, model)
    print("Test Dataset -")
    getAccNPredictionsNN(test_x,test_y, model)
    return model

In [None]:
# do Binary training with diffrent hyper parameters the NN are all simple fully connected NN
train_set, test_set, validation_set = getTrainTestVal(hamdf)
train_data = getDataSets(train_set)
test_data = getDataSets(test_set)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 1, "relu")

Training Dataset -
classification error =  15.446841294298919 %
Test Dataset -
classification error =  16.0377358490566 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 2, "relu")

Training Dataset -
classification error =  14.714946070878277 %
Test Dataset -
classification error =  15.566037735849058 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=10, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 5, "relu")

Training Dataset -
classification error =  15.446841294298919 %
Test Dataset -
classification error =  16.0377358490566 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=10, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=10, out_features=10, bias=True)
  (rel ru2): ReLU()
  (linear3): Linear(in_features=10, out_features=10, bias=True)
  (rel ru3): ReLU()
  (linear4): Linear(in_features=10, out_features=10, bias=True)
  (rel ru4): ReLU()
  (linear5): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 1, "relu")

Training Dataset -
classification error =  12.731124807396 %
Test Dataset -
classification error =  15.970350404312672 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 2, "relu")

Training Dataset -
classification error =  13.328197226502315 %
Test Dataset -
classification error =  14.690026954177892 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 5, "relu")

Training Dataset -
classification error =  8.513097072419107 %
Test Dataset -
classification error =  14.353099730458219 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=100, out_features=100, bias=True)
  (rel ru2): ReLU()
  (linear3): Linear(in_features=100, out_features=100, bias=True)
  (rel ru3): ReLU()
  (linear4): Linear(in_features=100, out_features=100, bias=True)
  (rel ru4): ReLU()
  (linear5): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 1, "relu")

Training Dataset -
classification error =  14.695685670261938 %
Test Dataset -
classification error =  15.970350404312672 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 2, "relu")

Training Dataset -
classification error =  10.997688751926038 %
Test Dataset -
classification error =  15.161725067385445 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 5, "relu")

Training Dataset -
classification error =  5.1617873651771955 %
Test Dataset -
classification error =  14.892183288409699 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru2): ReLU()
  (linear3): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru3): ReLU()
  (linear4): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru4): ReLU()
  (linear5): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
#the following hyperparameters had the best result on test data after multiple iterations  
validation_data = getDataSets(validation_set)
trainTestPlotNN(validation_data,train_data, 1000, 2, "relu")

Training Dataset -
classification error =  10.22727272727273 %
Test Dataset -
classification error =  16.57681940700808 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
#now do the same but with data less skewed
train_set, test_set, validation_set = getTrainTestVal(less_skewed_hamdf)
train_data = getDataSets(train_set)
test_data = getDataSets(test_set)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 1, "relu")

Training Dataset -
classification error =  29.862174578866775 %
Test Dataset -
classification error =  37.96791443850267 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 2, "relu")

Training Dataset -
classification error =  24.11944869831547 %
Test Dataset -
classification error =  34.491978609625676 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=10, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 10, 5, "relu")

Training Dataset -
classification error =  20.444104134762632 %
Test Dataset -
classification error =  33.68983957219251 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=10, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=10, out_features=10, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=10, out_features=10, bias=True)
  (rel ru2): ReLU()
  (linear3): Linear(in_features=10, out_features=10, bias=True)
  (rel ru3): ReLU()
  (linear4): Linear(in_features=10, out_features=10, bias=True)
  (rel ru4): ReLU()
  (linear5): Linear(in_features=10, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 1, "relu")

Training Dataset -
classification error =  29.32618683001531 %
Test Dataset -
classification error =  35.82887700534759 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 2, "relu")

Training Dataset -
classification error =  19.601837672281775 %
Test Dataset -
classification error =  36.36363636363637 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 100, 5, "relu")

Training Dataset -
classification error =  8.805513016845335 %
Test Dataset -
classification error =  33.42245989304813 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=100, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=100, out_features=100, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=100, out_features=100, bias=True)
  (rel ru2): ReLU()
  (linear3): Linear(in_features=100, out_features=100, bias=True)
  (rel ru3): ReLU()
  (linear4): Linear(in_features=100, out_features=100, bias=True)
  (rel ru4): ReLU()
  (linear5): Linear(in_features=100, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 1, "relu")

Training Dataset -
classification error =  33.154670750382856 %
Test Dataset -
classification error =  40.106951871657756 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 2, "relu")

Training Dataset -
classification error =  7.810107197549765 %
Test Dataset -
classification error =  32.62032085561497 %


Sequential(
  (linear0): Linear(in_features=3095, out_features=1000, bias=True)
  (rel ru0): ReLU()
  (linear1): Linear(in_features=1000, out_features=1000, bias=True)
  (rel ru1): ReLU()
  (linear2): Linear(in_features=1000, out_features=1, bias=True)
)

In [None]:
trainTestPlotNN(test_data,train_data, 1000, 5, "relu")

In [None]:
#the following hyperparameters had the best result on test data after multiple iterations 
validation_data = getDataSets(validation_set)
trainTestPlotNN(validation_data,train_data, 100, 5, "relu")

# CUSTOM CONVOLUTIONAL NEURAL NETWORK SECTION


In [None]:
#starter code for this section such as HAM1 network architecture gotten from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
#gets a dataframe with a sample of the dataset

train_set, test_set, validation_set = getTrainTestVal(hamdf)

class HAMDATALOADER(Dataset):
    def __init__(self, hamdf,transform):
        self.hamdf = hamdf
        self.transform = transform

    def __len__(self):
        return len(self.hamdf)

    def __getitem__(self, index):
        # Load data and get label
        X = Image.open(self.hamdf['image_id'][index])
        y = torch.tensor(int(self.hamdf['dx'][index]))

        if self.transform:
            X = self.transform(X)

        return X, y

In [None]:
#gets test and train dataloaders
def getDataLoaders(hamdf):
    ham_transform = transforms.Compose([transforms.Resize((32,32)),transforms.ToTensor()])
    
    train_set, test_set, validation_set = getTrainTestVal(hamdf)
    
    trains_set = HAMDATALOADER(train_set, transform=ham_transform)
    train_loader = DataLoader(trains_set, batch_size=32, shuffle=True, num_workers=0)
    tests_set = HAMDATALOADER(test_set.reset_index(), transform=ham_transform)
    test_loader = DataLoader(tests_set, shuffle=False, num_workers=0)
    validation_set = HAMDATALOADER(validation_set.reset_index(), transform=ham_transform)
    validation_loader = DataLoader(validation_set, shuffle=False, num_workers=0)
    return train_loader, test_loader, validation_loader

In [None]:
#trains given NN with given trainloader
def trainImageNN(train_loader,net, epochs=10): 
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9)
    for epoch in range(epochs):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

In [None]:
#gets the classification error of given neural net on given test set in the form of a loader
def getClassificationError(test_loader, net, test_set_length):
    totalcorrect = 0
    dataiter = iter(test_loader)
    for i in range(test_set_length):
        image, label = dataiter.next()
        output = net(image)
        _, predicted = torch.max(output, 1)
        #print(predicted[0])
        if(predicted[0] == label[0]):
            totalcorrect = totalcorrect+1
    return (1-(totalcorrect/test_set_length)) * 100

In [None]:
# base line network arcitecture from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
class HamNetOne(nn.Module):
    def __init__(self):
        super(HamNetOne, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [None]:
# Larger network arcitecture
class HamNetTwo(nn.Module):
    def __init__(self):
        super(HamNetTwo, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(64, 128, 5)
        self.fc1 = nn.Linear(128 * 5 * 5, 1200)
        self.fc2 = nn.Linear(1200, 512)
        self.fc3 = nn.Linear(512, 80)
        self.fc4 = nn.Linear(80, 40)
        self.fc5 = nn.Linear(40, 20)
        self.fc6 = nn.Linear(20, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 128 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        x = self.fc6(x)
        return x

In [None]:
# smaller network arcitecture
class HamNetThree(nn.Module):
    def __init__(self):
        super(HamNetThree, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 4, 5)
        self.fc1 = nn.Linear(4 * 5 * 5, 20)
        self.fc2 = nn.Linear(20, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 4 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [None]:
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetOne()
trainImageNN(train_loader,net)
print("classification error for HAMNET 1 on test set  = ",getClassificationError(test_loader, net, len(test_set)), "%")

In [None]:
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetTwo()
trainImageNN(train_loader,net)

print("classification error for HAMNET 2 on test set  = ",getClassificationError(test_loader, net, len(test_loader)), "%")

In [None]:
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetTwo()
trainImageNN(train_loader,net)

print("classification error for HAMNET 3 on test set  = ",getClassificationError(test_loader, net, len(test_loader)), "%")

In [None]:
#All had the same error so just using the last one
print("classification error for HAMNET on validation set  = ",getClassificationError(validation_loader, net, len(validation_loader)), "%")
print("classification error for HAMNET on training set = ",getClassificationError(train_loader, net, len(train_loader)), "%")

In [None]:
#now do the same but with data less skewed
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
hamdf = getMoreEqualSample(hamdf)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetOne()
trainImageNN(train_loader,net)
print("classification error for HAMNET 1 on test  set  = ",getClassificationError(test_loader, net, len(test_loader)), "%")

In [None]:
#now do the same but with data less skewed
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
hamdf = getMoreEqualSample(hamdf)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetTwo()
trainImageNN(train_loader,net)
print("classification error for HAMNET 2 on test set = ",getClassificationError(test_loader, net, len(test_loader)), "%")

In [None]:
#now do the same but with data less skewed
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
hamdf = getMoreEqualSample(hamdf)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetThree()
trainImageNN(train_loader,net,5)
print("classification error for HAMNET 3 on test set  = ",getClassificationError(test_loader, net, len(test_loader)), "%")

In [None]:
#test validation data set because this model had the best results
hamdf = getHamDataFrameWithImagePathOnly(sample_fraction)
hamdf = getMoreEqualSample(hamdf)
train_loader, test_loader, validation_loader = getDataLoaders(hamdf)
net = HamNetTwo()
trainImageNN(train_loader,net)
print("classification error for HAMNET 2 on validation set = ",getClassificationError(validation_loader, net, len(validation_loader)), "%")
print("classification error for HAMNET 2 on training set = ",getClassificationError(train_loader, net, len(train_loader)), "%")

In [None]:
#THANK YOU SECTION
########################################################################################################
########################################################################################################
########################################################################################################
########################################################################################################

In [None]:
#THANK YOU FOR A GREAT SEMESTER STAY SAFE