In [None]:
# Solve toxic-comment detection by feeding engineered features into a
# logistic-regression-style classifier.
# Reference tutorial for logistic regression in PyTorch:
##  https://www.kaggle.com/negation/pytorch-logistic-regression-tutorial
# Load IPython's autoreload extension in mode 2 -- presumably so that edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [None]:
#Input Data needs to be in array of size N x (2)
import utils 
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
# Request the GPU, then downgrade the flag when CUDA is not actually present
# so that USE_GPU always reflects the device really in use.
USE_GPU = True
USE_GPU = USE_GPU and torch.cuda.is_available()
device = torch.device('cuda') if USE_GPU else torch.device('cpu')



In [None]:
# Show full cell text when displaying frames. `None` means "no limit";
# the old sentinel -1 is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
root_dir = os.path.abspath('.')
data_dir = os.path.join(root_dir, 'dataset')
train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))


# utils.featurize is project code; it presumably returns (features, labels)
# as numpy arrays indexed by row -- verify against utils.
train_x, train_y = utils.featurize(train)

# Random validation subset (at most 15000 rows, capped so small datasets do not
# make np.random.choice raise).
# NOTE(review): the subset is drawn from the training rows and is NOT removed
# from train_x/train_y, so validation accuracy is optimistic.
sampleIdx = np.random.choice(np.arange(len(train_y)), min(15000, len(train_y)), replace=False)
val_x, val_y = train_x[sampleIdx], train_y[sampleIdx]



In [None]:


class LogisticRegression(nn.Module):
    """Small feed-forward classifier: Linear -> leaky ReLU -> Linear.

    Despite the name, the network has one hidden layer. ``forward`` returns
    the raw output of the second linear layer; no softmax is applied here.

    Args:
        input_size: number of input features per sample.
        num_classes: size of the output score vector.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, input_size, num_classes=2, hidden_dim=1024):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept as-is.
        self.linear = nn.Linear(input_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return un-normalised class scores for ``x``."""
        hidden = nn.functional.leaky_relu(self.linear(x))
        return self.linear2(hidden)
        

In [None]:
# Train one independent binary classifier per label column of train_y.
epochs = 300  # SGD steps per label (each step uses one random mini-batch). Higher -> more overfit.
feature_dims = 21
batchSize = 32
classes = 6  # Number of label columns; each gets its own binary (toxic / not) model.
models = []
lossFns = []
optimizers = []

# Training
for c in range(classes):
    # The model must live on the same device as the batches fed to it.
    models.append(LogisticRegression(feature_dims).to(device))
    lossFns.append(nn.CrossEntropyLoss())
    optimizers.append(torch.optim.SGD(models[c].parameters(), lr=1e-3))
    for epoch in range(epochs):
        sampleIdx = np.random.choice(np.arange(len(train_y)), batchSize, replace=False)
        x_s = torch.from_numpy(train_x[sampleIdx]).float().to(device)
        # CrossEntropyLoss expects int64 class indices as targets.
        y_s = torch.from_numpy(train_y[sampleIdx]).long().to(device)

        optimizers[c].zero_grad()
        scores = models[c](x_s)
        loss = lossFns[c](scores, y_s[:, c])
        loss.backward()
        optimizers[c].step()

        if epoch % 100 == 0:
            # loss is a 0-dim tensor: .item() is the supported way to read it
            # (indexing with [0] raises on current PyTorch).
            print('Class: %d, Epoch: [%d/%d], Loss: %.4f'
                  % (c, epoch, epochs, loss.item()))



    


In [None]:
# Per-label accuracy on a random sample of (at most) 300 rows.
# NOTE(review): the sample is drawn from the training data, so this measures
# fit on seen rows rather than true held-out accuracy.
correctCount = np.zeros((classes))
sampleIdx = np.random.choice(np.arange(len(train_y)), min(300, len(train_y)), replace=False)
val_x, val_y = train_x[sampleIdx], train_y[sampleIdx]
with torch.no_grad():  # inference only: no autograd graph needed
    for c in range(classes):
        # Feed the batch on whatever device this model's weights live on.
        modelDevice = next(models[c].parameters()).device
        valInputs = torch.from_numpy(val_x).float().to(modelDevice)
        # .cpu() before .numpy(): CUDA tensors cannot be converted directly.
        scores = models[c](valInputs).cpu().numpy()
        predictions = np.argmax(scores, axis=1)
        correctCount[c] = np.sum(predictions == val_y[:, c])
print(correctCount / len(val_x))


In [None]:
class modelMaker:
    """Train and evaluate one binary classifier per label column.

    Each of the ``classes`` label columns gets its own ``LogisticRegression``
    model, loss function and SGD optimizer, stored at the same index in
    ``self.models``, ``self.lossFns`` and ``self.optimizers``.

    Args:
        epochs: number of SGD steps per label (one random mini-batch each).
        feature_dims: number of input features per sample.
        batchSize: mini-batch size (capped at the dataset size when training).
        classes: number of label columns.
        lr: SGD learning rate.
        useGpu: run on CUDA when it is available; otherwise fall back to CPU.
    """

    def __init__(self, epochs=300, feature_dims=21, batchSize=32, classes=6, lr=1e-3, useGpu=False):
        self.epochs = epochs  # The higher this value, the more overfit.
        self.feature_dims = feature_dims
        self.batchSize = batchSize
        self.classes = classes  # One binary (toxic / not) model per label column.
        self.lr = lr
        self.useGpu = useGpu
        # Single source of truth for tensor/model placement, so models and
        # batches can never end up on different devices.
        self.device = torch.device('cuda' if useGpu and torch.cuda.is_available() else 'cpu')
        self.models = []
        self.lossFns = []
        self.optimizers = []

    # Training
    def train(self, tr_x, tr_y):
        """Fit a fresh model for every label column.

        Args:
            tr_x: numpy array of features, indexed by row.
            tr_y: numpy array of labels; column ``c`` holds the class index
                for label ``c``.

        Returns:
            numpy array of shape (classes, epochs) with the mini-batch loss
            recorded at every step.
        """
        # Start from scratch so repeated calls do not stack extra models
        # behind the ones index ``c`` refers to.
        self.models, self.lossFns, self.optimizers = [], [], []
        losses = np.zeros((self.classes, self.epochs))
        # Sampling without replacement cannot draw more rows than exist.
        batch = min(self.batchSize, len(tr_y))
        for c in range(self.classes):
            model = LogisticRegression(self.feature_dims).to(self.device)
            lossFn = nn.CrossEntropyLoss()
            optimizer = torch.optim.SGD(model.parameters(), lr=self.lr)
            self.models.append(model)
            self.lossFns.append(lossFn)
            self.optimizers.append(optimizer)
            for epoch in range(self.epochs):
                sampleIdx = np.random.choice(np.arange(len(tr_y)), batch, replace=False)
                x_s = torch.from_numpy(tr_x[sampleIdx]).float().to(self.device)
                # CrossEntropyLoss expects int64 class indices as targets.
                y_s = torch.from_numpy(tr_y[sampleIdx]).long().to(self.device)
                optimizer.zero_grad()
                scores = model(x_s)
                loss = lossFn(scores, y_s[:, c])
                loss.backward()
                # Step this instance's optimizer (not a notebook-level global).
                optimizer.step()
                losses[c, epoch] = loss.item()
        return losses

    # Test
    def test(self, ts_x, ts_y):
        """Compute per-label accuracy of the trained models.

        Args:
            ts_x: numpy array of features, indexed by row.
            ts_y: numpy array of labels; column ``c`` is compared against the
                arg-max prediction of model ``c``.

        Returns:
            Tuple ``(accuracy, recall)``. ``accuracy`` is a numpy array with
            one entry per label; ``recall`` is not implemented and is ``None``.
        """
        correct = np.zeros((self.classes))
        inputs = torch.from_numpy(np.asarray(ts_x)).float().to(self.device)
        with torch.no_grad():  # inference only
            for c in range(self.classes):
                # .cpu() before .numpy(): CUDA tensors cannot be converted directly.
                scores = self.models[c](inputs).cpu().numpy()
                predictions = np.argmax(scores, axis=1)
                correct[c] = np.sum(predictions == ts_y[:, c])
        # TODO: recall per label would be TP / (TP + FN) for the positive class.
        return (correct / len(ts_y), None)  # Accuracy, Recall

    


In [None]:
#TODO:
#Optimize hyperparameters such as batch size or learning rate.
#Try adding dropout or batch normalization to get the highest validation accuracy.


In [None]:
#TODO:
#Evaluate on the test set; accuracy should ideally be as good as on the validation set.