In [1]:
#Solve ToxicDetection through feature engineer into logistic regression
# Reference this tutorial for the logistic regression: 
##  https://www.kaggle.com/negation/pytorch-logistic-regression-tutorial
%load_ext autoreload
%autoreload 2


In [2]:
#Input Data needs to be in array of size N x (2)
import utils 
import numpy as np
import pandas as pd
import os
import torch
from torch import nn
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')



In [3]:
pd.set_option('display.max_colwidth', -1)
root_dir = os.path.abspath('.')
data_dir = os.path.join(root_dir, 'dataset')
train = pd.read_csv(os.path.join(data_dir,'train.csv'))
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))


train_x, train_y = utils.featurize(train)
sampleIdx = np.random.choice(np.arange(len(train_y)), 15000, replace = False)
val_x, val_y= train_x[sampleIdx], train_y[sampleIdx]



In [29]:


class LogisticRegression(nn.Module):
    def __init__(self, input_size, num_classes=2, hidden_dim=1024):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(input_size, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, num_classes)
    def forward(self, x):
        
        m = nn.Softmax()
        n = nn.functional.leaky_relu(self.linear(x))
        n2 = self.linear2(n)
        
        return m(n2)
        

In [44]:
#train on data_train with several epochs
epochs = 500 #Can also be changed. The higher this value, the more overfit. 
feature_dims = 21
batchSize = 64
classes = 6 #Binaray for toxic, or not
model = LogisticRegression(feature_dims, num_classes=classes)
#model = model.cuda() #COMMENT OUT TO RUN ON GPU
#Todo: set loss criterion and optimizer
#Run loop of number of epoch, forward and backpass, loss then into optimizer
lossFn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 1e-3)

#Training
for epoch in range(epochs):
    sampleIdx = np.random.choice(np.arange(len(train_y)), batchSize, replace=False)
    x_s = torch.from_numpy(train_x[sampleIdx]).float().to(device)
    y_s = torch.from_numpy(train_y[sampleIdx]).to(device)
    #for i in range(batchSize): 
        #print(x_s[i].shape, y_s[i].shape)
    optimizer.zero_grad()
    scores = model(x_s)
    

    loss = lossFn(scores, torch.max(y_s,1)[1])
    loss.backward()
    optimizer.step()
        
    if (epoch+1) % 5 == 0:
            print ('Epoch: [%d/%d], Loss: %.4f' 
                   % (epoch+1, epochs,  loss.data[0]))



    


  


Epoch: [5/500], Loss: 1.7387
Epoch: [10/500], Loss: 1.6776
Epoch: [15/500], Loss: 1.6492
Epoch: [20/500], Loss: 1.6442
Epoch: [25/500], Loss: 1.5925
Epoch: [30/500], Loss: 1.5432
Epoch: [35/500], Loss: 1.5261
Epoch: [40/500], Loss: 1.5264
Epoch: [45/500], Loss: 1.4983
Epoch: [50/500], Loss: 1.4769
Epoch: [55/500], Loss: 1.4767
Epoch: [60/500], Loss: 1.5307
Epoch: [65/500], Loss: 1.4810
Epoch: [70/500], Loss: 1.4390
Epoch: [75/500], Loss: 1.4498
Epoch: [80/500], Loss: 1.3895
Epoch: [85/500], Loss: 1.4187
Epoch: [90/500], Loss: 1.4641
Epoch: [95/500], Loss: 1.4477
Epoch: [100/500], Loss: 1.3612
Epoch: [105/500], Loss: 1.4020
Epoch: [110/500], Loss: 1.4398
Epoch: [115/500], Loss: 1.3930
Epoch: [120/500], Loss: 1.4204
Epoch: [125/500], Loss: 1.4164
Epoch: [130/500], Loss: 1.4009
Epoch: [135/500], Loss: 1.3658
Epoch: [140/500], Loss: 1.3393
Epoch: [145/500], Loss: 1.3383
Epoch: [150/500], Loss: 1.4014
Epoch: [155/500], Loss: 1.3667
Epoch: [160/500], Loss: 1.4042
Epoch: [165/500], Loss: 1.31

In [47]:
correct = 0
total = 0
sampleIdx = np.random.choice(np.arange(len(train_y)), 15000, replace = False)
val_x, val_y= train_x[sampleIdx], train_y[sampleIdx]



15000

In [61]:
count = 0
for i in range(len(val_x)):
    scores = model(torch.from_numpy(val_x[i]).float().to(device))
    if(count>=10):
        break
    count+=1
    print("predict: ",scores)
    print("test: ",val_y[i])

predict:  tensor([0.9278, 0.0146, 0.0143, 0.0155, 0.0132, 0.0145], grad_fn=<SoftmaxBackward>)
test:  [1 0 1 0 1 1]
predict:  tensor([1.0000e+00, 7.9817e-07, 8.6995e-07, 6.9314e-07, 9.9553e-07, 1.1195e-06],
       grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.9990, 0.0002, 0.0002, 0.0002, 0.0002, 0.0002], grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.8913, 0.0219, 0.0212, 0.0212, 0.0220, 0.0223], grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.3481, 0.1343, 0.1279, 0.1351, 0.1265, 0.1281], grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.7488, 0.0507, 0.0491, 0.0504, 0.0495, 0.0514], grad_fn=<SoftmaxBackward>)
test:  [1 0 0 0 0 0]
predict:  tensor([0.9493, 0.0103, 0.0097, 0.0104, 0.0100, 0.0102], grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.9860, 0.0028, 0.0027, 0.0026, 0.0028, 0.0029], grad_fn=<SoftmaxBackward>)
test:  [0 0 0 0 0 0]
predict:  tensor([0.5333, 0.0953, 0.0905, 0.0969,

  


In [39]:
#TODO:
#Optimize hyperparameters like batchsize, or learning rate or
#Try to add dropout, batch normalization to get highest val acc


In [7]:
#TODO: 
#Now try test set, hopefully for accuracy as good as validation set