In [1]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split

In [2]:
train_data = pd.read_csv("data/train.csv")
test_data = pd.read_csv("data/test.csv")

In [68]:
split_word = [x.split(' ') for x in list(train_data['text'])]
split_word_test = [x.split(' ') for x in list(test_data['text'])]
train_data['text_split'] = split_word
test_data['text_split'] = split_word_test

In [71]:
train, valid = train_test_split(train_data, test_size = 0.2, random_state = 42) 

In [82]:
train.to_csv('data/train_use.csv', index=False)
valid.to_csv('data/valid_use.csv', index=False)
test_data.to_csv('data/test_use.csv', index=False)

## Sklearn 

In [75]:
train = pd.read_csv("data/train_use.csv")
valid = pd.read_csv("data/valid_use.csv")
test = pd.read_csv("data/test_use.csv")

In [76]:
from sklearn.feature_extraction.text import HashingVectorizer
hv = HashingVectorizer(n_features=5000)

In [77]:
X_raw = [str(train['keyword'][i]) + ' ' + train['text'][i] for i in range(len(train['keyword']))]
X_raw_valid = [str(valid['keyword'][i]) + ' ' + valid['text'][i] for i in range(len(valid['keyword']))]
X_raw_test = [str(test['keyword'][i]) + ' ' + test['text'][i] for i in range(len(test['keyword']))]
print(len(X_raw))
print(len(X_raw_valid))
print(len(X_raw_test))
print(len(X_raw) + len(X_raw_valid) + len(X_raw_test))

6090
1523
3263
10876


In [79]:
'''
t = pd.DataFrame()
t['words'] = X_raw
t['target'] = list(train['target'])
v = pd.DataFrame()
v['words'] = X_raw_valid
v['target'] = list(valid['target'])
te = pd.DataFrame()
te['words'] = X_raw_test
t.to_csv('data/train_rnn.csv', index=False)
v.to_csv('data/valid_rnn.csv', index=False)
te.to_csv('data/test_rnn.csv', index=False)
'''

"\nt = pd.DataFrame()\nt['words'] = X_raw\nt['target'] = list(train['target'])\nv = pd.DataFrame()\nv['words'] = X_raw_valid\nv['target'] = list(valid['target'])\nte = pd.DataFrame()\nte['words'] = X_raw_test\nt.to_csv('data/train_rnn.csv', index=False)\nv.to_csv('data/valid_rnn.csv', index=False)\nte.to_csv('data/test_rnn.csv', index=False)\n"

In [22]:
X_whole = X_raw.copy()
X_whole.extend(X_raw_valid)
X_whole.extend(X_raw_test)
print(len(X_whole))

10876


In [23]:
X = hv.transform(X_whole).toarray()
X.shape

(10876, 5000)

In [24]:
X_train = X[:6090]
X_valid = X[6090:(6090+1523)]
X_test = X[(6090+1523):]

In [74]:
y_train = list(train['target'])
y_valid = list(valid['target'])

In [10]:
from sklearn.linear_model import LogisticRegression

In [26]:
lr = LogisticRegression(random_state=0).fit(X_train, y_train)
lr.score(X_valid, y_valid)

0.7839789888378201

In [27]:
lr_predict = lr.predict(X_test)
len(lr_predict)

3263

In [41]:
preds_df = pd.DataFrame({"id": test_data["id"], "target": lr_predict})
preds_df.to_csv(f"data/output_lr_15000.csv", index=False)

## RNN

In [40]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset

In [58]:
X_train = np.array(X_train)
X_valid = np.array(X_valid)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_valid = np.array(y_valid)
print(X_train.shape)

(6090, 5000)


In [35]:
featuresTrain = torch.from_numpy(X_train)
targetsTrain = torch.from_numpy(y_train).type(torch.LongTensor)
featuresvalid = torch.from_numpy(X_valid)
targetsvalid = torch.from_numpy(y_valid).type(torch.LongTensor)
featuresTest = torch.from_numpy(X_test)

In [48]:
# Create RNN Model
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(RNNModel, self).__init__()
        # Number of hidden dimensions
        self.hidden_dim = hidden_dim
        # Number of hidden layers
        self.layer_dim = layer_dim
        # RNN
        self.rnn = nn.RNN(input_dim, hidden_dim, layer_dim, batch_first=True, nonlinearity='relu')
        # Readout layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = Variable(torch.zeros(self.layer_dim, x.size(0), self.hidden_dim))
        # One time step
        out, hn = self.rnn(x, h0)
        out = self.fc(out[:, -1, :]) 
        return out

In [44]:
batch_size = 100
n_iters = 8000
num_epochs = n_iters / (len(X_train) / batch_size)
num_epochs = int(num_epochs)
print(num_epochs)

131


In [45]:
train = TensorDataset(featuresTrain,targetsTrain)
valid = TensorDataset(featuresvalid,targetsvalid)

In [46]:
train_loader = DataLoader(train, batch_size = batch_size, shuffle = False)
valid_loader = DataLoader(valid, batch_size = batch_size, shuffle = False)

In [56]:
# Create RNN
input_dim = 1    # input dimension
hidden_dim = 100  # hidden layer dimension
layer_dim = 1     # number of hidden layers
output_dim = 2   # output dimension

model = RNNModel(input_dim, hidden_dim, layer_dim, output_dim)

# Cross Entropy Loss 
error = nn.CrossEntropyLoss()

# SGD Optimizer
learning_rate = 0.05
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [57]:
seq_dim = 1 
loss_list = []
iteration_list = []
accuracy_list = []
count = 0
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):

        train  = Variable(images.view(-1, seq_dim, input_dim))
        labels = Variable(labels)
            
        # Clear gradients
        optimizer.zero_grad()
        
        # Forward propagation
        outputs = model(train)
        
        # Calculate softmax and ross entropy loss
        loss = error(outputs, labels)
        
        # Calculating gradients
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        count += 1
        
        if count % 250 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            # Iterate through test dataset
            for images, labels in valid_loader:
                images = Variable(images.view(-1, seq_dim, input_dim))
                
                # Forward propagation
                outputs = model(images)
                
                # Get predictions from the maximum value
                predicted = torch.max(outputs.data, 1)[1]
                
                # Total number of labels
                total += labels.size(0)
                
                correct += (predicted == labels).sum()
            
            accuracy = 100 * correct / float(total)
            
            # store loss and iteration
            loss_list.append(loss.data)
            iteration_list.append(count)
            accuracy_list.append(accuracy)
            if count % 500 == 0:
                # Print Loss
                print('Iteration: {}  Loss: {}  Accuracy: {} %'.format(count, loss.data[0], accuracy))

RuntimeError: Expected object of scalar type Double but got scalar type Float for argument #2 'mat2' in call to _th_mm