In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

path = '/kaggle/input/assignment-4-data'
train_x_path = path + '/train_x.csv'
train_y_path = path + '/train_y.csv'
test_x_path = path + '/non_comp_test_x.csv'
test_y_path = path + '/non_comp_test_y.csv'


In [4]:
#now we will load our data
import torch
from torchtext.vocab import GloVe
embedding_glove = GloVe(name='6B', dim=300)


.vector_cache/glove.6B.zip: 862MB [02:43, 5.28MB/s]                               
100%|█████████▉| 399999/400000 [00:45<00:00, 8830.70it/s]


In [5]:
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer('basic_english')
train_x_data = ((pd.read_csv(train_x_path))['Title'])
train_y_data = ((pd.read_csv(train_y_path))['Genre']).to_numpy()
test_x_data = ((pd.read_csv(test_x_path))['Title'])
test_y_data = ((pd.read_csv(test_y_path))['Genre']).to_numpy()
device = "cpu"
print(f"Using {device} device")
batch_size = 300
embedding_size = 10
def tokenize(data,max_length):
    total_points = len(data)
    final_dataset = torch.zeros([total_points,max_length,300]).to(device)
    for point in range(total_points):
        sentence = data[point]
        word_list = tokenizer(sentence)
        value = embedding_glove.get_vecs_by_tokens(word_list)
        total_words = len(word_list)
        if total_words >= max_length:
            final_dataset[point,:,:] = value[0:max_length,:]
        else:
            final_dataset[point,:total_words,:] = value
    return final_dataset
train_embedding = tokenize(train_x_data,embedding_size)
test_embedding = tokenize(test_x_data,embedding_size)

Using cpu device


In [6]:
import torch.nn as nn
import torch.optim as optim

In [7]:
class BidirectionalRNN(nn.Module):
    def __init__(self,input_size,hidden_size,num_layers,total_classes,sequence_length):
        super(BidirectionalRNN,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.total_classes = total_classes
        self.sequence_length = sequence_length
        self.bidirectionalRNN = nn.RNN(input_size,hidden_size,num_layers,batch_first = True,bidirectional = True)
        #we will put the context vector as input
        self.fc1 = nn.Linear(2*hidden_size,128)
        self.fc2 = nn.Linear(128,total_classes)
    def forward(self,x):
        batch_size = x.size(0)
        h0 = torch.zeros([2*self.num_layers,batch_size,self.hidden_size],dtype= torch.float).to(device)
        output,final_hidden = self.bidirectionalRNN(x,h0)
        output = self.fc1(output[:,-1,:])
        output = (nn.Tanh())(output)
        output = self.fc2(output)
        return output
neural_net = BidirectionalRNN(300,128,1,30,embedding_size).to(device)

In [8]:
loss_fn = nn.CrossEntropyLoss()
epochs = 25
learning_rate = 0.0006
optimizer = optim.Adam(neural_net.parameters(), lr = learning_rate)
neural_net.zero_grad()

In [9]:
def accuracy(input,output):
    prediction = neural_net(input)
    points = len(output)
    correct = 0
    for i in range(points):
        index = torch.argmax(prediction[i,:])
        if index == output[i]:
            correct += 1
    return (100*correct)/(points)
def train_model():
    total_points = train_embedding.size(0)
    for i in range(epochs):
        for j in range(0,total_points,batch_size):
            target = 0
            data_points = 0
            if j + batch_size <= total_points:
                data_points = train_embedding[j:(j+batch_size),:,:]
                target = torch.tensor(train_y_data[j:j+batch_size]).to(device)
            else:
                data_points = train_embedding[j:,:,:]
                target = torch.tensor(train_y_data[j:]).to(device)
            optimizer.zero_grad()
            output = neural_net(data_points).to(device)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()
train_model()
print(f"Train accuracy after epoch {epochs} is {accuracy(train_embedding,train_y_data)}")
print(f"Test accuracy after epoch {epochs} is {accuracy (test_embedding,test_y_data)}")
print('Training Complete!')

Train accuracy after epoch 25 is 57.71637426900585
Test accuracy after epoch 25 is 43.80701754385965
Training Complete!


In [11]:
comp_test = tokenize(test_x_data,embedding_size)
predictions = neural_net(comp_test)
def output_file_creator(predictions):
    id_value = []
    genre = []
    points = predictions.size(0)
    for i in range(points):
        id_value.append(i)
        index = torch.argmax(predictions[i,:])
        genre.append(index.item())
    dictCsv = {'Id' : id_value,'Genre':genre}
    df1 = pd.DataFrame(dictCsv)
    df1.to_csv('non_comp_test_pred_y.csv',index = False)
output_file_creator(predictions)