In [9]:
import torch
import math
from d2l import torch as d2l
from torch import nn
import pandas as pd


In [10]:
def get_train_data():
    training_data = pd.read_csv('data\\EXIST2021_training.tsv', sep='\t')
    train_text, train_label = [], []
    for index in range(3437):
        items = training_data.iloc[index]
        train_text.append(items['text'])
        train_label.append(1 if items['task1'] == 'sexist' else 0)
    return train_text, train_label

def get_test_data():
    testing_data = pd.read_csv('data\\EXIST2021_test_labeled.tsv', sep='\t')
    test_text, test_label = [], []
    for index in range(1000):
        item = testing_data.iloc[index]
        test_text.append(item['text'])
        test_label.append(1 if item['task1'] == 'sexist' else 0)
    return test_text, test_label

In [14]:
def load_data(batch_size, num_steps=280):
    train_data = get_train_data()
    test_data = get_test_data()
    train_tokens = d2l.tokenize(train_data[0], token='word')
    test_tokens = d2l.tokenize(test_data[0], token='word')
    vocab = d2l.Vocab(train_tokens, min_freq=5)
    print(train_tokens[0])
    train_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in train_tokens])
    test_features = torch.tensor([d2l.truncate_pad(
        vocab[line], num_steps, vocab['<pad>']) for line in test_tokens])
    train_iter = d2l.load_array((train_features, torch.tensor(train_data[1])), batch_size)
    test_iter = d2l.load_array((test_features, torch.tensor(test_data[1])), batch_size, is_train=False)
    return train_iter, test_iter, vocab

In [11]:
glove_embedding = d2l.TokenEmbedding('glove.42b.300d')

In [12]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, **kwargs):
        super(BiRNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, num_hiddens, num_layers=num_layers, bidirectional=True, dropout=0.5)
        self.decoder = nn.Linear(num_hiddens * 4, 2)

    def forward(self, inputs):
        embeddings = self.embedding(inputs.T)
        self.encoder.flatten_parameters()
        outputs, _ = self.encoder(embeddings)
        encoding = torch.cat((outputs[0], outputs[-1]), dim=1)
        outs = self.decoder(encoding)
        return outs

In [15]:
batch_size = 128
train_iter, test_iter, vocab = load_data(batch_size)

['She', 'calls', 'herself', '"anti-feminazi"', 'how', 'about', 'shut', 'the', 'fucking', 'up', 'on', 'your', 'vile', 'commentary', 'on', 'an', 'elderly', 'responsible', 'citizen', 'tu', 'sach', 'muuch', 'ghani', 'baawri-bewdi', 'hai', 'bey', 'https://t.co/ZMxTDwsY5D']


In [13]:
vocab['commentary']


0

In [14]:
embed_size, num_hiddens, num_layers = 300, 128, 1
devices = d2l.try_all_gpus()
net = BiRNN(len(vocab), embed_size, num_hiddens, num_layers)

def init_weights(m):
    if type(m) == nn.Linear:
        nn.init.xavier_normal_(m.weight)
    if type(m) == nn.LSTM:
        for param in m._flat_weights_names:
            if "weight" in param:
                nn.init.xavier_normal_(m._parameters[param])

net.apply(init_weights)

BiRNN(
  (embedding): Embedding(2106, 300)
  (encoder): LSTM(300, 128, dropout=0.5, bidirectional=True)
  (decoder): Linear(in_features=512, out_features=2, bias=True)
)

In [15]:
embeds = glove_embedding[vocab.idx_to_token]
embeds.shape

torch.Size([2106, 300])

In [16]:
net.embedding.weight.data.copy_(embeds)
net.embedding.weight.requires_grad = False

In [24]:
import torch
X = torch.tensor([[0.9,0.1],[0.4, 0.6]])
import numpy as np

In [26]:


from d2l import torch as d2l
device = d2l.try_gpu()
tt = X.to(device)
tt = np.argmax(tt.cpu(), axis=1)
y =  torch.tensor([1, 1])
y = y.to(device)


In [23]:
from sklearn import metrics
metrics.f1_score(tt.cpu().numpy(), y.cpu().numpy())

0.6666666666666666

In [6]:
def sldd(tokens):
    return [12,12,12], [11,11,11]

ss = sldd(12)
ss + ([11,1,1],)

([12, 12, 12], [11, 11, 11], [11, 1, 1])