In [1]:
import pandas as pd
import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, random_split

SEED = 1003
# Dedicated generator used for the train/val/test split.
generator = torch.Generator().manual_seed(SEED)
# Seed the global RNG as well: weight initialisation, dropout masks and
# DataLoader shuffling all draw from it, so without this a Restart & Run All
# gives different numbers each time.
torch.manual_seed(SEED)

In [2]:
class DataSet(Dataset):
    """Pairs a saved feature tensor with the label column of a CSV file.

    Attributes:
        tensors: object loaded from ``tensors_loc``; indexed by row in
            ``__getitem__`` (presumably shaped (n_rows, n_features) — TODO confirm).
        df: the DataFrame read from ``df_loc``.
        labels: the ``'ukrainian'`` column of ``df`` as a float tensor.
        msg: the ``'msg'`` column of ``df`` as an array.
        normalized_tensors: ``tensors`` standardised per feature with the
            statistics of *all* rows. ``__getitem__`` does not return it.
    """

    def __init__(self, tensors_loc, df_loc):
        """Load features from ``tensors_loc`` and labels/messages from ``df_loc``.

        Args:
            tensors_loc: path of a file written with ``torch.save``.
            df_loc: path of a CSV file with ``'ukrainian'`` and ``'msg'`` columns.
        """
        # NOTE(review): torch.load unpickles the file — only load files you trust.
        self.tensors = torch.load(tensors_loc)
        self.df = pd.read_csv(df_loc)
        self.labels = torch.from_numpy(self.df.loc[:, 'ukrainian'].values).float()
        self.msg = self.df.loc[:, 'msg'].values

        feature_std = self.tensors.std(0)
        # A constant feature has std == 0; dividing by it would yield NaN, so
        # such features are only centred (they become all zeros).
        feature_std = torch.where(feature_std > 0, feature_std, torch.ones_like(feature_std))
        self.normalized_tensors = (self.tensors - self.tensors.mean(0)) / feature_std

    def __len__(self):
        """Return the number of rows in the CSV file."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for ``idx``, taken from ``tensors`` and ``labels``."""
        return self.tensors[idx], self.labels[idx]

In [3]:
dataset = DataSet('word_tensors.pt', 'processed.csv')

# random_split advances the generator on every call, so re-seed it first:
# re-running this cell then reproduces the same partition instead of a new one.
train_dataset, val_dataset, test_dataset = random_split(
    dataset, lengths=[.8, .1, .1], generator=generator.manual_seed(SEED)
)

# Standardisation statistics come from the training rows only, so nothing
# about the validation/test rows leaks into the preprocessing.
train_rows = dataset.tensors[train_dataset.indices]
train_mean = train_rows.mean(0)
train_std = train_rows.std(0)
# A feature that is constant over the training rows has std == 0; dividing by
# it would produce NaN/inf, so such features are only centred.
train_std = torch.where(train_std > 0, train_std, torch.ones_like(train_std))

# All three subsets index into the same `dataset.tensors`, so standardise the
# rows of each subset in place with the training statistics.
for subset in (train_dataset, val_dataset, test_dataset):
    dataset.tensors[subset.indices] = (dataset.tensors[subset.indices] - train_mean) / train_std

len(dataset), len(train_dataset)

(138059, 110448)

In [4]:
BATCH_SIZE = 1_000

# Only the training loader reshuffles every epoch. Evaluation loaders keep a
# fixed order so the per-batch-averaged validation/test metrics do not depend
# on RNG state (the last, smaller batch always holds the same rows).
train = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [5]:
class Model(nn.Module):
    """Three-layer fully connected binary classifier.

    Each hidden layer is Linear -> ReLU -> BatchNorm1d -> Dropout; the output
    layer is Linear -> sigmoid, so ``forward`` returns one value in [0, 1] per
    input row.

    Args:
        dropout: dropout probability applied after both hidden layers.
        in_features: width of the input vectors (default 300).
        hidden1: width of the first hidden layer (default 64).
        hidden2: width of the second hidden layer (default 16).
    """

    def __init__(self, dropout=.25, in_features=300, hidden1=64, hidden2=16):
        super().__init__()
        # One Dropout module is shared by both hidden layers (it has no parameters).
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(in_features, hidden1)
        self.bn1 = nn.BatchNorm1d(hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.bn2 = nn.BatchNorm1d(hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, x):
        """Map a ``(batch, in_features)`` input to a flat ``(batch,)`` tensor of sigmoid outputs."""
        x = F.relu(self.fc1(x))
        x = self.bn1(x)
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.bn2(x)
        x = self.dropout(x)
        # fc3 yields (batch, 1); flatten so the output lines up with 1-D label tensors.
        x = torch.sigmoid(self.fc3(x)).view(-1)
        return x

In [6]:
# Binary cross-entropy on probabilities: Model.forward already applies a sigmoid.
L = nn.BCELoss()

# Network with its default hyper-parameters.
model = Model()

# Adam at its default learning rate, with a small L2 penalty via weight_decay.
optimizer = optim.Adam(
    model.parameters(),
    weight_decay=1e-5,
)

In [7]:
def train_one_epoch(epoch_index):
    """Run one optimisation pass over the module-level ``train`` loader.

    Uses the module-level ``model``, ``L`` (loss function) and ``optimizer``.
    The caller is responsible for putting ``model`` into training mode.

    Args:
        epoch_index: index of the current epoch; accepted for the caller's
            convenience and not used by the computation.

    Returns:
        The mean of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches.
    """
    total_loss = 0.
    n_batches = 0

    for x, labels in train:
        out = model(x)
        loss = L(out, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    # An empty loader has no meaningful average; report NaN instead of
    # failing on an unbound loop variable.
    if n_batches == 0:
        return float('nan')
    return total_loss / n_batches

In [8]:
EPOCHS = 50
# Report roughly ten times per run; max(1, ...) keeps the modulo below valid
# when EPOCHS < 10 (EPOCHS // 10 would be 0).
REPORT_EVERY = max(1, EPOCHS // 10)

# Lowest average validation loss seen so far.
best_vloss = 1_000_000.

for epoch in range(EPOCHS):

    model.train()
    train_loss = train_one_epoch(epoch)

    # Evaluation: eval mode freezes dropout/batch-norm behaviour, and no_grad
    # avoids building an autograd graph for the validation forward passes.
    model.eval()
    val_loss = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for x, labels in val:
            out = model(x)
            val_loss += L(out, labels).item()
            # Accuracy is accumulated from the same forward pass, so no second
            # pass over the whole validation set is needed.
            n_correct += (out.round() == labels).sum().item()
            n_seen += labels.numel()

    # Mean of per-batch losses, matching how train_one_epoch averages.
    avg_vloss = val_loss / max(1, len(val))
    best_vloss = min(best_vloss, avg_vloss)

    if epoch % REPORT_EVERY == 0:
        print('EPOCH {}:'.format(epoch + 1))
        print(f'{100 * n_correct / max(1, n_seen):.1f}%')
        print('LOSS train {:.3f} valid {:.3f}'.format(train_loss, avg_vloss))

EPOCH 1:
72.4
LOSS train 0.602 valid 0.546
EPOCH 6:
74.9
LOSS train 0.504 valid 0.503
EPOCH 11:
75.6
LOSS train 0.485 valid 0.492
EPOCH 16:
75.9
LOSS train 0.470 valid 0.488
EPOCH 21:
76.1
LOSS train 0.460 valid 0.488
EPOCH 26:
76.2
LOSS train 0.452 valid 0.484
EPOCH 31:
76.8
LOSS train 0.447 valid 0.486
EPOCH 36:
76.4
LOSS train 0.441 valid 0.485
EPOCH 41:
77.0
LOSS train 0.437 valid 0.482
EPOCH 46:
76.6
LOSS train 0.435 valid 0.484


In [11]:
# Set eval mode explicitly rather than relying on whatever mode an earlier
# cell left the model in, and skip autograd bookkeeping for this forward pass.
model.eval()
with torch.no_grad():
    x, labels = test_dataset[:]
    out = model(x)
print('Final Test Accuracy')
print(f'{100 * (out.round() == labels).float().mean().item():.1f}%')

Final Test Accuracy
76.8%


In [12]:
# Persist only the learned parameters and buffers (the state dict), not the module object.
# NOTE(review): the train_mean / train_std used to standardise the inputs are
# not written here; confirm they are stored elsewhere if this model is reloaded.
torch.save(
    obj=model.state_dict(),
    f='DL_model_on_DL_vectors.pt',
)