In [1]:
import pandas as pd
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import pickle
import numpy as np
import torch.nn.functional as F
import nltk
import torch
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch import nn

In [2]:
german_stop_words = stopwords.words('german')
german_stop_words.append("fur")

In [3]:
# CONSTANTS
DATA_PATH = "D:/10kgerdataset/"
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"
CLASS_TO_IDX = {
    "etat": 0,
    "inland": 1,
    "international": 2,
    "kultur": 3,
    "panorama": 4,
    "sport": 5,
    "web": 6,
    "wirtschaft": 7,
    "wissenschaft": 8
}

In [4]:
try:
    df_train = pd.read_csv(os.path.join(DATA_PATH, TRAIN_CSV))
    df_test = pd.read_csv(os.path.join(DATA_PATH, TEST_CSV))
except FileNotFoundError:
    print("File was not found at specific location.")
    raise

In [5]:
def remove_punctuation(document: str) -> str:
    return re.sub(r'[^\w\s]', '', document)

def remove_numbers(document: str) -> str:
    return re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$', '', document)

def map_umlaut(document: str) -> str:
    umlaut_mapping = {
        "ß": "b",
        "ü": "u",
        "ä": "a",
        "ö": "o",
        "ë": "e",
    }
    for k, v in umlaut_mapping.items():
        document = document.replace(k, v)
    return document

def stop_word_removal(document: str) -> str:
    return " ".join(w for w in document.split() if w not in german_stop_words)

def save_vocab(vocab, path):
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

def load_vocab(path):
    output = open(path, 'rb')
    vocabulary = pickle.load(output)
    output.close
    return vocabulary

In [6]:
vocab = load_vocab("vocabulary")

In [7]:
def run_pre_processing_pipeline(df, tokenize: bool):
    new_df = df.copy(deep=False)
    
    new_df["text"] = new_df["text"].str.lower()
    new_df["label"] = new_df["label"].str.lower()
    
    new_df = new_df.dropna()
    
    new_df["text"] = new_df["text"].apply(remove_punctuation)
    new_df["text"] = new_df["text"].apply(remove_numbers)
    new_df["text"] = new_df["text"].apply(map_umlaut)
    new_df["text"] = new_df["text"].apply(stop_word_removal)
    
    if tokenize:
        new_df["text"] = new_df["text"].apply(lambda x: x.split())
        new_df["text"] = new_df["text"].apply(lambda x: vocab(x))
        new_df["label"] = new_df["label"].apply(lambda x: CLASS_TO_IDX[x])
    return new_df

In [8]:
class GnadDataset(Dataset):
    def __init__(self, df):
        try:
            data = run_pre_processing_pipeline(df, True)
            self.x = np.array(data["text"])
            self.y = torch.stack([torch.tensor(label) for label in data["label"]])
        except Exception:
            raise
    
    def __len__(self):
        return self.x.shape[0]
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [9]:
train_dataset = GnadDataset(df_train)

In [10]:
def collate_wrapper(batch):
    batch = list(zip(*batch))
    max_length = max(len(arr) for arr in batch[0])
    
    inp = torch.stack([F.pad(torch.tensor(arr), (0, max_length - len(arr))) for arr in batch[0]])
    tgt = torch.stack(batch[1])
    seq_lengths = torch.stack([torch.tensor(len(arr)) for arr in batch[0]])
    return inp, tgt, seq_lengths

In [11]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_wrapper)

# for batch_ndx, (data, target, seq_lengths) in enumerate(train_dataloader):
#     a = data
#     b = target
#     c = seq_lengths
#     break

In [12]:
class Model(nn.Module):
    def __init__(self, vocab_size, e_dim, h_dim, num_layer, output_dim, is_bidirectional: bool = True):
        super().__init__()
        self.h_dim = h_dim
        self.directions = 2 if is_bidirectional else 1
        self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=e_dim, padding_idx=0)
        self.gru_layer = nn.LSTM(input_size=e_dim,
                                hidden_size=h_dim,
                                num_layers=num_layer,
                                batch_first=True,
                                bidirectional=True)
        self.fc_1 = nn.Linear(h_dim * 2, output_dim)
        
    def forward(self, x, seq_lengths):
        x = self.embedding_layer(x)
        x = nn.utils.rnn.pack_padded_sequence(input=x, lengths=seq_lengths, batch_first=True, enforce_sorted=False)
        _, (h_state, c_state) = self.gru_layer(x)
        
        # The outputs of the two directions of the LSTM are concatenated on the last dimension
        x = torch.cat((h_state[-2, :, :], h_state[-1, :, :]), dim=1)
        x = self.fc_1(x)
        return x
        

In [13]:
import torch.optim as optim
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = Model(vocab_size=len(vocab), e_dim=64, h_dim=128, num_layer=4, output_dim=10)
model.train()
model.to('cuda')
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-2)
num_epoch = 20
def train_model():
    for epoch in range(num_epoch):  # loop over the dataset multiple times
        running_loss = 0.0
        for i, (data, labels, seq_lengths) in enumerate(train_dataloader):
            # get the inputs; data is a list of [inputs, labels]
            data = data.to(device=device)
            labels = labels.to(device=device)
            


            optimizer.zero_grad()

            outputs = model(data, seq_lengths)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 20 == 0:    # print every 200 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

        print('Finished Training')


cuda


In [14]:
train_model()

[1,     1] loss: 0.001
[1,    21] loss: 0.023
[1,    41] loss: 0.023
[1,    61] loss: 0.022
[1,    81] loss: 0.022
[1,   101] loss: 0.021
[1,   121] loss: 0.021
[1,   141] loss: 0.021
[1,   161] loss: 0.021
[1,   181] loss: 0.021
[1,   201] loss: 0.021
[1,   221] loss: 0.021
[1,   241] loss: 0.021
Finished Training
[2,     1] loss: 0.001
[2,    21] loss: 0.021
[2,    41] loss: 0.021
[2,    61] loss: 0.020
[2,    81] loss: 0.021
[2,   101] loss: 0.020
[2,   121] loss: 0.021
[2,   141] loss: 0.020
[2,   161] loss: 0.021
[2,   181] loss: 0.020
[2,   201] loss: 0.020
[2,   221] loss: 0.020
[2,   241] loss: 0.020
Finished Training
[3,     1] loss: 0.001
[3,    21] loss: 0.020
[3,    41] loss: 0.020
[3,    61] loss: 0.020
[3,    81] loss: 0.020
[3,   101] loss: 0.019
[3,   121] loss: 0.019
[3,   141] loss: 0.020
[3,   161] loss: 0.019
[3,   181] loss: 0.019
[3,   201] loss: 0.019
[3,   221] loss: 0.019
[3,   241] loss: 0.019
Finished Training
[4,     1] loss: 0.001
[4,    21] loss: 0.019
[4,

In [15]:
test_dataset = GnadDataset(df_test)
test_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_wrapper)

In [16]:
d_, l_, s_ = next(iter(test_loader))

In [17]:
d_ = d_.to(device=device)
l_ = l_.to(device=device)

In [18]:
prediction = model(d_, s_)

In [19]:
prediction.shape

torch.Size([32, 10])

In [20]:
torch.argmax(prediction, dim=1), l_

(tensor([2, 7, 8, 4, 2, 6, 2, 4, 4, 5, 8, 1, 8, 6, 0, 4, 4, 3, 4, 4, 2, 2, 1, 2,
         8, 4, 2, 3, 2, 4, 7, 1], device='cuda:0'),
 tensor([2, 7, 8, 4, 4, 6, 4, 2, 1, 5, 8, 1, 8, 6, 0, 2, 7, 3, 4, 4, 2, 2, 1, 2,
         8, 4, 2, 3, 2, 4, 7, 5], device='cuda:0'))

In [None]:
embedding_layer(b).shape
in_features_shape = torch.flatten(embedding_layer(b), start_dim=1).shape

In [None]:
dense_layer = torch.nn.Linear(in_features_shape[1], 8)

In [None]:
train_x = np.array(df_train["text"])
vocab = build_vocab_from_iterator(train_x, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
save_vocab(vocab, "vocabulary")

In [None]:
a = load_vocab("vocabulary")

In [None]:
seq = torch.tensor(a(df_train["text"][0]), dtype=torch.int)

In [None]:
seq.shape

In [None]:
seq

In [None]:
F.pad(seq, (1, 1))

In [None]:
dataset = GnadDataset(np.array(df_train["text"]), np.array(df_train["label"]))

In [None]:
np.array(df_train["text"]).dtype

In [None]:
import torchtext