In [1]:
import pandas as pd
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import pickle
import numpy as np
import torch.nn.functional as F
import nltk
import torch
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
german_stop_words = stopwords.words('german')
german_stop_words.append("fur")

In [3]:
# CONSTANTS
DATA_PATH = "D:/10kgerdataset/"
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"
CLASS_TO_IDX = {
    "etat": 0,
    "inland": 1,
    "international": 2,
    "kultur": 3,
    "panorama": 4,
    "sport": 5,
    "web": 6,
    "wirtschaft": 7,
    "wissenschaft": 8
}

In [4]:
try:
    df_train = pd.read_csv(os.path.join(DATA_PATH, TRAIN_CSV))
    df_test = pd.read_csv(os.path.join(DATA_PATH, TEST_CSV))
except FileNotFoundError:
    print("File was not found at specific location.")
    raise

In [5]:
def remove_punctuation(document: str) -> str:
    return re.sub(r'[^\w\s]', '', document)

def remove_numbers(document: str) -> str:
    return re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$', '', document)

def map_umlaut(document: str) -> str:
    umlaut_mapping = {
        "ß": "b",
        "ü": "u",
        "ä": "a",
        "ö": "o",
        "ë": "e",
    }
    for k, v in umlaut_mapping.items():
        document = document.replace(k, v)
    return document

def stop_word_removal(document: str) -> str:
    return " ".join(w for w in document.split() if w not in german_stop_words)

def save_vocab(vocab, path):
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

def load_vocab(path):
    output = open(path, 'rb')
    vocabulary = pickle.load(output)
    output.close
    return vocabulary

In [6]:
vocab = load_vocab("vocabulary")

In [7]:
def run_pre_processing_pipeline(df, tokenize: bool):
    new_df = df.copy(deep=False)
    
    new_df["text"] = new_df["text"].str.lower()
    new_df["label"] = new_df["label"].str.lower()
    
    new_df = new_df.dropna()
    
    new_df["text"] = new_df["text"].apply(remove_punctuation)
    new_df["text"] = new_df["text"].apply(remove_numbers)
    new_df["text"] = new_df["text"].apply(map_umlaut)
    new_df["text"] = new_df["text"].apply(stop_word_removal)
    
    if tokenize:
        new_df["text"] = new_df["text"].apply(lambda x: x.split())
        new_df["text"] = new_df["text"].apply(lambda x: vocab(x))
        new_df["label"] = new_df["label"].apply(lambda x: CLASS_TO_IDX[x])
    return new_df

In [13]:
class GnadDataset(Dataset):
    def __init__(self, df):
        try:
            data = run_pre_processing_pipeline(df, True)
            self.x = data["text"]
            self.y = torch.stack([torch.tensor(label, dtype=torch.int) for label in data["label"]])
        except Exception:
            raise
    
    def __len__(self):
        return self.x.shape[0]
    
    def __getitem__(self, index):
        return self.x[index], self.y[index]

In [78]:
class CustomBatch(object):
    def __init__(self, data):
        data = list(zip(*data))
        self.inp = self.__init_inputs(data[0])
        self.tgt = torch.stack(data[1])
    
    def __init_inputs(self, data):
        max_length = max(len(arr) for arr in data)
        data = torch.stack([F.pad(torch.tensor(arr, dtype=torch.int), (0, max_length - len(arr))) for arr in data])
        return data

In [79]:
def collate_wrapper(batch):
    return CustomBatch(batch)

In [80]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_wrapper)

for batch_ndx, sample in enumerate(train_dataloader):
    a = sample.inp
    b = sample.tgt
    break

873


In [85]:
a[0]

tensor([    22,  47824,  48713,     14,   4618, 109712,   4618,    607,    272,
          1858,   5423,    952,   4530,    237,     25, 175118,   3496,  22377,
         17507,   6061,   1656,   1389,  14276,    371,  10199,     79,    293,
            22,    401,  47824,  48713,     53,   4618,     14,   1296,   2967,
         48713,    271,   7653,     28,    984,   6996,   1978,    332,     14,
            57,   1211,   2185,   2223,    930,    926,   7085,  17507,    237,
          5947,  17507, 106266,    884,   1641,  86297,  45064,   7721,   1778,
           107,   9293,   2077,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0, 

In [14]:
train_dataset = GnadDataset(df_train)

In [26]:
train_dataset[:32][0].apply(lambda x: len(x)).max()

684

AttributeError: 'tuple' object has no attribute 'apply'

In [72]:
df_train = run_pre_processing_pipeline(df_train, True)
df_test = run_pre_processing_pipeline(df_test, True)

In [None]:
labels = torch.stack([torch.tensor(category, dtype=torch.int) for category in df_train.label])

In [None]:
max_length_batch

In [73]:
df_train["text"].apply(lambda x: len(x)).mean()
max_length_batch = df_train["text"][start_idx:start_idx + batch_size].apply(lambda x: len(x)).max()

NameError: name 'start_idx' is not defined

In [None]:
batch_size = 32
start_idx = 0
max_length_batch

In [None]:
b = torch.stack([F.pad(torch.tensor(l, dtype=torch.int), (0, max_length_batch - len(l))) for l in df_train['text'][start_idx:start_idx+batch_size]])

In [None]:
b.shape

In [None]:
embedding_layer = torch.nn.Embedding(len(vocab), 4, padding_idx=0)

In [None]:
embedding_layer(b).shape
in_features_shape = torch.flatten(embedding_layer(b), start_dim=1).shape

In [None]:
dense_layer = torch.nn.Linear(in_features_shape[1], 8)

In [None]:
train_x = np.array(df_train["text"])
vocab = build_vocab_from_iterator(train_x, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [None]:
save_vocab(vocab, "vocabulary")

In [None]:
a = load_vocab("vocabulary")

In [None]:
seq = torch.tensor(a(df_train["text"][0]), dtype=torch.int)

In [None]:
seq.shape

In [None]:
seq

In [None]:
F.pad(seq, (1, 1))

In [None]:
dataset = GnadDataset(np.array(df_train["text"]), np.array(df_train["label"]))

In [None]:
np.array(df_train["text"]).dtype

In [None]:
import torchtext