In [8]:
import pandas as pd
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import pickle
import numpy as np
import nltk

In [2]:
german_stop_words = stopwords.words('german')
german_stop_words.append("fur")

In [3]:
# CONSTANTS
DATA_PATH = "D:/10kgerdataset/"
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

In [4]:
try:
    df_train = pd.read_csv(os.path.join(DATA_PATH, TRAIN_CSV))
    df_test = pd.read_csv(os.path.join(DATA_PATH, TEST_CSV))
except FileNotFoundError:
    print("File was not found at specific location.")
    raise

In [5]:
def remove_punctuation(document: str) -> str:
    return re.sub(r'[^\w\s]', '', document)

def remove_numbers(document: str) -> str:
    return re.sub(r'$\d+\W+|\b\d+\b|\W+\d+$', '', document)

def map_umlaut(document: str) -> str:
    umlaut_mapping = {
        "ß": "b",
        "ü": "u",
        "ä": "a",
        "ö": "o",
        "ë": "e",
    }
    for k, v in umlaut_mapping.items():
        document = document.replace(k, v)
    return document

def stop_word_removal(document: str) -> str:
    return " ".join(w for w in document.split() if w not in german_stop_words)

In [9]:
vocab = load_vocab("vocabulary")

In [10]:
def run_pre_processing_pipeline(df, tokenize: bool):
    new_df = df.copy(deep=False)
    
    new_df["text"] = new_df["text"].str.lower()
    new_df["label"] = new_df["label"].str.lower()
    
    new_df = new_df.dropna()
    
    new_df["text"] = new_df["text"].apply(remove_punctuation)
    new_df["text"] = new_df["text"].apply(remove_numbers)
    new_df["text"] = new_df["text"].apply(map_umlaut)
    new_df["text"] = new_df["text"].apply(stop_word_removal)
    
    if tokenize:
        new_df["text"] = new_df["text"].apply(lambda x: x.split())
        new_df["text"] = new_df["text"].apply(lambda x: vocab(x))
    return new_df

In [30]:
def custom_tokenizer(document: str) -> list:
    return document.split()

In [11]:
df_train = run_pre_processing_pipeline(df_train, True)
df_test = run_pre_processing_pipeline(df_test, True)

In [66]:
df_train["text"].apply(lambda x: len(x)).mean()
max_length_batch = df_train["text"][33:64].apply(lambda x: len(x)).max()

In [67]:
max_length_batch

923

In [71]:
b.shape

torch.Size([31, 923])

In [70]:
b = torch.stack([F.pad(torch.tensor(l, dtype=torch.int), (0, max_length_batch - len(l))) for l in df_train['text'][33:64]])

In [72]:
embedding_layer = torch.nn.Embedding(len(vocab), 4, padding_idx=0)

In [73]:
embedding_layer(b)[0]

tensor([[ 1.1612,  0.2437,  0.4255,  0.6899],
        [ 0.1357, -0.5789, -0.4601,  0.8077],
        [-0.8990,  0.6507,  0.9905, -0.4202],
        ...,
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  0.0000]], grad_fn=<SelectBackward0>)

In [42]:
import pickle

In [14]:
import torch
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [33]:
train_x = np.array(df_train["text"])
vocab = build_vocab_from_iterator(train_x, specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

In [6]:
def save_vocab(vocab, path):
    output = open(path, 'wb')
    pickle.dump(vocab, output)
    output.close()

def load_vocab(path):
    output = open(path, 'rb')
    vocabulary = pickle.load(output)
    output.close
    return vocabulary

In [45]:
save_vocab(vocab, "vocabulary")

In [52]:
a = load_vocab("vocabulary")

In [63]:
seq = torch.tensor(a(df_train["text"][0]), dtype=torch.int)

In [66]:
seq.shape

torch.Size([221])

In [38]:
import torch.nn.functional as F

In [70]:
seq

tensor([   352,  24088,    226,    204,  14200, 179203, 183228,   4274,      9,
           806,     24,   4515,   4255,   2372, 167108,    261,      3,   1345,
             3,  34016,   2073,  18606, 144188,      8,    554,  34340,   1206,
          8968,   1940,    536,    765,     14,    299,     40,     67,    870,
            51,     64,     17,   1474,      7,    638,  15374,      4,    450,
           575,   2430,    337,   1247,     13,    805,   1505,   4019,  58645,
             7,    509,      7,    553,   4255,   8968,     32,    189,   9917,
           656,     19,   5499,   1345,   2073,     89,   7539,      7,   9582,
         20899, 105111,   2804,    602, 110475,  28057,  28861, 165967,  91221,
            14,    325,     28,   2073,   2372,  34360,     11,     21, 137325,
           113,     28,   2073,   1077,  34340,    325,    294,      3,     34,
          3070,   2178,    188,     47,  92872,   4692,    330,    325,     41,
        180743,    348,    229,  54975, 

In [71]:
F.pad(seq, (1, 1))

tensor([     0,    352,  24088,    226,    204,  14200, 179203, 183228,   4274,
             9,    806,     24,   4515,   4255,   2372, 167108,    261,      3,
          1345,      3,  34016,   2073,  18606, 144188,      8,    554,  34340,
          1206,   8968,   1940,    536,    765,     14,    299,     40,     67,
           870,     51,     64,     17,   1474,      7,    638,  15374,      4,
           450,    575,   2430,    337,   1247,     13,    805,   1505,   4019,
         58645,      7,    509,      7,    553,   4255,   8968,     32,    189,
          9917,    656,     19,   5499,   1345,   2073,     89,   7539,      7,
          9582,  20899, 105111,   2804,    602, 110475,  28057,  28861, 165967,
         91221,     14,    325,     28,   2073,   2372,  34360,     11,     21,
        137325,    113,     28,   2073,   1077,  34340,    325,    294,      3,
            34,   3070,   2178,    188,     47,  92872,   4692,    330,    325,
            41, 180743,    348,    229, 

In [41]:
class GnadDataset(Dataset):
    def __init__(self, x_train, y_train):
        x = run_preprocessing_pipeline(df)
        
        self.x_train = torch.from_numpy(x_train)
        self.y_train = torch.from_numpy(y_train)
    
    def __len__(self):
        return self.x_train.shape[0]
    
    def __getitem__(self, index):
        return self.x_train[index], self.y_train[index]

In [15]:
dataset = GnadDataset(np.array(df_train["text"]), np.array(df_train["label"]))

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

In [17]:
np.array(df_train["text"]).dtype

dtype('O')

In [18]:
import torchtext

ModuleNotFoundError: No module named 'torchtext'