#### Importing Dependencies

In [1]:
import json
import string
import nltk
import pandas as pd

from enum import Enum
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from IPython.display import clear_output

# Download NLTK's stopwords
nltk.download('stopwords')

# Download compressed assets from Google Drive
!gdown --id 1TTddIx7Bwwl2o3hYnMKDXxJEskiPMXde
!unrar x "assets.rar"

clear_output()
print('Successfuly downloaded dependencies')

Successfuly downloaded dependencies


#### Enumerating vectorization techniques

In [2]:
class Vectorizer(Enum):
    TF_IDF_VECTORIZER = 'TfidfVectorizer'
    COUNT_VECTORIZER = 'CountVectorizer'
    HASHING_VECTORIZER = 'HashingVectorizer'

#### Mounting tables into a dictionary of dataframes

In [3]:
def get_raw_train_test() -> dict:
    training = pd.read_csv('assets/training_pairs.csv')
    testing = pd.read_csv('assets/testing_pairs.csv')

    return {'train': training, 'test': testing}

#### Fetching textual data from the residual json assets

In [4]:
def __get_json_text_by_id(file_id: str) -> str:
    try:
        file = open(f'assets/webpages/{file_id}.json')
        data = json.load(file)
        return data['text']
    except FileNotFoundError:
        return ''

#### Preprocessing dataframe

In [5]:
def preprocess_df(df: pd.DataFrame):
    # Retrieves textual data by pair_id
    df['Text1'] = df['pair_id'].apply(lambda cell: __get_json_text_by_id(cell.split('_')[0]))
    df['Text2'] = df['pair_id'].apply(lambda cell: __get_json_text_by_id(cell.split('_')[1]))

    # Remove unnecessary columns
    df.drop(df.columns.difference(['Text1', 'Text2', 'Overall']), axis=1, inplace=True)

    # Remove null & empty texts
    df['Text1'].replace('', None, inplace=True)
    df['Text2'].replace('', None, inplace=True)
    df.dropna(subset=['Text1', 'Text2'], inplace=True)

#### Implementing basic natural language processing procedures
* Removing Punctuation
* Removing stops words for both Deutsch and English
* Removing escape sequences
* Lowercasing all characters

In [6]:
def implement_nlp(df: pd.DataFrame):
    remove_punctuation(df)
    remove_stop_words(df)
    remove_escape_sequences(df)
    lowercase_characters(df)

In [7]:
def remove_punctuation(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    punctuation = list(string.punctuation)
    for column in columns:
        df[column] = df[column].apply(lambda row: ''.join([i for i in row if i not in punctuation]))

In [8]:
def remove_stop_words(df: pd.DataFrame):
    stop_words_de = stopwords.words('dutch')
    stop_words_en = stopwords.words('english')
    df['Text1'] = df['Text1'].apply(lambda row: ' '.join([i for i in row.split() if i not in stop_words_de]))
    df['Text2'] = df['Text2'].apply(lambda row: ' '.join([i for i in row.split() if i not in stop_words_en]))

In [9]:
def remove_escape_sequences(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    escapes = ''.join([chr(char) for char in range(1, 32)])
    for column in columns:
        df[column] = df[column].apply(lambda row: ''.join([i for i in row if i not in escapes]))

In [10]:
def lowercase_characters(df: pd.DataFrame, columns=None):
    if columns is None:
        columns = ['Text1', 'Text2']

    for column in columns:
        df[column] = df[column].str.lower()

#### Converting textual data into an array of vectors based on a predefined vectorization technique

In [11]:
def vectorize_text(df: pd.DataFrame, columns=None, method: Vectorizer = Vectorizer.TF_IDF_VECTORIZER):
    if columns is None:
        columns = ['Text1', 'Text2']

    if method == Vectorizer.TF_IDF_VECTORIZER:
        vectorizer = TfidfVectorizer()
    elif method == Vectorizer.COUNT_VECTORIZER:
        vectorizer = CountVectorizer()
    else:
        vectorizer = HashingVectorizer()
        print('HashingVectorizer appears to have a much higher time complexity due to its high memory usage.')

    count = 1
    for column in columns:
        x = vectorizer.fit_transform(df[column])
        df[f'Vector{count}'] = x.toarray().tolist()
        count = count + 1

#### Methods invocation

In [12]:
train_test = get_raw_train_test()

train = train_test['train']
test = train_test['test']

preprocess_df(train)
preprocess_df(test)

implement_nlp(train)
implement_nlp(test)

print(test['Text1'].head(1).tolist())
print(test['Text2'].head(1).tolist())

vectorize_text(train)
vectorize_text(test)

print(len(train['Vector1'][0]))
print(len(train['Vector2'][0]))

['kommentare zum artikel bitte beachten sie beim verfassen eines kommentars regeln höflicher kommunikation viola crell 03042020 1049 tom erste entschuldigung wenn ich mich einmische aber das ministerium berlin hat ihre hilfe wirklich abgelehnt wenn sie das schriftlich haben dann sollten sie das wirklich veröffentlichen tom erste 03042020 0716 hatte ich vergessen wäre das nicht auch ein artikel für fw redaktion tom erste 02042020 2108 oliver hilgendorff sie dürfen davon ausgehen vorn rollo´s runter kneipe auf privatgrundstück kamera an hintertür und gut ist wenn sich ein grüner bürgermeister aus berlin aus solidarität mit seiner freundin absichtlich ansteckt wenn trotz warnungen import von sogenannten asylbewerbern bei nacht und nebel ungestört weitergeht und wenn gewisse großfamilien bei ihren ausflügen von polizei und ordnungsamt tunlichst ruhe gelassen werden dann kann es auch gar nicht so übel aussehen freiwillige unterstützung ist vom bundesgesundheitsamt offenbar auch nicht gewüns