In [None]:
import pandas as pd
import os
import re
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import nltk

In [None]:
# Load NLTK's German stop-word list. On a machine where the corpus has not been
# downloaded yet the lookup raises LookupError, so fetch it once and retry;
# this keeps the notebook runnable from a fresh environment.
try:
    german_stop_words = stopwords.words('german')
except LookupError:
    nltk.download('stopwords')
    german_stop_words = stopwords.words('german')
# "fur" is what map_umlaut turns "für" into; keep the folded spelling in the
# list so it is filtered as well.
german_stop_words.append("fur")

In [None]:
# CONSTANTS
# Directory holding the dataset CSV files. Set the TENKGER_DATA_PATH environment
# variable to point at another location without editing the notebook; the
# original hard-coded directory stays the default.
DATA_PATH = os.environ.get("TENKGER_DATA_PATH", "D:/10kgerdataset/")
# File names of the train / test splits inside DATA_PATH.
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

In [None]:
# Resolve both CSV locations first, then read them; a missing file is reported
# with a short message and the original exception is re-raised.
train_csv_path = os.path.join(DATA_PATH, TRAIN_CSV)
test_csv_path = os.path.join(DATA_PATH, TEST_CSV)
try:
    df_train = pd.read_csv(train_csv_path)
    df_test = pd.read_csv(test_csv_path)
except FileNotFoundError:
    print("File was not found at specific location.")
    raise

In [None]:
def remove_punctuation(document: str) -> str:
    """Return ``document`` with every character that is neither a word
    character (``\\w``) nor whitespace (``\\s``) deleted.

    Underscores count as word characters and are therefore kept.
    """
    neither_word_nor_space = re.compile(r'[^\w\s]')
    return neither_word_nor_space.sub('', document)

def remove_numbers(document: str) -> str:
    """Remove standalone numbers from ``document``.

    Three cases are handled, tried in this order at each position:

    * a number at the very start of the text, together with the non-word
      characters that follow it;
    * a number anywhere else that is delimited by word boundaries (only the
      digits are removed, surrounding whitespace is left untouched);
    * a number at the very end of the text, together with the non-word
      characters that precede it.

    Digits embedded in a word (e.g. ``"abc123"``) are kept.
    """
    # The first alternative is anchored with ``^``: anchoring it with ``$``
    # (end of text) followed by digits can never match, which left leading
    # separators behind.
    return re.sub(r'^\d+\W+|\b\d+\b|\W+\d+$', '', document)

def map_umlaut(document: str) -> str:
    """Fold lowercase German special characters in ``document`` to ASCII.

    ``ü``, ``ä``, ``ö`` and ``ë`` are replaced by their base vowel and ``ß``
    by ``ss`` (its standard transliteration; it is not a ``b``). Uppercase
    variants are left untouched, so callers are expected to lowercase first.
    """
    umlaut_mapping = str.maketrans({
        "ß": "ss",
        "ü": "u",
        "ä": "a",
        "ö": "o",
        "ë": "e",
    })
    # A single translate pass replaces all mapped characters at once.
    return document.translate(umlaut_mapping)

def stop_word_removal(document: str) -> str:
    """Drop every whitespace-separated token of ``document`` that appears in
    the module-level ``german_stop_words`` list.

    The remaining tokens are joined with single spaces, so runs of whitespace
    in the input are collapsed. Matching is exact and case-sensitive.
    """
    # Build a set once per document: membership checks become O(1) instead of
    # scanning the whole stop-word list for every token.
    stop_words = frozenset(german_stop_words)
    return " ".join(w for w in document.split() if w not in stop_words)

In [None]:
def run_pre_processing_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the ``text`` and ``label`` columns of ``df`` for model training.

    Steps, in order:

    1. lowercase ``text`` and ``label``;
    2. drop every row that contains a missing value in any column;
    3. on ``text``: remove punctuation, remove standalone numbers, remove
       German stop words, then fold umlauts / ``ß`` to ASCII.

    Stop words are removed *before* umlaut folding: the stop-word list is
    NLTK's list plus the single folded entry "fur", so entries spelled with
    umlauts can only match text that has not been folded yet.

    Parameters
    ----------
    df : DataFrame with string columns ``text`` and ``label``.

    Returns
    -------
    A new DataFrame with the cleaned columns; ``df`` itself is not reassigned.
    """
    new_df = df.copy(deep=False)

    new_df["text"] = new_df["text"].str.lower()
    new_df["label"] = new_df["label"].str.lower()

    # Missing values survive .str.lower() as NaN and are discarded here.
    new_df = new_df.dropna()

    text_steps = (
        remove_punctuation,
        remove_numbers,
        stop_word_removal,  # must see the original umlauts, see docstring
        map_umlaut,
    )
    for step in text_steps:
        new_df["text"] = new_df["text"].apply(step)
    return new_df

In [None]:
# Replace both raw frames with their pre-processed versions.
df_train, df_test = map(run_pre_processing_pipeline, (df_train, df_test))

In [None]:
# Name of the text column. NOTE(review): the Pool objects built in the visible
# cells pass text_features=[0] (a column index) instead of this list — confirm
# whether this variable is still needed.
text_features = ["text"]

In [None]:
# Pull the text (features) and label (targets) columns out of each frame as
# NumPy arrays.
x_train, y_train = (np.array(df_train[column]) for column in ("text", "label"))
x_test, y_test = (np.array(df_test[column]) for column in ("text", "label"))

In [None]:
from catboost import Pool, CatBoostClassifier

In [None]:
from sklearn.utils.class_weight import compute_class_weight
 
# Distinct training labels and their 'balanced' class weights.
# NOTE(review): `weights` is not passed to any model in the visible cells.
classes = np.unique(y_train)
weights = compute_class_weight(y=y_train, classes=classes, class_weight='balanced')

In [None]:
# CatBoost pools for the train and test splits; column 0 (the only column) is
# declared as a text feature.
train_pool = Pool(data=x_train, label=y_train, text_features=[0])
valid_pool = Pool(data=x_test, label=y_test, text_features=[0])

# Keyword arguments shared by every CatBoostClassifier built in this notebook.
catboost_params = dict(
    iterations=500,
    learning_rate=0.2,
    eval_metric='Accuracy',
    task_type='GPU',
    early_stopping_rounds=300,
    use_best_model=True,
    verbose=500,
)

# Fit a classifier on the training pool, monitoring the pool built from the
# test split as eval set (plot=True requests CatBoost's live training plot).
# NOTE(review): the eval set is the same data the later cells report metrics
# on, and catboost_params sets use_best_model=True — presumably the kept
# iteration is selected on that data, which would make the reported scores
# optimistic; confirm and consider a separate validation split.
model = CatBoostClassifier(**catboost_params)
model.fit(train_pool, eval_set=valid_pool, plot=True)

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix    

In [None]:
# Ground truth and the sorted set of classes that occur in it.
y_true = y_test
labels = np.unique(y_true)
# Predict on the hold-out pool and reshape the result to one entry per sample.
y_pred = model.predict(valid_pool)
y_pred = y_pred.reshape(len(y_pred))

In [None]:
# Sanity check: show the shapes of the true labels, the predictions and the
# class list side by side.
y_true.shape, y_pred.shape, labels.shape

In [None]:
# Pass `labels` explicitly so the report rows always line up with
# `target_names`; with only `target_names` given, scikit-learn raises a
# ValueError as soon as the predictions contain a class missing from y_true.
print(classification_report(y_true, y_pred, labels=labels, target_names=labels))

In [None]:
# Confusion matrix of the hold-out predictions (per scikit-learn's documented
# convention: rows are true classes, columns are predicted classes); the bare
# expression displays it as the cell output.
cf_matrix = confusion_matrix(y_true, y_pred)
cf_matrix

In [None]:
# Heatmap of the confusion matrix for the testing dataset.
fig_dims = (15, 15)
fig, ax = plt.subplots(figsize=fig_dims)
# Show class names on the ticks only when they line up one-to-one with the
# matrix; otherwise fall back to seaborn's automatic tick labels.
tick_labels = labels if len(labels) == cf_matrix.shape[0] else "auto"
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt='d',
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    ax=ax,
)
# Title, axis labels and tick sizes are applied after drawing because
# sns.heatmap resets the axis labels of the axes it draws on. Rows of a
# scikit-learn confusion matrix are true classes, columns predicted classes.
ax.set_title('Confusion matrix on the testing dataset', fontsize=25)
ax.set_xlabel('predicted label', fontsize=24)
ax.set_ylabel('true label', fontsize=24)
ax.tick_params(axis='both', labelsize=18)

# K-fold cross-validation training

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Load the full article collection used for the k-fold experiment.
articles_csv_path = os.path.join(DATA_PATH, "articles.csv")
df = pd.read_csv(articles_csv_path)

In [None]:
# Apply the same cleaning pipeline as for the train/test splits; `df` is
# rebound to the cleaned frame, so the raw frame is no longer reachable here.
df = run_pre_processing_pipeline(df)

In [None]:
# Text (features) and label (targets) columns of the full dataset as arrays.
x, y = (np.array(df[column]) for column in ("text", "label"))

In [None]:
# 5-fold splitter with shuffling. A fixed random_state makes the fold
# assignment identical on every run, so results are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Collects one fitted model per fold.
models = []

In [None]:
# Train one CatBoost model per fold and collect it in `models`.
# Fold-scoped names are used on purpose: reusing x_train / y_test / train_pool /
# valid_pool / model here would silently overwrite the hold-out objects created
# earlier, so re-running an evaluation cell afterwards would score the last
# fold instead of the hold-out split.
for fold_train_index, fold_valid_index in kf.split(x):
    fold_train_pool = Pool(
        x[fold_train_index],
        y[fold_train_index],
        text_features=[0]
    )
    # The held-out part of the fold serves as eval set during fitting.
    fold_valid_pool = Pool(
        x[fold_valid_index],
        y[fold_valid_index],
        text_features=[0]
    )

    fold_model = CatBoostClassifier(**catboost_params)
    fold_model.fit(fold_train_pool, eval_set=fold_valid_pool)
    models.append(fold_model)