In [None]:
import pandas as pd
import numpy as np
# Do not truncate the column list when a frame is displayed.
pd.set_option("display.max_columns", None)

# Empty accumulator with the shared schema; each source below is appended to it.
german_datasets = pd.DataFrame(columns=["text", "label"]).astype(
    {"text": str, "label": np.float32}
)

In [None]:
# Load the first news-comment source.
news1 = pd.read_csv("data/german/news/RP-Mod.csv")
news1
# Append only the shared "text"/"label" columns, like every other source in this
# notebook, so that any additional columns in the CSV do not leak into the
# combined frame (and from there into the saved parquet files).
german_datasets = pd.concat([german_datasets, news1[["text", "label"]]], ignore_index=True)

In [None]:
# Load the second news-comment source.
news2 = pd.read_csv("data/german/news/RP-Crowd-1.csv")
news2
# Append news2 itself. Appending news1 here would only add rows that the later
# de-duplication step removes again, leaving this source out of the dataset.
# NOTE(review): assumes RP-Crowd-1.csv exposes "text" and "label" columns like
# RP-Mod.csv does -- confirm against the file.
german_datasets = pd.concat([german_datasets, news2[["text", "label"]]], ignore_index=True)

In [None]:
refugee = pd.read_csv("data/german/refugee/german hatespeech refugees.csv")
# Derive the shared schema: the tweet becomes "text"; the second expert's rating
# is shifted down by 1 and divided by 5 to form "label".
# NOTE(review): presumably the rating is on a 1-6 scale so the label lands in [0, 1] -- confirm.
refugee = refugee.assign(
    text=refugee["Tweet"],
    label=(refugee["Hatespeech Rating (Expert 2)"] - 1) / 5,
)
refugee
german_datasets = pd.concat([german_datasets, refugee[["text", "label"]]], ignore_index=True)

In [None]:
comments_df = pd.read_csv("data/german/foreigners/comments.csv")
annotated_comments_df = pd.read_csv("data/german/foreigners/annotated_comments.csv")

# One row per comment_id holding the mean of its annotated valence values.
grouped_df = annotated_comments_df.groupby("comment_id")["valence"].mean().reset_index()

# Attach the averaged valence to each comment (inner join on comment_id), project
# onto the shared text/label schema, then shift the label down by one.
# NOTE(review): the valence scale is not visible here; the binary sources in this
# notebook use 0/1 labels -- confirm that (mean valence - 1) is on a comparable scale.
final_df = (
    comments_df
    .merge(grouped_df, on="comment_id", how="inner")
    .rename(columns={"message": "text", "valence": "label"})
    .loc[:, ["text", "label"]]
    .assign(label=lambda frame: frame["label"] - 1)
)

german_datasets = pd.concat([german_datasets, final_df], ignore_index=True)

In [None]:
hasoc = pd.read_csv("data/german/hasoc/german_dataset.tsv", sep="\t")
# Binary label from the task_1 tag: NOT -> 0, HOF -> 1.
# (task_1 is expected to hold only these two values; any other value maps to NaN.)
hasoc = hasoc.assign(label=hasoc["task_1"].map({"NOT": 0, "HOF": 1}))

german_datasets = pd.concat(
    [german_datasets, hasoc[["text", "label"]]],
    ignore_index=True,
)

In [None]:
# The training and test files share one layout (tab-separated, no header row),
# so both go through the same steps in turn: name the columns, map the coarse
# tag to 0/1, mark the rows as GermEval, and append them.
for germeval2018_path in (
    "data/german/germeval2018/germeval2018.training.txt",
    "data/german/germeval2018/germeval2018.test.txt",
):
    germeval2018 = pd.read_csv(germeval2018_path, sep="\t", header=None)
    germeval2018.columns = ["text", "label", "label2"]
    germeval2018 = germeval2018.assign(
        label=germeval2018["label"].map({"OTHER": 0, "OFFENSE": 1}),
        origin="germeval",
    )
    german_datasets = pd.concat(
        [german_datasets, germeval2018[["text", "label", "origin"]]],
        ignore_index=True,
    )

In [None]:
# The training file and the gold-label file share one layout (tab-separated, no
# header row), so both go through the same steps in turn: name the columns, map
# the coarse tag to 0/1, mark the rows as GermEval, and append them.
for germeval2019_path in (
    "data/german/germeval2019/Shared-Task-2019_Data_germeval2019.training_subtask1_2.txt",
    "data/german/germeval2019/fz.h-da.de_fileadmin_user_upload_germeval2019GoldLabelsSubtask1_2.txt",
):
    germeval2019 = pd.read_csv(germeval2019_path, sep="\t", header=None)
    germeval2019.columns = ["text", "label", "label2"]
    germeval2019 = germeval2019.assign(
        label=germeval2019["label"].map({"OTHER": 0, "OFFENSE": 1}),
        origin="germeval",
    )
    german_datasets = pd.concat(
        [german_datasets, germeval2019[["text", "label", "origin"]]],
        ignore_index=True,
    )

In [None]:
# Display the combined frame as it stands before de-duplication and cleaning.
german_datasets

In [None]:
# Drop rows without text, then duplicate texts (the first occurrence is kept),
# and renumber last so the resulting index is contiguous. Renumbering before
# the NaN rows are dropped would leave a gap in the index.
german_datasets = (
    german_datasets
    .dropna(subset=["text"])
    .drop_duplicates(subset=["text"])
    .reset_index(drop=True)
)
german_datasets

In [None]:
import re
from bs4 import BeautifulSoup
# Both patterns are compiled once here rather than on every call.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')

# Explicit emoji-bearing ranges only. A single range from U+24C2 to U+1F251
# would also swallow most of the Basic Multilingual Plane above U+24C2
# (CJK, Hangul, box drawing, ...), i.e. ordinary text rather than emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U0000FE0F"             # variation selector left behind by a removed symbol
    "]+",
    flags=re.UNICODE,
)


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Operations, in order:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML markup by parsing with BeautifulSoup (lxml) and keeping only the text
    3. Removes emojis (the code point ranges listed in _EMOJI_PATTERN)
    4. Collapses runs of spaces into a single space
    5. Strips leading and trailing whitespace

    text - Text piece to be cleaned (a str).

    Returns the cleaned string.
    '''
    text = _URL_PATTERN.sub(r'', text)

    # Parse as HTML and keep only the textual content.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _EMOJI_PATTERN.sub(r'', text)

    # Only runs of the space character are collapsed; tabs and newlines are left alone.
    text = re.sub(' +', ' ', text)
    return text.strip()

# Run every text through text_cleaning, element by element.
german_datasets["text"] = german_datasets["text"].map(text_cleaning)

In [None]:
# Shuffle all rows. The seed is fixed so that re-running the notebook writes the
# same row order, and therefore the same mock files, every time.
german_datasets = german_datasets.sample(frac=1, random_state=42).reset_index(drop=True)
german_datasets.to_parquet("german_datasets.parquet")

# Mock evaluation set: the first 2000 shuffled rows whose text is longer than
# 5 characters, written as two row-aligned CSV files (texts and labels).
mock = german_datasets[german_datasets["text"].str.len() > 5]
mock_head = mock[:2000]

mock_test_set = mock_head[["text"]]
mock_test_set.to_csv("mock_test_set.csv", index=False)

mock_test_labels = mock_head[["label"]]
mock_test_labels.to_csv("mock_test_labels.csv", index=False)

In [None]:
# Rows tagged origin == "germeval" form the target-task data; every other row
# (including rows whose origin is missing) goes to the pretraining pool.
germeval = german_datasets.loc[german_datasets["origin"].eq("germeval")]
pretrain = german_datasets.loc[german_datasets["origin"].ne("germeval")]



In [None]:
# Display the rows whose origin is "germeval".
germeval

In [None]:
# Display the rows whose origin is not "germeval".
pretrain

In [None]:
# Label distribution of each subset, GermEval first.
for subset in (germeval, pretrain):
    print(subset["label"].value_counts())

In [None]:
from sklearn.model_selection import train_test_split
# Split the GermEval rows 50/50. The seed is fixed so the saved train/test files
# are the same on every run and results stay comparable.
train, test = train_test_split(germeval, test_size=0.5, random_state=42)

In [None]:
# Persist each split as a parquet file in the working directory.
for parquet_name, split_frame in (
    ("train.parquet", train),
    ("test.parquet", test),
    ("pretrain.parquet", pretrain),
):
    split_frame.to_parquet(parquet_name)

In [None]:
# Print every training example as text, label, separator.
# The two columns are iterated directly, so no loop variable named `id` is left
# behind to shadow the builtin for later cells.
for text, label in zip(train["text"], train["label"]):
    print(text)
    print(label)
    print("----------")