# Data Preprocessing Danish 

## Libraries

In [None]:
import pkg_resources
import subprocess
import sys
import os
import pandas as pd
from pandas_ods_reader import read_ods

import seaborn as sns
import matplotlib.pyplot as plt 

import nltk
import re, itertools
from string import punctuation

from googletrans import Translator

import emoji

# Download NLTK's 'punkt' resource (presumably required by the
# nltk.sent_tokenize / nltk.word_tokenize wrappers defined further down).
nltk.download('punkt')

import warnings
# Suppress all warnings for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. about writing to a
# slice of a DataFrame) that could point at real problems.
warnings.filterwarnings("ignore")

## Import data

In [None]:
# Show the current working directory; the data path in the next cell is
# built relative to it.
os.getcwd()

In [None]:
# Directory holding the DKhate data, relative to the current working directory.
path = f"{os.getcwd()}/dkhate/data/"

# Read the spreadsheet; the second argument (2) selects the sheet to load
# (presumably the second sheet -- confirm against pandas_ods_reader).
df = read_ods(f"{path}dkhate.ods", 2)

# Preview the first two rows.
display(df.head(2))

In [None]:
# Keep only the text, its translation, the identifier/source columns and the
# three label columns. .copy() makes df an independent frame so the label fix
# below is not written into a slice of the frame that was loaded.
df = df[
    [
        "Text",
        "Translated Text",
        "uid",
        "Source",
        "Sub-Task A",
        "Sub-Task B",
        "Sub-Task C",
    ]
].copy()

# Normalise the spellings "not" / "NoT" of the Sub-Task A label to "NOT".
# The assignment targets the "Sub-Task A" column explicitly through .loc:
# a bare `df[mask] = "NOT"` overwrites EVERY column of the matching rows
# (text, uid, the other labels, ...) with the string "NOT".
df.loc[df["Sub-Task A"].isin(["not", "NoT"]), "Sub-Task A"] = "NOT"

df.head(2)

# Preprocessing

In [None]:
# Remove punctuation. Lowercasing is not done here; apply .lower() to the
# result if it is needed.
def clean_text(string):
    """Return `string` without any character contained in `string.punctuation`."""
    kept = []
    for char in string:
        if char in punctuation:
            continue
        kept.append(char)
    return ''.join(kept)

# Reduce orthographic lengthening to two characters
def remove_duplicates(string):
    """Shorten every run of identical consecutive characters to at most two.

    Runs of length one or two are left untouched; longer runs keep only
    their first two characters.
    """
    pieces = []
    for _, run in itertools.groupby(string):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

def sent_tokenize(string, language="norwegian"):
    """Split `string` into sentences using `nltk.sent_tokenize`.

    Args:
        string: The text to split.
        language: Name of the NLTK Punkt model to use. Defaults to
            "norwegian", the value this helper has always used, so existing
            one-argument calls (e.g. `Series.apply(sent_tokenize)`) behave as
            before. Pass another model name such as "danish" for text in a
            different language.

    Returns:
        The result of `nltk.sent_tokenize` (a list of sentence strings).
    """
    return nltk.sent_tokenize(string, language=language)

def word_tokenize(string, language="norwegian"):
    """Split `string` into word tokens using `nltk.word_tokenize`.

    Args:
        string: The text to tokenize.
        language: Name of the NLTK Punkt model used for the underlying
            sentence splitting. Defaults to "norwegian", the value this
            helper has always used, so existing one-argument calls behave as
            before. Pass another model name such as "danish" for text in a
            different language.

    Returns:
        The result of `nltk.word_tokenize` (a list of token strings).
    """
    return nltk.word_tokenize(string, language=language)

def convert_user(string):
    """Replace every "@User" or "@USER" placeholder in `string` with "Navn".

    Matching is case-sensitive: other spellings such as "@user" are left
    unchanged.
    """
    mention = re.compile(r'(@User|@USER)')
    return mention.sub('Navn', string)


In [None]:
# Sentence-split every entry of the "Text" column ...
test = df["Text"].apply(sent_tokenize)
# ... and record how many sentences each entry produced.
length = test.apply(len)

In [None]:
# Build the "cleaned" column from "Text": strip punctuation first, then
# shorten runs of repeated characters to two.
df["cleaned"] = df["Text"].apply(clean_text).apply(remove_duplicates)

# Preview the result.
df["cleaned"].head()

## Filtered dataframes

In [None]:
# Preview the first rows of the frame, including the new "cleaned" column.
df.head()

In [None]:
# Show the current working directory again before writing the output file.
os.getcwd()

In [None]:
# Write the preprocessed frame to 'dk_preprocessed.csv'.
# NOTE(review): the destination is an absolute path on one user's machine;
# on any other machine this cell fails unless that directory exists.
df.to_csv('/Users/vildearntzen/Desktop/master_kode/master_kode/data/'+'dk_preprocessed.csv')

In [None]:
# Sub-Task A: rows labelled "OFF" versus every other row.
# NOTE(review): `noff` is "anything but OFF", so rows with a missing or
# misspelled Sub-Task A label also end up in it.
noff = df[df["Sub-Task A"].ne("OFF")]
off = df[df["Sub-Task A"].eq("OFF")]

# Sub-Task B: rows labelled "TIN" and rows labelled "UNT".
tin = df[df["Sub-Task B"].eq("TIN")]
unt = df[df["Sub-Task B"].eq("UNT")]

# Sub-Task C: rows labelled "GRP", "IND" and "OTH".
grp = df[df["Sub-Task C"].eq("GRP")]
ind = df[df["Sub-Task C"].eq("IND")]
oth = df[df["Sub-Task C"].eq("OTH")]

In [None]:
# Show the number of rows in each label subset, in the order
# noff, off, unt, tin, grp, ind, oth.
for subset in (noff, off, unt, tin, grp, ind, oth):
    display(len(subset))

## dk_cleaned.csv

In [None]:
# Data directory (absolute path on the author's machine).
path = '/Users/vildearntzen/Desktop/master_kode/master_kode/data/'

# Load the preprocessed data with translations, using the first CSV column
# as the index, and preview two rows.
df = pd.read_csv(f"{path}dk_preprocessed_translations.csv", index_col=0)
df.head(2)

In [None]:
# Row count before rows without a "cleaned" value are dropped.
len(df)

In [None]:
# Keep only the rows whose "cleaned" value is present (not NaN).
df = df.loc[df["cleaned"].notna()]

In [None]:
# Row count after dropping rows without a "cleaned" value.
len(df)

## Convert @USER to Navn

In [None]:
# Load the dataset that carries the machine-translated columns.
# The directory comes from `path`, the variable the neighbouring cells assign
# and use; the upper-case `PATH` used here before is never assigned in the
# visible notebook, so the cell raised NameError.
# NOTE(review): confirm that dk.csv lives in the same directory as the other
# CSV files read through `path`.
dk = pd.read_csv(path + 'dk.csv')

# Source column -> new, shorter column name for each translation.
translated_cols = {
    "translatepy_no": "translatepy",
    "easynmt_no_opus-mt": "opus-mt",
    "easynmt_no_m2m_100_418M": "418M",
    "easynmt_no_m2m_100_1.2B": "1.2B",
}

# For each translation, store a copy with the user placeholders replaced
# under the short name; the original columns are left in place.
for old, new in translated_cols.items():
    dk[new] = dk[old].apply(convert_user)

In [None]:
# Replace the user placeholders in the original and the cleaned text, in place.
for column in ("text", "cleaned"):
    dk[column] = dk[column].apply(convert_user)

# Preview the first 15 rows.
dk.head(15)

## Save dataframe to csv

In [None]:
# Load 'preprocessed.csv' from the data directory into df.
# NOTE(review): the section heading says "Save dataframe to csv", but this
# cell only reads a file and nothing in the section writes `dk` out.
# Confirm whether a `to_csv` call was intended here.
df = pd.read_csv(path + 'preprocessed.csv')

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)