# 데이터셋 준비
- https://github.com/2runo/Curse-detection-data — 욕설 감지 데이터셋
- https://github.com/jason9693/APEACH — 검증 데이터셋
- https://github.com/kocohub/korean-hate-speech — 혐오 표현 데이터셋

In [None]:
import os
from IPython.display import display
%matplotlib inline

# Working folders: downloads are stored under "raw", exported CSV files under "data/".
for folder in ("raw", "data/"):
    os.makedirs(folder, exist_ok=True)

In [None]:
from urllib import request
import pandas as pd

In [None]:
def download(url, save_path):
    """Download ``url`` and store the response body at ``save_path``.

    The parent directory of ``save_path`` is created when it does not exist,
    so callers do not have to prepare the target folder themselves.

    Args:
        url: Address of the resource to fetch.
        save_path: Local file path the payload is written to.
    """
    import os  # local import keeps this cell runnable on its own

    # ``None`` is passed through untouched so urlretrieve keeps its own
    # handling of a missing file name.
    parent_dir = os.path.dirname(save_path) if save_path is not None else ""
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    request.urlretrieve(url, save_path)


def download_curse_detection():
    """Fetch the Curse-detection dataset and load it into a DataFrame.

    The raw file is saved as ``raw/curse.txt`` and then parsed as
    ``|``-separated rows with the column names ``text`` and ``label``.

    Returns:
        The parsed pandas DataFrame.
    """
    local_path = "raw/curse.txt"
    download(
        "https://raw.githubusercontent.com/2runo/Curse-detection-data/master/dataset.txt",
        local_path,
    )

    # NOTE(review): on_bad_lines="skip" drops malformed rows without any
    # report — confirm that losing those rows is acceptable.
    frame = pd.read_csv(
        local_path,
        sep="|",
        names=["text", "label"],
        on_bad_lines="skip",
    )
    return frame

# Download and parse the curse-detection data, then show the resulting frame
# as the cell output.
curse = download_curse_detection()
curse

In [None]:
from sklearn.model_selection import train_test_split

# Quick sanity check: number of fully duplicated rows and the distinct labels.
duplicate_rows = curse.duplicated().sum()
display(duplicate_rows)
print(curse["label"].unique())

In [None]:
# Show the label column as the cell output.
curse["label"]

In [None]:
# Shuffled 80/20 split, stratified on the label column, with a fixed seed.
# NOTE(review): the duplicated rows counted earlier are not removed before
# splitting — confirm that identical rows landing in both parts is acceptable.
train, dev = train_test_split(
    curse,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=curse["label"],
)

# Persist both parts without the index column.
for split_frame, csv_path in (
    (train, "data/curse_train.csv"),
    (dev, "data/curse_dev.csv"),
):
    split_frame.to_csv(csv_path, index=False)

train

## 다른 데이터셋 이용 방법

In [None]:
# Clone the three dataset repositories into the current working directory.
# NOTE(review): the cells that follow fetch their files over HTTP rather than
# reading from these clones — confirm the clones are still needed.
!git clone https://github.com/2runo/Curse-detection-data
!git clone https://github.com/jason9693/APEACH
!git clone https://github.com/kocohub/korean-hate-speech

In [None]:
import os
from IPython.display import display
%matplotlib inline

# Working folders: downloads are stored under "raw", exported CSV files under "data/".
for folder in ("raw", "data/"):
    os.makedirs(folder, exist_ok=True)

In [None]:
from urllib import request
import pandas as pd

In [None]:
def download(url, save_path):
    """Download ``url`` and store the response body at ``save_path``.

    The parent directory of ``save_path`` is created when it does not exist,
    so callers do not have to prepare the target folder themselves.

    Args:
        url: Address of the resource to fetch.
        save_path: Local file path the payload is written to.
    """
    import os  # local import keeps this cell runnable on its own

    # ``None`` is passed through untouched so urlretrieve keeps its own
    # handling of a missing file name.
    parent_dir = os.path.dirname(save_path) if save_path is not None else ""
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    request.urlretrieve(url, save_path)


def download_curse_detection():
    """Fetch the Curse-detection dataset and load it into a DataFrame.

    The raw file is saved as ``raw/curse.txt`` and then parsed as
    ``|``-separated rows with the column names ``text`` and ``label``.

    Returns:
        The parsed pandas DataFrame.
    """
    local_path = "raw/curse.txt"
    download(
        "https://raw.githubusercontent.com/2runo/Curse-detection-data/master/dataset.txt",
        local_path,
    )

    # NOTE(review): on_bad_lines="skip" drops malformed rows without any
    # report — confirm that losing those rows is acceptable.
    frame = pd.read_csv(
        local_path,
        sep="|",
        names=["text", "label"],
        on_bad_lines="skip",
    )
    return frame

# Download and parse the curse-detection data, then show the resulting frame
# as the cell output.
curse = download_curse_detection()
curse

In [None]:
from sklearn.model_selection import train_test_split

# Quick sanity check: number of fully duplicated rows and the distinct labels.
duplicate_rows = curse.duplicated().sum()
display(duplicate_rows)
print(curse["label"].unique())

In [None]:
# Shuffled 80/20 split, stratified on the label column, with a fixed seed.
# NOTE(review): the duplicated rows counted earlier are not removed before
# splitting — confirm that identical rows landing in both parts is acceptable.
train, dev = train_test_split(
    curse,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=curse["label"],
)

# Persist both parts without the index column.
for split_frame, csv_path in (
    (train, "data/curse_train.csv"),
    (dev, "data/curse_dev.csv"),
):
    split_frame.to_csv(csv_path, index=False)

train

In [None]:
def download_korean_hate_speach():
    """Fetch the labeled korean-hate-speech train/dev TSV files and load them.

    Both files are stored under ``raw/korean-hate-speech`` (created when
    missing) and parsed as tab-separated tables.

    Returns:
        A ``(train, dev)`` pair of pandas DataFrames.
    """
    # (remote URL, local path) pairs, listed in return order: train, then dev.
    sources = (
        (
            "https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/labeled/train.tsv",
            "raw/korean-hate-speech/train.tsv",
        ),
        (
            "https://raw.githubusercontent.com/kocohub/korean-hate-speech/master/labeled/dev.tsv",
            "raw/korean-hate-speech/dev.tsv",
        ),
    )

    os.makedirs("raw/korean-hate-speech", exist_ok=True)
    for remote_url, local_path in sources:
        download(remote_url, local_path)

    # Parse only after both downloads have finished.
    train_frame, dev_frame = [
        pd.read_csv(local_path, sep="\t") for _, local_path in sources
    ]
    return train_frame, dev_frame

# Load both korean-hate-speech splits and render each one in the cell output.
train, dev = download_korean_hate_speach()
for khs_frame in (train, dev):
    display(khs_frame)

In [None]:
# Duplicate comments and the distinct values of the two label columns.
print(train["comments"].duplicated().sum())
print("bias values", train["bias"].unique())
print("hate values", train["hate"].unique())

# Per-value frequency tables for each label column.
for column in ("hate", "bias", "contain_gender_bias"):
    display(train[column].value_counts())

In [None]:
from sklearn.preprocessing import LabelEncoder

def prepare_khs(df, bias_le, hate_le):
    """Return a copy of ``df`` with the ``bias`` and ``hate`` columns encoded.

    Args:
        df: DataFrame that has ``bias`` and ``hate`` columns.
        bias_le: Encoder whose ``transform`` is applied to the ``bias`` column.
        hate_le: Encoder whose ``transform`` is applied to the ``hate`` column.

    Returns:
        A new DataFrame; the frame passed in is left unmodified.
    """
    encoded_columns = {
        "bias": bias_le.transform(df["bias"]),
        "hate": hate_le.transform(df["hate"]),
    }
    # assign() works on a copy, so the caller's frame keeps its raw labels.
    return df.assign(**encoded_columns)

# Both encoders are fitted on the training split only and reused for dev.
bias_le = LabelEncoder()
bias_le.fit(train["bias"])
hate_le = LabelEncoder()
hate_le.fit(train["hate"])
print(bias_le.classes_)
print(hate_le.classes_)

# Encode, show and export the training split first ...
train_khs = prepare_khs(df=train, bias_le=bias_le, hate_le=hate_le)
display(train_khs)
train_khs.to_csv("data/khs_train.csv", index=False)

# ... then the dev split, using the same fitted encoders.
dev_khs = prepare_khs(df=dev, bias_le=bias_le, hate_le=hate_le)
display(dev_khs)
dev_khs.to_csv("data/khs_dev.csv", index=False)