In [3]:
import pandas as pd
# Load the tab-separated hate-speech file into a DataFrame.
hate_df_1 = pd.read_csv('./data/korean_hate_speech_1.tsv', sep='\t')

In [4]:
# Preview the first five rows to check how the TSV was parsed.
hate_df_1.head(5)

Unnamed: 0,comments,contain_gender_bias,bias,hate
0,(현재 호텔주인 심정) 아18 난 마른하늘에 날벼락맞고 호텔망하게생겼는데 누군 계속...,False,others,hate
1,....한국적인 미인의 대표적인 분...너무나 곱고아름다운모습...그모습뒤의 슬픔을...,False,none,none
2,"...못된 넘들...남의 고통을 즐겼던 넘들..이젠 마땅한 처벌을 받아야지..,그래...",False,none,hate
3,"1,2화 어설펐는데 3,4화 지나서부터는 갈수록 너무 재밌던데",False,none,none
4,1. 사람 얼굴 손톱으로 긁은것은 인격살해이고2. 동영상이 몰카냐? 메걸리안들 생각...,True,gender,hate


In [3]:
# (rows, columns) of the loaded frame.
hate_df_1.shape

(7896, 4)

In [4]:
# Binary target `vicious`: 1 when the comment is flagged as containing gender
# bias OR its `hate` label is anything other than 'none'; 0 otherwise.
#
# The flag column is cast to str before comparing with 'True'. A plain
# `== 'True'` against a column that read_csv parsed as booleans is never true,
# which silently dropped the gender-bias condition. The cast makes the check
# work for both bool and string columns.
has_gender_bias = hate_df_1['contain_gender_bias'].astype(str).eq('True')
is_hateful = hate_df_1['hate'].ne('none')

# Vectorised assignment instead of a per-row .loc loop; also idempotent on re-run.
hate_df_1['vicious'] = (has_gender_bias | is_hateful).astype('int64')

In [5]:
# Count missing values per column.
hate_df_1.isna().sum()

comments               0
contain_gender_bias    0
bias                   0
hate                   0
vicious                0
dtype: int64

In [6]:
# Class balance of the binary `vicious` target.
hate_df_1['vicious'].value_counts()

1    4410
0    3486
Name: vicious, dtype: int64

In [7]:
from sklearn.model_selection import train_test_split
# Split raw comment text (features) and the `vicious` label (target) into
# train/test parts; the seed is fixed so the split is repeatable.
X_train_h1, X_test_h1, y_train_h1, y_test_h1 = train_test_split(
    hate_df_1['comments'],
    hate_df_1['vicious'],
    random_state=42,
)

### Baseline model
Extract TF-IDF features and use a Naive Bayes (NB) model as the baseline.

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# vect = TfidfVectorizer(ngram_range=(1,2))
# clf = MultinomialNB().fit(X_train_h1, y_train_h1)
# Baseline pipeline: TF-IDF features feeding a multinomial Naive Bayes classifier.
text_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# fit() returns the pipeline itself, so base_line_h1 is the same fitted object as text_nb.
text_nb.fit(X_train_h1, y_train_h1)
base_line_h1 = text_nb

In [9]:
from sklearn.metrics import accuracy_score

# Score the Naive Bayes baseline on the held-out split.
y_pred_h1 = text_nb.predict(X_test_h1)
accuracy_score(y_true=y_test_h1, y_pred=y_pred_h1)

0.6681864235055724

#### Random forest model

In [10]:
from sklearn.ensemble import RandomForestClassifier

# TF-IDF features feeding a random forest classifier.
# random_state is pinned (same seed as the train/test split) because the
# forest is randomised; without it the accuracy changes on every run.
text_rfc = Pipeline([('tfidf', TfidfVectorizer()),
                    ('clf', RandomForestClassifier(random_state=42))]
                   )

# fit() returns the fitted pipeline, so rfc_h1 and text_rfc are the same object.
rfc_h1 = text_rfc.fit(X_train_h1, y_train_h1)

In [11]:
# Score the random forest pipeline on the held-out split.
y_pred_rfc_h1 = text_rfc.predict(X_test_h1)
accuracy_score(y_true=y_test_h1, y_pred=y_pred_rfc_h1)

0.605369807497467

#### Multi-layer perceptron classifier

In [12]:
from sklearn.neural_network import MLPClassifier

# MLP with two hidden layers (16 and 2 units), L-BFGS solver, fixed seed.
clf_mlp = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(16, 2),
    random_state=2,
)

# Same TF-IDF front end as the other models, with the MLP as the classifier step.
text_mlp = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', clf_mlp),
])

# fit() returns the pipeline itself, so mlp_h1 is the same fitted object as text_mlp.
text_mlp.fit(X_train_h1, y_train_h1)
mlp_h1 = text_mlp

In [13]:
# Score the MLP pipeline on the held-out split.
y_pred_mlp_h1 = text_mlp.predict(X_test_h1)
accuracy_score(y_true=y_test_h1, y_pred=y_pred_mlp_h1)

0.5688956433637284

#### 

### 0. Tokenize
한국어 토큰화 : https://han-py.tistory.com/283

정규표현식으로 한글만 추출할까 했지만 숫자나 punctuation들도 표현의 일부이기 때문에 남겨둠

In [1]:
import jpype  # author's note: leaving this import out causes an error
from konlpy.tag import Okt

# Okt morphological analyser instance used for tokenisation.
okt = Okt()
# Placeholder for the per-comment token lists built during tokenisation.
token_h1 = []

In [4]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다']
# Set copy for fast membership tests; `stopwords` itself stays a list.
stopword_set = set(stopwords)

# Tokenise every comment with Okt (stem=True) and drop stop words.
# token_h1 is rebuilt from scratch here rather than appended to, so re-running
# this cell does not add a second copy of every comment to the list.
token_h1 = [
    [word for word in okt.morphs(sentence, stem=True) if word not in stopword_set]
    for sentence in hate_df_1['comments']
]

In [5]:
# Inspect the token lists of the first three comments.
token_h1[:3]

[['(',
  '현재',
  '호텔',
  '주인',
  '심정',
  ')',
  '아',
  '18',
  '난',
  '마른하늘',
  '날벼락',
  '맞다',
  '호텔',
  '망하다',
  '생기다',
  '누',
  '군',
  '계속',
  '추모',
  '받다',
  '....'],
 ['....',
  '한국',
  '적',
  '인',
  '미인',
  '대표',
  '적',
  '인',
  '분',
  '...',
  '너무나',
  '곱',
  '고',
  '아름답다',
  '모습',
  '...',
  '그',
  '모습',
  '뒤',
  '슬픔',
  '을',
  '미처',
  '알다',
  'ㅠ'],
 ['...',
  '못',
  '되다',
  '넘다',
  '들다',
  '...',
  '남',
  '고통',
  '을',
  '즐기다',
  '넘다',
  '들다',
  '..',
  '젠',
  '마땅하다',
  '처벌',
  '을',
  '받다',
  '..,',
  '그',
  '래야',
  ',',
  '공정하다',
  '사회',
  '지',
  '...',
  '심다',
  '거두다',
  '...']]

### 1. vectorization (word2vec)
https://wikidocs.net/50739

In [None]:
# import re
# comp_kr = re.compile('[ㄱ-ㅎ]+')
# hate_df_1['vicious_kr'] = comp_kr.findall(hate_df_1['vicious'])

In [None]:
# Stop words removed from tokenised comments (particles and other very common tokens).
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

In [None]:
from gensim.models.word2vec import Word2Vec
from konlpy.tag import Okt

In [8]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer(stop_words='english', max_features=100)
# dtm_hate_1 = tfidf.fit_transform(hate_df_1['comments'])