## **FastText Embedding Model 생성**
- 변형된 비속어를 학습하기 위해 char 단위로 학습하는 fasttext를 사용
- 좌우 단어에 따라(문맥에 따라) 의미가 달라지는 경우도 뽑기 위함
- 형태소가 적절히 분리되지 않는 경우가 많아 어절 단위로 분리

### [크롤링 데이터 병합]

In [25]:
import os
import pandas as pd
import numpy as np
Youtube_path = "./Youtube_comments.csv"
ilbe_path = "./ilbe_comments.csv"

Comment_df = pd.DataFrame()

df1 = pd.read_csv(Youtube_path, encoding="cp949", header=None, names=[0])
df2 = pd.read_csv(ilbe_path, encoding="cp949", header=None, names=[0])
            
Comment_df = pd.concat([Comment_df, df1])
Comment_df = pd.concat([Comment_df, df2])
        
print(Youtube_path, len(df1), len(Comment_df))
print(ilbe_path, len(df2), len(Comment_df))

# dataset.txt, fword_list.txt 넣기
with open('dataset.txt', 'r', encoding='UTF8') as f:
    dataset_list = []
    i = 0
    while True:
        line = f.readline()
        if line == '' :
            break
        dataset_list.append(line.strip().split('|')[0])
        i += 1
    df3 = pd.Series(dataset_list)
    Comment_df = pd.concat([Comment_df, df3])
    print("dataset.txt", i, len(Comment_df))

with open('fword_list.txt', 'r', encoding='UTF8') as f:
    dataset_list = []
    i = 0
    while True:
        line = f.readline()
        if line == '' :
            break
        dataset_list.append(line)
        i += 1
    df4 = pd.Series(dataset_list)
    Comment_df = pd.concat([Comment_df, df4])
    print("fword_list.txt", i, len(Comment_df))

./Youtube_comments.csv 91281 228250
./ilbe_comments.csv 136969 228250
dataset.txt 5825 234075
fword_list.txt 3577 237652


### [FastText input]

In [26]:
# 중복제거 결과: 237652 -> 219241
Comment_df.drop_duplicates(inplace=True)
print(len(Comment_df))

219241


In [27]:
import re
pattern = re.compile("[^ㄱ-ㅎㅏ-ㅣ가-힣0-9a-zA-Z ]") # 특수문자 제거

# 각 단어에 정규표현식 적용
def clear_word(word):
    word = re.sub(pattern, "", word)
    return word

Comment_df[0] = Comment_df.astype('str')
Comment_df[0] = Comment_df[0].apply(lambda x: clear_word(x))

In [34]:
CHOSUNG_LIST = [u'ㄱ',u'ㄲ',u'ㄴ',u'ㄷ',u'ㄸ',u'ㄹ',u'ㅁ',u'ㅂ',u'ㅃ',u'ㅅ',u'ㅆ',u'ㅇ',u'ㅈ',u'ㅉ',u'ㅊ',u'ㅋ',u'ㅌ',u'ㅍ',u'ㅎ']
JOONGSUNG_LISTS = [u'ㅏ',u'ㅐ',u'ㅑ',u'ㅒ',u'ㅓ',u'ㅔ',u'ㅕ',u'ㅖ',u'ㅗ',u'ㅘ',u'ㅙ',u'ㅚ',u'ㅛ',u'ㅜ',u'ㅝ',u'ㅞ',u'ㅟ',u'ㅠ',u'ㅡ',u'ㅢ',u'ㅣ']
JONGSUNG_LIST = [u'_',u'ㄱ',u'ㄲ',u'ㄳ',u'ㄴ',u'ㄵ',u'ㄶ',u'ㄷ',u'ㄹ',u'ㄺ',u'ㄻ',u'ㄼ',u'ㄽ',u'ㄾ',u'ㄿ',u'ㅀ',u'ㅁ',u'ㅂ',u'ㅄ',u'ㅅ',u'ㅆ',u'ㅇ',u'ㅈ',u'ㅊ',u'ㅋ',u'ㅌ',u'ㅍ',u'ㅎ']

# 자모분리기
def kor_decompose(word, end_char="_"):
    result = []
    
    for char in word:
        char_unicode = ord(char)
        
        if 0xD7A3 < char_unicode or char_unicode < 0xAC00:
            result.append(char)
            continue

        chosung_index = int((((char_unicode - 0xAC00) / 28) / 21) % 19)
        joongsung_index = int(((char_unicode - 0xAC00) / 28) % 21)
        jongsung_index = int((char_unicode - 0xAC00) % 28)
        
        chosung = CHOSUNG_LIST[chosung_index]
        joongsung = JOONGSUNG_LISTS[joongsung_index]
        jongsung = JONGSUNG_LIST[jongsung_index]
        
        # 종성이 없을 경우 end_char
        if jongsung_index == 0:
            jongsung = end_char
        
        result.append(chosung)
        result.append(joongsung)
        result.append(jongsung)

    return "".join(result)

In [32]:
Comment_df.reset_index(inplace=True)
Comment_df.drop('index', axis=1, inplace=True)
Comment_df = Comment_df[1:]

In [36]:
# 자모분리 후 저장
Comment_df[0] = Comment_df[0].apply(lambda x: kor_decompose(x))
Comment_df[0] = Comment_df[0].apply(lambda x: x.split(" "))

In [37]:
Comment_df

Unnamed: 0,0
1,"[ㅁㅣ_ㅇㅕㄴㅇㅣ_, ㄷㅔ_ㅂㅟ_ㅎㅏ_ㅁㅕㄴ, ㅁㅣ_ㅁㅗ_ㄹㅗ_, ㄷㅏ_, ㅆㅣㅂㅇ..."
2,"[ㅋㅠ_ㅂㅡ_ㅇㅑ_, ㅇㅏ_ㅇㅣ_ㄷㅡㄹ, ㅇㅖ_ㄴㅡㅇ, ㅈㅗㅁ, ㅁㅏㄶㅇㅣ_, ㄴㅐ..."
3,"[ㅋㅓㅁㅂㅐㄱㅎㅏㄹ, ㄸㅐ_, ㅁㅏ_ㄷㅏ_, ㅍㅏ_ㅌㅡ_ㅂㅜㄴㅂㅐ_ㄹㅡㄹ, ㅈㅏㄹ,..."
4,"[ㅈㅣㄴㅅㅣㅁ, ㅇㅏ_ㅇㅣ_ㄷㅗㄹ, ㄴㅏ_ㅇㅗ_ㅁㅕㄴ, ㅎㅏㄴㄷㅜ_ㅁㅕㅇㅇㅡㄴ, ㅈ..."
5,"[ㅇㅘ_, ㅈㅗㄴㄴㅏ_, ㅇㅖ_ㅃㅡ_ㄷㅏ_, ㅇㅝㄴㄹㅐ_ㄷㅗ_, ㅇㅖ_ㅃㅓㅆㅈㅣ_ㅁ..."
...,...
219235,"[ㅇ, ㅐㅈㅏ_]"
219236,[ㅆㅣ_ㅂㅜㄹㅇㅏㄹ]
219237,[ㅈㅗㄴㅁㅏ_ㄴㅣ_]
219238,[ㄱㅣ_ㅈㅣㅂㄴㅕㄴ]


In [38]:
# list로 변환
Comment_list = list(Comment_df[0])
len(Comment_list)

219239

In [39]:
# fasttext input
Comment_list[:10]

[['ㅁㅣ_ㅇㅕㄴㅇㅣ_',
  'ㄷㅔ_ㅂㅟ_ㅎㅏ_ㅁㅕㄴ',
  'ㅁㅣ_ㅁㅗ_ㄹㅗ_',
  'ㄷㅏ_',
  'ㅆㅣㅂㅇㅓ_ㅁㅓㄱㅇㅡㄹㅈㅜㄹ',
  'ㅇㅏㄹㅇㅏㅆㄷㅏ_ㄴㅡㄴㄱㅓ_',
  'ㅇㅙ_ㅋㅔ_',
  'ㅇㅜㅅㄱㅕ_ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ'],
 ['ㅋㅠ_ㅂㅡ_ㅇㅑ_',
  'ㅇㅏ_ㅇㅣ_ㄷㅡㄹ',
  'ㅇㅖ_ㄴㅡㅇ',
  'ㅈㅗㅁ',
  'ㅁㅏㄶㅇㅣ_',
  'ㄴㅐ_ㅂㅗ_ㄴㅐ_ㅈㅝ_ㄹㅏ_',
  'ㅇㅗ_ㄴㅡㄹ',
  'ㅇㅏ_ㅎㅕㅇ',
  'ㄴㅓ_ㅁㅜ_',
  'ㅈㅐ_ㅁㅣㅆㅇㅓㅆㄷㅏ_',
  'ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ'],
 ['ㅋㅓㅁㅂㅐㄱㅎㅏㄹ',
  'ㄸㅐ_',
  'ㅁㅏ_ㄷㅏ_',
  'ㅍㅏ_ㅌㅡ_ㅂㅜㄴㅂㅐ_ㄹㅡㄹ',
  'ㅈㅏㄹ',
  'ㅁㅗㅅㅎㅏㄴㄷㅏ_ㄱㅗ_',
  'ㅁㅝ_ㄹㅏ_ㅎㅏ_ㄴㅡㄴㄷㅔ_',
  'ㄱㅡ_ㄹㅜㅂㅇㅔ_',
  'ㅈㅓㄴㅅㅗ_ㅇㅕㄴㅇㅣ_',
  'ㅇㅣㅆㄴㅡㄴ',
  'ㄱㅓㅅㅁㅏㄴㅇㅡ_ㄹㅗ_ㄷㅗ_',
  '',
  'ㅋㅡㄴ',
  'ㅎㅐㅇㅇㅜㄴㅇㅣㅁ',
  'ㄷㅐ_ㅊㅔ_',
  'ㅇㅓ_ㄸㅓㄴ',
  'ㄱㅡ_ㄹㅜㅂㅇㅔ_',
  'ㄹㅣ_ㄷㅓ_ㄱㅏ_',
  'ㅍㅡ_ㄹㅗ_ㄷㅠ_ㅅㅣㅇ',
  'ㅃㅜㄴㅁㅏㄴ',
  'ㅇㅏ_ㄴㅣ_ㄹㅏ_',
  'ㅁㅔㅁㅂㅓ_ㄷㅡㄹ',
  'ㅇㅡㅁㅅㅐㄱ',
  'ㅂㅏㄹㅇㅡㅁㄲㅏ_ㅈㅣ_',
  'ㅅㅣㄴㄱㅕㅇㅆㅓ_ㅅㅓ_',
  'ㄴㅗ_ㄹㅐ_ㄹㅡㄹ',
  'ㅁㅏㄴㄷㅡㄹㅇㅓ_ㅈㅜ_ㄱㅗ_',
  'ㅋㅓㅁㅂㅐㄱㅎㅏㄹㄸㅐ_',
  'ㅁㅏ_ㄷㅏ_',
  'ㅋㅓㄴㅅㅔㅂㅇㅡㄴ',
  'ㅁㅐ_ㅂㅓㄴ',
  'ㄷㅏ_ㄹㅡㄴㄷㅔ_',
  'ㄸㅗ_',
  'ㅁㅔㅁㅂㅓ_ㄷㅡㄹㅇㅣ_ㄹㅏㅇ',
  'ㅇㅓ_ㅇㅜㄹㄹㅣ_ㄱㅔ_',
  'ㅁㅏㄴㄷㅡㄹㅇㅓ_ㅈㅜ_ㄴㅑ_ㄱㅗ_',
  'ㅈㅓㄴㅅㅗ_ㅇㅕㄴㅇㅡㄴ',
  'ㅈㅣㄴㅉㅏ_',
  'ㅊㅚ_ㄱㅗ_ㅇㅢ_',
  'ㄹㅣ_ㄷㅓ_ㄷㅏ_',
  'ㄱㅡ_ㄹㅣ_ㄱㅗ_',
  'ㅈㅓㄴㅅㅗ_ㅇㅕㄴㅇㅢ_',
  'ㅅㅡ_ㅋㅔ_ㅊㅣ_ㄹㅡㄹ',
  'ㅇㅣㅆㄴㅡㄴ',
  'ㄱㅡ_ㄷㅐ_ㄹㅗ_',
  'ㅍㅛ_ㅎㅕㄴㅎㅐ_ㅈㅜ_ㄱㅗ_',
  'ㄷㅟ

### [WordEmbedding]

In [42]:
from gensim.models import FastText

model = FastText(Comment_list, vector_size=50, window=2, min_count=3, workers=4, sg=1, min_n=3, max_n=6, epochs=10)

In [44]:
# 79489
len(model.wv)

79489

In [45]:
model.wv.most_similar(kor_decompose("2022년"))

[('2011ㄴㅕㄴ', 0.9966644048690796),
 ('2014ㄴㅕㄴ', 0.9957617521286011),
 ('2021ㄴㅕㄴ', 0.9943724274635315),
 ('2017ㄴㅕㄴ', 0.9932236671447754),
 ('2022ㄴㅕㄴ', 0.992773711681366),
 ('2012ㄴㅕㄴ', 0.9915316104888916),
 ('2019ㄴㅕㄴ', 0.9905400276184082),
 ('2023ㄴㅕㄴ', 0.9902234077453613),
 ('2009ㄴㅕㄴ', 0.9900348782539368),
 ('2013ㄴㅕㄴ', 0.9894234538078308)]

In [46]:
# 비슷한 욕설이 출력됨
model.wv.most_similar(kor_decompose("개새끼"))

[('ㅁㅜㄴㅅㅐ_ㄲㅣ_', 0.9885829091072083),
 ('ㅅㅂㅅㅐ_ㄲㅣ_', 0.9878286123275757),
 ('ㅉㅏㅇㄱㅐ_ㅅㅐ_ㄲㅣ_', 0.9875786304473877),
 ('ㅈㅟ_ㅅㅐ_ㄲㅣ_', 0.9867092967033386),
 ('ㅂㅅㅅㅐ_ㄲㅣ_', 0.9844736456871033),
 ('ㅄㅅㅐ_ㄲㅣ_', 0.9843753576278687),
 ('ㅆㅣㅂㅅㅐ_ㄲㅣ_', 0.9840155243873596),
 ('ㅇㅐ_ㅅㅐ_ㄲㅣ_', 0.9835335612297058),
 ('ㄲㅏㅇㅍㅐ_ㅅㅐ_ㄲㅣ_', 0.9834245443344116),
 ('ㅇㅑㅇㅋㅟ_ㅅㅐ_ㄲㅣ_', 0.9830461144447327)]

In [47]:
# 욕설이 출력되지 않음
model.wv.most_similar(kor_decompose("18년"))

[('17ㄴㅕㄴ', 0.9978621602058411),
 ('16ㄴㅕㄴ', 0.9973353743553162),
 ('19ㄴㅕㄴ', 0.9970005750656128),
 ('1945ㄴㅕㄴ', 0.9969555735588074),
 ('1965ㄴㅕㄴ', 0.9969369173049927),
 ('36ㄴㅕㄴ', 0.9969033002853394),
 ('08ㄴㅕㄴ', 0.9967043995857239),
 ('21ㄴㅕㄴ', 0.9965476989746094),
 ('09ㄴㅕㄴ', 0.9964456558227539),
 ('15ㄴㅕㄴ', 0.9964216351509094)]

In [48]:
# 비슷한 욕설이 출력됨
model.wv.most_similar(kor_decompose("시발년"))

[('ㅅㅣ_ㅂㅏㄹㄹㅕㄴ', 0.9870353937149048),
 ('ㅆㅣ_ㅂㅏㄹㄴㅕㄴ', 0.9842144846916199),
 ('ㅅㅣ_ㅂㅏㄹㄴㅗㅁ', 0.97516268491745),
 ('ㅅㅣ_ㅂㅏㄹㄹㅗㅁ', 0.9726601839065552),
 ('ㅆㅣ_ㅂㅏㄹㄹㅕㄴ', 0.9715257287025452),
 ('ㄱㅐ_ㅆㅣ_ㅂㅏㄹㄴㅕㄴ', 0.9710908532142639),
 ('ㅅㅣ_ㅂㅏㄹㄹㅓㅁ', 0.9685385823249817),
 ('ㅅㅣ_ㅂㅏㄹㅋㅋ', 0.9665188789367676),
 ('ㄱㅐ_ㅆㅣ_ㅂㅏㄹㄹㅕㄴ', 0.9623039364814758),
 ('ㅆㅣ_ㅂㅓㄹㄴㅕㄴ', 0.9610145092010498)]

In [51]:
# model 저장
model.save("./Fasttext.model")