In [2]:
import pandas as pd
from gensim.models.word2vec import Word2Vec
import warnings
warnings.filterwarnings('ignore')

In [3]:
import re
from konlpy.tag import Mecab
from tqdm import tqdm

### CSV to DataFrame

In [4]:
df = pd.read_csv("/Users/suchan/study/파이널 프로젝트/contents_Watcha.csv")

### 전처리 함수 및 실행

#### 함수

In [30]:
def clean_text(text):
    text = text.replace(".", "").strip()
    text = text.replace("·", " ").strip()
    pattern = '[^ ㄱ-ㅣ가-힣|0-9]+'
    text = re.sub(pattern, '', text)
    return text

def get_nouns(tokenizer, sentence):
    tagged = tokenizer.pos(sentence)
    stop_words = ['드라마','정도','사람','생각','의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다', '무척', '부탁', '인간']
    nouns = [word for word, tag in tagged if tag in ['NNG', 'NNP'] and len(word) >1 and word not in stop_words]
    return nouns

def tokenize(df):
    tokenizer = Mecab()
    processed_data = []
    for sent in tqdm(df['reviews']):
        sentence = clean_text(str(sent).replace("\n", "").strip())
        processed_data.append(get_nouns(tokenizer, sentence))
    return processed_data

#### 실행 및 DataFrame에 추가

In [31]:
preprocessed_data = tokenize(df)

100%|██████████| 91249/91249 [00:09<00:00, 10128.74it/s]


In [32]:
df['preprocessed_reviews'] = preprocessed_data

In [33]:
df['preprocessed_reviews']

0                      [캐스팅, 라인업, 노희경, 극본, 몰입, 차승원]
1                             [침묵, 순간, 노희경, 극본, 경지]
2                            [세상, 노희경, 작가, 배우, 스토리]
3                                  [배우, 한수, 위험, 고통]
4                                     [특별, 저마다, 사정]
                            ...                    
91244                                         [주인공]
91245    [신박, 이야기, 시작, 안정, 신인, 작가, 박수, 승연, 보결, 재발견]
91246        [진짜, 세상, 대리, 감정, 승연, 영화, 아일랜드, 투명, 눈길]
91247                                      [승연, 연기]
91248                         [소재, 중간, 도달, 스토리, 예상]
Name: preprocessed_reviews, Length: 91249, dtype: object

### Word2Vec

In [41]:
model = Word2Vec(sentences = preprocessed_data, vector_size = 200, window = 3, min_count = 5, workers = -1, sg = 0)

In [42]:
model.wv.vectors.shape

(10909, 200)

In [43]:
print(model.wv.most_similar("이야기"))

[('해자', 0.24927730858325958), ('첫눈', 0.2415984570980072), ('사필귀정', 0.24116162955760956), ('화풀이', 0.2329683005809784), ('신원호', 0.23137734830379486), ('이때', 0.22117801010608673), ('연녀', 0.22106102108955383), ('평화', 0.21906854212284088), ('박동훈', 0.21629111468791962), ('다인', 0.2136673480272293)]
