# 2023-2 언어데이터과학 24강 (2023-11-29) 실습 (3) Word2Vec을 사용한 연령별 단어 분포 비교

## 코퍼스 준비

In [1]:
import pandas as pd
from gensim.models import Word2Vec

In [2]:
# Location of the utterance data (form / age / sex columns), relative to the notebook.
DATA_PATH = (
    '../data/nikl/'
    'NIKL_OM_form_age_sex.csv.tar.gz'
)

In [3]:
# Judging by its '.csv.tar.gz' name, the data file is a gzip-compressed *tar
# archive*, not a bare gzipped CSV. Reading it with compression='gzip' hands the
# raw tar stream to the CSV parser, so the tar member header ends up glued onto
# the CSV header row. compression='tar' lets pandas open the archive and parse
# the CSV member itself (the archive must contain exactly one data file).
utterances = pd.read_csv(DATA_PATH, compression='tar', on_bad_lines='skip')

# Discard every row that has a missing value in any column.
utterances = utterances.dropna()

# Kept from the original cell as a safeguard: whatever the first column is
# called after parsing, the rest of the notebook addresses it as 'form'.
utterances = utterances.rename(columns={utterances.columns[0]: 'form'})
utterances

Unnamed: 0,form,age,sex
0,안녕하세요,20대,여성
1,️,20대,여성
2,이거 해봐요><,20대,여성
3,"나의 직장인 멘탈 성향은 [안챙겨도 잘커요, 탕비실 선인장] 당신의 멘탈 성향은 ...",20대,여성
4,아앗...,20대,여성
...,...,...,...
2977835,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,20대,여성
2977836,아잌ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ,20대,여성
2977837,가즈아,20대,여성
2977838,ㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋㅋ가즈아!,20대,여성


## 코퍼스 가공

In [4]:
# Whitespace-tokenise each utterance: every string in 'form' becomes a list of tokens.
utterances['form'] = utterances['form'].map(str.split)
# Show the first five tokenised utterances as a sanity check.
print(utterances['form'].iloc[:5])

0                                              [안녕하세요]
1                                                  [️]
2                                          [이거, 해봐요><]
3    [나의, 직장인, 멘탈, 성향은, [안챙겨도, 잘커요,, 탕비실, 선인장], 당신의...
4                                              [아앗...]
Name: form, dtype: object


## 코퍼스 분할

발화를 발화자의 연령에 따라 분할하기

In [5]:
# Number of utterances per speaker age group, most frequent group first.
utterances.loc[:, 'age'].value_counts()

age
20대       1593040
30대       1024097
40대 이상     251095
10대        109608
Name: count, dtype: int64

In [6]:
# Split the 'form' column into one Series per speaker age group.
corpus20 = utterances.loc[utterances['age'] == '20대', 'form']
corpus30 = utterances.loc[utterances['age'] == '30대', 'form']
corpus40 = utterances.loc[utterances['age'] == '40대 이상', 'form']
# Display the 40+ corpus as the cell output.
corpus40

10920                       [왜?]
10922                        [왜]
10923                    [티브시청중]
10925                        [왜]
10926                       [허준]
                   ...          
2927346    [name4언니, 벌써, 시작인가봐?]
2927347           [문정은, 추워ㆍ시베리아]
2927348                      [응]
2927349          [오는동안이, 너무, 더워]
2927350       [여러분..모두, 수고했어요!!]
Name: form, Length: 251095, dtype: object

## 코퍼스별 Word2Vec 모델 훈련

In [7]:
# Word2Vec hyperparameters used for every age-group model in this notebook.
d, L, k = (
    100,  # dimensionality of the word vectors
    2,    # context window size
    5,    # negative samples drawn per positive sample
)

In [8]:
# Train a skip-gram model (sg=1) on the 20s corpus; words occurring fewer than
# five times are left out of the vocabulary (min_count=5).
model20 = Word2Vec(
    sentences=corpus20,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model20.save('../models/word2vec-modu-online-age20s')
# To skip retraining, load the previously saved model instead:
# model20 = Word2Vec.load('../models/word2vec-modu-online-age20s')

In [9]:
# Train a skip-gram model (sg=1) on the 30s corpus; words occurring fewer than
# five times are left out of the vocabulary (min_count=5).
model30 = Word2Vec(
    sentences=corpus30,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model30.save('../models/word2vec-modu-online-age30s')
# To skip retraining, load the previously saved model instead:
# model30 = Word2Vec.load('../models/word2vec-modu-online-age30s')

In [10]:
# Train a skip-gram model (sg=1) on the 40+ corpus; words occurring fewer than
# five times are left out of the vocabulary (min_count=5).
model40 = Word2Vec(
    sentences=corpus40,
    vector_size=d,
    window=L,
    negative=k,
    min_count=5,
    sg=1,
)
model40.save('../models/word2vec-modu-online-age40s')
# To skip retraining, load the previously saved model instead:
# model40 = Word2Vec.load('../models/word2vec-modu-online-age40s')

## '아주'와 '완전'의 연령별 분포 비교

In [11]:
# The two words whose neighbourhoods are compared across age groups.
w1, w2 = '아주', '완전'

In [12]:
# Nearest neighbours of w1 in the 20s model, as (word, similarity) pairs.
model20.wv.most_similar(positive=[w1])

[('굉장히', 0.7945592999458313),
 ('매우', 0.7942516803741455),
 ('시원하고', 0.7359247803688049),
 ('훨', 0.7347111105918884),
 ('넘나', 0.7331368923187256),
 ('편하고', 0.7301294207572937),
 ('너무너무', 0.7142390012741089),
 ('너뮤', 0.7124197483062744),
 ('나름', 0.7055812478065491),
 ('좋고', 0.7050605416297913)]

In [13]:
def get_most_similar_words(model, word, topn=30):
    """Return the words most similar to `word`, without their scores.

    Args:
        model: Object exposing `wv.most_similar`, e.g. a trained Word2Vec model.
        word: The query word.
        topn: How many neighbours to request from the model.

    Returns:
        A list of neighbour words, in the order the model returned them.
    """
    scored_neighbours = model.wv.most_similar([word], topn=topn)
    # Each entry is a (word, similarity) pair; keep only the word.
    return [neighbour for neighbour, _score in scored_neighbours]

In [14]:
# Neighbours of w1 ('아주'): one printed line per age-group model (20s, 30s, 40+).
for age_model in (model20, model30, model40):
    print(get_most_similar_words(age_model, w1))

['굉장히', '매우', '시원하고', '훨', '넘나', '편하고', '너무너무', '너뮤', '나름', '좋고', '되게', '보기', '무척', '은근', '훨씬', '참', '여러모로', '상당히', '예쁘고', '시원해서', '귀엽고', '완전', '가격도', '워낙', '짱', '선선하니', '꽤나', '살짝', '의외로', '특유의']
['매우', '넘나', '보기', '훨씬', '나름', '좀더', '훨', '너무너무', '생각보다', '은근', '되게', '시원하고', '편하고', '굉장히', '무척', '확실히', '겨울이', '먹기', '살짝', '참', '시원해서', '은근히', '운이', '가격도', '무지', '완전', '워낙', '몸도', '들어서', '봄']
['몰라요', 'ㅎㅎㅎㅎ', '맞아요..', '역시', '다행이네요', '특히', '요샌', '아무래도', '좋죠', '그것도', '저희도', '멋지네요', '세상에', '둘다', '맛있어요', '그럼요', '신기하네요', '생각보다', '저만', '맛나요', '저두요', '애들이', '그런게', '그렇네요', '고생이', '모르겠어요', '은근', '오호', '헐..', '기분이']


In [15]:
# Neighbours of w2 ('완전'): one printed line per age-group model (20s, 30s, 40+).
for age_model in (model20, model30, model40):
    print(get_most_similar_words(age_model, w2))

['짱', '넘나', '겁나', '너무너무', '넘', '아주', '왕', '대박', '되게', '디게', '너뮤', '은근', '악', '엄청', '매우', '너무', '진쨔', '굉장히', '왤캐', 'ㅈㄴ', '연기', '으악', '생각만해도', '!!!', '겁내', '헐헐', '왤케', '!!!!', '졸라', '진짜진짜']
['너무너무', '대박', '되게', '넘나', '겁나', '짱', '크', '최고', '굉장히', '은근', '참', '매우', '진짜', '정말', 'ㅜㅜㅜ', '생각만해도', '아주', '와우', '넘', '증말', 'ㅋㅋㅋㅋㅋ아', '무지', 'ㅠㅠㅠㅠ', '!!', '우왕', 'ㅠㅠㅠㅠㅠ', '세상', 'ㅋㅋㅋ진짜', '특히', ',,']
['저도요', '대박', '그것도', 'ㅋㅋㅋㅋㅋ', '저런', '그쵸', '역시', 'ㅎㅎㅎ', '좋네요', '그렇죠', '헐', '웃겨', '그러게요', '아하', '그런가요', '좋네', '다들', '진짜요?', '그렇구나', '앗', '뭐야', '헉', 'ㅎㅎㅎㅎ', '웅', 'ㅋㅋㅋㅋㅋㅋ', '그건', '그러시군요', '벌써', '좋죠', '그래요?']


In [16]:
# Words closest to w1 and w2 taken together, in the 20s model.
# Uses the w1/w2 variables rather than repeating the literals, so the query
# stays in sync if the words under comparison are changed.
model20.wv.most_similar(positive=[w1, w2])

[('짱', 0.804502010345459),
 ('넘나', 0.7926981449127197),
 ('매우', 0.7735913991928101),
 ('너무너무', 0.7728712558746338),
 ('굉장히', 0.7716690301895142),
 ('되게', 0.750883936882019),
 ('겁나', 0.7481162548065186),
 ('너뮤', 0.7456628084182739),
 ('은근', 0.7372304201126099),
 ('넘', 0.7294827103614807)]

In [17]:
# Words closest to w1 and w2 taken together, in the 30s model.
# Uses the w1/w2 variables rather than repeating the literals, so the query
# stays in sync if the words under comparison are changed.
model30.wv.most_similar(positive=[w1, w2])

[('매우', 0.8865143656730652),
 ('넘나', 0.8818663358688354),
 ('너무너무', 0.8674823641777039),
 ('되게', 0.8609088659286499),
 ('은근', 0.8397954106330872),
 ('굉장히', 0.8263726830482483),
 ('참', 0.8198205828666687),
 ('겁나', 0.8191911578178406),
 ('보기', 0.8155452609062195),
 ('나름', 0.8118875622749329)]

In [18]:
# Words closest to w1 and w2 taken together, in the 40+ model.
# Uses the w1/w2 variables rather than repeating the literals, so the query
# stays in sync if the words under comparison are changed.
model40.wv.most_similar(positive=[w1, w2])

[('그것도', 0.9728145003318787),
 ('역시', 0.9700822830200195),
 ('ㅎㅎㅎㅎ', 0.9677584171295166),
 ('그쵸', 0.9676728844642639),
 ('저도요', 0.9660922884941101),
 ('좋죠', 0.9605632424354553),
 ('그런가요', 0.9603557586669922),
 ('그러게요', 0.9594154357910156),
 ('뭐야', 0.9593291878700256),
 ('맞아요..', 0.9583175778388977)]

In [19]:
# Neighbours of '너무' in the 40+ model.
print(get_most_similar_words(model=model40, word='너무'))

['넘', '엄청', '정말', '참', '진짜', '날씨가', '생각보다', '젤', '가을', '가을이', '요새', '아주', '반갑습니다^^', '완전', '겁나', '여기도', '요즘', '날씨', '요즘은', 'ㅠ', '좋아서', '점점', '그나마', '꽤', '기분이', '좋고', '그런가', '선선해지면서', '오네요', '눈이']
