<a href="https://colab.research.google.com/github/yeonui-0626/topic-modeling/blob/main/topic_modeling_LDA_TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install konlpy
import pandas as pd
import numpy as np
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

In [2]:
data = pd.read_csv('/content/drive/MyDrive/alba-aptitude.csv', encoding='cp949')
data.head()

Unnamed: 0,alba_name,sub_code,context
0,일반음식점,1010,조리사 및 주방장은 예민한 미각이 필요하며 새로운 음식 메뉴를 개발하기 위해 진취적...
1,레스토랑,1020,"각종 와인의 종류와 맛, 음식과의 궁합에 대해 알고 있어야 하며, 이를 위해 포도의..."
2,패밀리레스토랑,1030,"각종 와인의 종류와 맛, 음식과의 궁합에 대해 알고 있어야 하며, 이를 위해 포도의..."
3,패스트푸드점,1040,"무엇보다 설거지, 청소, 음식재료 다듬기 등의 힘든 일을 잘 견디어낼 수 있는 인내..."
4,치킨ㆍ피자전문점,1050,"무엇보다 설거지, 청소, 음식재료 다듬기 등의 힘든 일을 잘 견디어낼 수 있는 인내..."


In [None]:
data_all=[]

for idx, row in data.iterrows():
  if idx == 9 :
     continue
  data_all.append(row['context'])

data_all

In [5]:
tagger = Komoran()
def get_nouns(text):
    nouns = tagger.nouns(text)
    return [n for n in nouns if len(n) > 1]


# 불용어 지정
stopword=['필요','요구','흥미','성격','능력','사람','때문', '관련', '담당','경우','사항']

In [6]:
# CountVectorizer 생성 
#  ngram_range=(1,1) 로 해서 단어 하나만 뽑히게
vectorizer = CountVectorizer(analyzer='word', tokenizer=get_nouns, min_df=0,
                                    stop_words = stopword, ngram_range=(1,1))

tdm = vectorizer.fit_transform(data_all)

In [7]:
print("(문서 수, 단어 수)")
print(tdm.shape) # (문서 수, 단어 수)

words = vectorizer.get_feature_names()  #단어 목록 저장
count = tdm.sum(axis=0) #열별로 모든 행 합계 -> 단어별로 빈도 구함
print(count.shape)


(문서 수, 단어 수)
(149, 1189)
(1, 1189)


In [8]:
word_count = list(zip(words, count.flat)) #단어와 빈도를 튜플 형식으로 매칭
word_count = sorted(word_count, key=lambda x:x[1], reverse=True) # 내림차순으로 정렬

word_df = pd.DataFrame(word_count, columns=['단어','빈도수'])
# word_df
# word_df.to_csv('words_count.csv')

In [9]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA클래스를 이용해서 피처 벡터화시킨 것을 토픽모델링 시키기
# n_components(토픽개수) 16로 설정 -> 16개의 MBTI와 매칭시키기 위함
lda = LatentDirichletAllocation(n_components=16, random_state=42)
lda.fit(tdm)
# components_속성은 16개의 토픽별(row)로 feature(단어)들의 분포수치(column)를 보여줌
print(lda.components_.shape)
# 행은 토픽, 열은 단어들을 벡터화시킨 feature 들
print(lda.components_)

(16, 1189)
[[0.06250004 0.0625     0.0625     ... 0.0625     0.0625     0.06250001]
 [0.06250003 0.0625     0.0625     ... 0.0625     0.0625     0.0625    ]
 [0.0625     0.0625     0.0625     ... 0.0625     0.0625     2.0625    ]
 ...
 [0.0625     0.0625     0.0625     ... 0.0625     0.0625     0.0625    ]
 [0.06250006 0.0625     0.0625     ... 0.0625     0.0625     0.0625    ]
 [1.08142141 0.0625     0.0625     ... 0.0625     1.0625     3.06249999]]


In [11]:
feature_df = pd.DataFrame(index=range(0,16), columns=['words'])

def display_topic_words(lda_model, feature_names, num_top_words):
  for topic_idx, topic in enumerate(lda_model.components_):
    # print("Topic #", topic_idx+1)

    # topic 별로 모든 단어들 중에서 높은 값 순으로 정렬 후 index를 반환해줌!
    # argsort()는 디폴트가 오름차순임 그래서 [::-1]로 내림차순으로 바꿔줌
    topic_word_idx = topic.argsort()[::-1]
    top_idx = topic_word_idx[:num_top_words]
    feature_concat = ' '.join([str(feature_names[i])+' '+str(round(topic[i], 1))+'\n' for i in top_idx])
    feature_list = ''
    feature_list += ', '.join([str(feature_names[i]) for i in top_idx])
    feature_df.loc[topic_idx,'words'] = feature_list
    

# get_feature_names : 벡터화 시킨 feature(단어)들을 볼 수 있음
feature_names = vectorizer.get_feature_names()
display_topic_words(lda, feature_names, 30)
print(feature_df)

                                                words
0   고객, 지식, 상품, 적극, 서비스, 제품, 관리, 영업, 사회, 예술, 대처, 정...
1   음식, 와인, 체력, 예술, 미각, 정신, 고객, 끈기, 인내심, 서비스, 배려, ...
2   배려, 환자, 판단력, 사회, 분석, 상황, 대처, 의식, 치료, 순발력, 수행, ...
3   작업, 편집, 분석, 책임감, 감각, 신뢰, 판단력, 예술, 지식, 탐구, 사고, ...
4   대인, 관계, 창의력, 실무, 사고, 해결, 기술자, 워드, 문서, 사무, 매체, ...
5   고객, 사회, 주문, 배려, 체력, 관습, 정신, 고객 서비스, 적응력, 장시간, ...
6   예술, 감각, 지식, 장비, 기술, 사회, 리더십, 탐구, 소리, 원활, 제작, 대...
7   작업, 제작, 체력, 재주, 장비, 신체, 건강, 사회, 가구, 업무, 이해, 수행...
8   상품, 분석, 물건, 건강, 논리, 작성, 회계, 지식, 체력, 컴퓨터, 관습, 사...
9   이해, 연구, 탐구, 기술, 관찰력, 과학, 인내심, 사고, 지식, 적용, 공학, ...
10  분석, 사고, 문제, 지식, 논리, 해결, 탐구, 개발, 끈기, 기술, 시스템, 컴...
11  작업, 감각, 인내심, 지식, 보석, 예술, 기계, 활용, 집중력, 컴퓨터, 끈기,...
12  고객, 서비스, 정신, 사회, 현실, 파악, 배려, 관습, 의사소통, 계산, 정직,...
13  학생, 지식, 기술, 전기, 이론, 학습, 지도력, 적극, 교육, 지도, 교수, 유...
14  분석, 시장, 자료, 사고, 소비자, 혁신, 업무, 지식, 대인, 관계, 배려, 의...
15  사회, 지식, 신속, 배려, 상황, 대처, 돌발, 통제, 신체, 정직, 교통, 현실...


In [None]:
feature_df.to_csv('words_of_16topic_count.csv')

In [13]:
# 문서별 토픽들의 분포를 알아보자
# transform 까지 수행하면, 문서별(row)로 토픽들(column)의 분포를 알려줌
doc_topic = lda.transform(tdm)
perplexity = lda.perplexity(tdm, [doc_topic])
print(perplexity)
print(doc_topic.shape)
print(doc_topic[:2])

216.15939775219684
(149, 16)
[[1.02459084e-03 6.44638093e-01 1.02459056e-03 1.02459040e-03
  1.02459030e-03 1.02459122e-03 1.02459044e-03 1.02459036e-03
  1.02459049e-03 1.02459034e-03 1.02459045e-03 1.02459033e-03
  3.41017640e-01 1.02459033e-03 1.02459033e-03 1.02459075e-03]
 [5.16529435e-04 7.81218966e-01 5.16529067e-04 5.16529059e-04
  5.16528975e-04 2.11549627e-01 5.16529175e-04 5.16529060e-04
  5.16529049e-04 5.16529122e-04 5.16529087e-04 5.16529070e-04
  5.16529148e-04 5.16529047e-04 5.16529006e-04 5.16529150e-04]]


In [None]:
job_lst = []
for idx, row in data.iterrows():
    if idx == 9: continue
    job_lst.append(row['alba_name'])

In [None]:
topic_names = ['Topic #'+str(i+1) for i in range(0,16)]
topic_df = pd.DataFrame(data=doc_topic,columns=topic_names, index=job_lst)
print(topic_df.head(20))

               Topic #1  Topic #2  Topic #3  ...  Topic #14  Topic #15  Topic #16
일반음식점          0.001025  0.644638  0.001025  ...   0.001025   0.001025   0.001025
레스토랑           0.000517  0.781219  0.000517  ...   0.000517   0.000517   0.000517
패밀리레스토랑        0.000631  0.739092  0.000631  ...   0.000631   0.000631   0.000631
패스트푸드점         0.000947  0.000947  0.000947  ...   0.000947   0.000947   0.000947
치킨ㆍ피자전문점       0.000947  0.000947  0.000947  ...   0.000947   0.000947   0.000947
커피전문점          0.000893  0.000893  0.000893  ...   0.000893   0.000893   0.000893
아이스크림ㆍ디저트      0.000893  0.000893  0.000893  ...   0.000893   0.000893   0.000893
베이커리ㆍ도넛ㆍ떡      0.001524  0.001524  0.001524  ...   0.001524   0.001524   0.001524
호프ㆍ일반주점        0.000772  0.093922  0.000772  ...   0.000772   0.000772   0.000772
급식ㆍ푸드시스템       0.000906  0.599862  0.000906  ...   0.000906   0.000906   0.000906
도시락ㆍ반찬         0.001563  0.976562  0.001563  ...   0.001563   0.001563   0.001563
백화점ㆍ면세점        0

In [None]:
# topic_df.columns=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
# topic_df = pd.DataFrame(topic_df,columns=[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,'topic'])
# topic_df
for idx, row in topic_df.iterrows():
  max=0
  col=0
  for n in range(0,16):
    if max < row[n]:
      max = row[n]
      col=n
  row['topic'] = col
topic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,topic
일반음식점,0.001025,0.644638,0.001025,0.001025,0.001025,0.001025,0.001025,0.001025,0.001025,0.001025,0.001025,0.001025,0.341018,0.001025,0.001025,0.001025,1.0
레스토랑,0.000517,0.781219,0.000517,0.000517,0.000517,0.211550,0.000517,0.000517,0.000517,0.000517,0.000517,0.000517,0.000517,0.000517,0.000517,0.000517,1.0
패밀리레스토랑,0.000631,0.739092,0.000631,0.000631,0.000631,0.252070,0.000631,0.000631,0.000631,0.000631,0.000631,0.000631,0.000631,0.000631,0.000631,0.000631,1.0
패스트푸드점,0.000947,0.000947,0.000947,0.000947,0.000947,0.985795,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,5.0
치킨ㆍ피자전문점,0.000947,0.000947,0.000947,0.000947,0.000947,0.985795,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,0.000947,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
원무ㆍ코디네이터,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.001179,0.982311,0.001179,14.0
외래보조ㆍ병동보조,0.001078,0.001078,0.001078,0.001078,0.001078,0.001078,0.001078,0.001078,0.001078,0.001078,0.983836,0.001078,0.001078,0.001078,0.001078,0.001078,10.0
수의테크니션ㆍ동물보건사,0.002976,0.002976,0.002976,0.002976,0.002976,0.002976,0.002976,0.002976,0.002976,0.377284,0.002976,0.002976,0.002976,0.002976,0.002976,0.581049,15.0
실험ㆍ연구보조,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.000134,0.997984,0.000134,0.000134,0.000134,0.000134,0.000134,10.0


In [None]:
topic_df.to_csv("topic-alba_count.csv")