<a href="https://colab.research.google.com/github/yeonui-0626/topic-modeling/blob/main/LDA_based_Recommendation(mbit_alba).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install konlpy
import pandas as pd
import numpy as np
from pprint import pprint
from konlpy.tag import Komoran
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

#### 0. 데이터 로드 및 전처리

* 사용 데이터      
data_load : 아르바이트 직종별 적성 및 흥미 크롤링 데이터       
mbti_data : 성격 단어들을 MBTI별로 TF-IDF 벡터화한 데이터


In [3]:
data_load = pd.read_csv('/content/drive/MyDrive/alba-aptitude.csv', encoding='cp949')
mbti_data = pd.read_csv('/content/drive/MyDrive/mbti-word-weight_1108.csv', index_col=0)
mbti_data.head()

# The row with index label 9 is left out of every list built below.
# NOTE(review): the reason this row is skipped is not visible here — confirm.
EXCLUDED_ROW = 9
job_rows = data_load[data_load.index != EXCLUDED_ROW]

# Job names (alba_name) and job-category codes (sub_code), in file order.
alba_name = job_rows['alba_name'].tolist()
sub_code = job_rows['sub_code'].tolist()
print(alba_name)
print(sub_code)

# 0-1. Aptitude/interest text of each job, in the same order as alba_name.
data = job_rows['context'].tolist()

data[:5]

# 0-2. Create the morphological analyzer
tagger = Komoran()


def get_nouns(text):
    """Return the nouns the Komoran tagger finds in *text*, keeping only
    those that are at least two characters long."""
    return [noun for noun in tagger.nouns(text) if len(noun) >= 2]

# 0-3. Stop words
# Words that describe a personality or a job are kept; only filler words that
# recur without carrying meaning are removed
# (e.g. "... ability is needed", "... is required").
stopword = [
    '필요', '요구', '흥미', '성격', '능력', '사람',
    '태도', '특성', '적성', '때문', '관련', '담당',
    '경우', '사항', '불편', '이외', '계속', '절대',
    '서서', '특징', '기타', '자질', '종사자', '감과',
]

['일반음식점', '레스토랑', '패밀리레스토랑', '패스트푸드점', '치킨ㆍ피자전문점', '커피전문점', '아이스크림ㆍ디저트', '베이커리ㆍ도넛ㆍ떡', '호프ㆍ일반주점', '급식ㆍ푸드시스템', '도시락ㆍ반찬', '백화점ㆍ면세점', '복합쇼핑몰ㆍ아울렛', '쇼핑몰ㆍ소셜커머스ㆍ홈쇼핑', '유통점ㆍ마트', '편의점', '의류ㆍ잡화매장', '뷰티ㆍ헬스스토어', '휴대폰ㆍ전자기기매장', '가구ㆍ침구ㆍ생활소품', '서점ㆍ문구ㆍ팬시', '약국', '농수산ㆍ청과ㆍ축산', '화훼ㆍ꽃집', '유통ㆍ판매?기타', '놀이공원ㆍ테마파크', '호텔ㆍ리조트ㆍ숙박', '여행ㆍ캠프ㆍ레포츠', '영화ㆍ공연', '전시ㆍ컨벤션ㆍ세미나', '스터디룸ㆍ독서실ㆍ고시원', 'PC방', '노래방', '볼링ㆍ당구장', '스크린?골프ㆍ야구', 'DVDㆍ멀티방ㆍ만화카페', '오락실ㆍ게임장', '이색테마카페', '키즈카페', '찜질방ㆍ사우나ㆍ스파', '피트니스ㆍ스포츠', '공인중개', '골프캐디', '고속도로휴게소', '문화ㆍ여가ㆍ생활?기타', '매장관리ㆍ판매', 'MD', '캐셔ㆍ카운터', '서빙', '주방장ㆍ조리사', '주방보조ㆍ설거지', '바리스타', '안내데스크', '주차관리ㆍ주차도우미', '보안ㆍ경비ㆍ경호', '주유ㆍ세차', '전단지배포', '청소ㆍ미화', '렌탈관리ㆍA/S', '헤어ㆍ미용ㆍ네일샵', '피부관리ㆍ마사지', '반려동물케어', '베이비시터ㆍ가사도우미', '결혼ㆍ연회ㆍ장례도우미', '판촉도우미', '이벤트ㆍ행사스텝', '나레이터모델', '피팅모델', '서비스?기타', '사무보조', '문서작성ㆍ자료조사', '비서', '경리ㆍ회계보조', '인사ㆍ총무', '마케팅ㆍ광고ㆍ홍보', '번역ㆍ통역', '복사ㆍ출력ㆍ제본', '편집ㆍ교정ㆍ교열', '공공기관ㆍ공기업ㆍ협회', '학교ㆍ도서관ㆍ교육기관', '고객상담ㆍ인바운드', '텔레마케팅ㆍ아웃바운드', '금융ㆍ보험영업', '일반영업ㆍ판매', '설문조사ㆍ리서치', '영업관리ㆍ지원', '제조ㆍ가공ㆍ조립', '포장ㆍ품질검사', '입출고ㆍ창고관리', '상하차ㆍ소화물?

#### 1. 단어 벡터화     
* TF-IDF Vectorizer 사용    

 총 알바 직종 : 149개    
 총 단어 수 :  1176개

In [4]:
# Build the TF-IDF vectorizer.
# ngram_range=(1, 1): single words only.
# min_df=1: keep every term that appears in at least one document, so no term
# is dropped. (scikit-learn's parameter validation only accepts an integer
# min_df >= 1; the previous value 0 selected the same vocabulary.)
vectorizer = TfidfVectorizer(analyzer='word', tokenizer=get_nouns, min_df=1,
                             stop_words=stopword, ngram_range=(1, 1))

tdm = vectorizer.fit_transform(data)

print(tdm.shape)  # (number of documents, number of terms)

# Vocabulary: the terms behind the columns of tdm
words = vectorizer.get_feature_names_out()

# Total weight per term: column sums of tdm over all documents.
# These are summed TF-IDF weights, not raw occurrence counts.
count = tdm.sum(axis=0)
word_count = list(zip(words, count.flat))  # (term, total weight) pairs
word_count = sorted(word_count, key=lambda x: x[1], reverse=True)  # heaviest first


(149, 1176)


#### 2. LDA 토픽 모델링 모듈 생성

1. LDA 모델 생성
2. 토픽별 단어 가중치 행렬 생성 ( components )
3. 아르바이트별 토픽 가중치 분포 행렬 생성 ( job_topic )
4. mbti별 토픽 가중치 분포 행렬 생성 ( mbti_topic )
5. mbti별 아르바이트 직종 유사도 계산

In [5]:
# 1. Build the LDA model
def LDA(topics, iter, batch, tdm):
    """Fit an LDA topic model on the vectorized documents and return it.

    topics : number of topics (passed as n_components)
    iter   : number of training iterations (passed as max_iter)
    batch  : passed as batch_size
    tdm    : document-term matrix the model is fitted on

    random_state is fixed to 42.
    """
    # NOTE(review): scikit-learn documents batch_size as "only used in online
    # learning" and learning_method is left at its default here — confirm that
    # `batch` actually influences the fit.
    model = LatentDirichletAllocation(
        n_components=topics,
        max_iter=iter,
        batch_size=batch,
        random_state=42,
    )
    return model.fit(tdm)


# 2. Topic-word weight matrix (topic-word-weight)
# components_ : per-topic word weights of the fitted model


# 2-1. Top words of every topic (descending weight)
# num_top_words : number of words to keep per topic
def display_topic_words(lda_model, words, num_top_words):
    """Collect the highest-weighted words of every topic.

    lda_model     : object exposing ``components_`` (one weight row per topic)
    words         : vocabulary, indexable by the column positions of components_
    num_top_words : how many words to keep per topic

    Returns a DataFrame with one row per row of ``components_`` (index
    0..n-1) and a single 'words' column holding the selected words joined
    by ', ', heaviest first.
    """
    rows = []
    for topic in lda_model.components_:
        # argsort is ascending, so reverse it to get the heaviest words first.
        top_idx = topic.argsort()[::-1][:num_top_words]
        rows.append(', '.join(str(words[i]) for i in top_idx))

    # The row count comes from the model itself rather than a global setting,
    # so the frame always matches the model that was passed in.
    return pd.DataFrame({'words': pd.Series(rows, dtype=object)})


# 3. Per-job topic weight distribution (job_topic)
def job_topic_weight(lda_model, tdm, topic_names):
    """Transform the documents with the model and label the result.

    Returns a pair: the raw output of ``lda_model.transform(tdm)`` and the
    same values wrapped in a DataFrame whose columns are ``topic_names``.
    """
    weights = lda_model.transform(tdm)
    labelled = pd.DataFrame(weights, columns=topic_names)
    return weights, labelled


# 3-1. Job-topic matching (job grouping: every job is assigned the topic with
# the highest weight in its distribution)
def job_top_topic(df, names=None):
    """Assign each job the topic with the largest weight in its row.

    df    : DataFrame of topic weights, one row per job and one column per
            topic; columns are taken in positional order.
    names : job names for the 'alba_name' column. Defaults to the
            module-level ``alba_name`` list.

    Returns a DataFrame with columns 'alba_name' and 'top_topic'. 'top_topic'
    holds the column position of the row maximum as a float; ties go to the
    first column, and a row with no weight above 0 gets topic 0. Rows of
    ``df`` are matched to names by index label.
    """
    if names is None:
        names = alba_name
    top_topic = pd.DataFrame(list(names), columns=['alba_name'])

    weights = df.to_numpy(dtype=float)
    if weights.shape[1] == 0:
        best = np.zeros(len(df), dtype=float)
    else:
        # NaN never wins a comparison, so treat it as the lowest possible value.
        comparable = np.where(np.isnan(weights), -np.inf, weights)
        best = np.where(comparable.max(axis=1) > 0,
                        comparable.argmax(axis=1), 0).astype(float)

    top_topic['top_topic'] = pd.Series(best, index=df.index)
    return top_topic


# 3-2. Print the jobs of every topic
def job_by_topic(top_topic, n_topics=None):
    """Group job names by their assigned topic, print and return the groups.

    top_topic : DataFrame with columns 'alba_name' and 'top_topic'
    n_topics  : number of groups to build. Defaults to the module-level
                ``topics`` setting.

    Returns a list of ``n_topics`` lists; entry i holds the names whose
    'top_topic' equals i, in row order. One line ``i group`` is printed per
    topic, including empty groups.
    """
    if n_topics is None:
        n_topics = topics

    job_groups = [[] for _ in range(n_topics)]
    for name, topic in zip(top_topic['alba_name'], top_topic['top_topic']):
        # Skips NaN and anything outside the known topic range.
        if not (0 <= topic < n_topics):
            continue
        slot = int(topic)
        if slot == topic:
            job_groups[slot].append(name)

    for i, group in enumerate(job_groups):
        print(i, group)
    return job_groups


In [6]:
# 4. Per-MBTI topic weight distribution (mbti_topic)
def mbti_topic_weight(mbti_df, topic_df):
    """Score every (MBTI row, topic row) pair by their shared words.

    mbti_df  : one row per MBTI type, one column per word (word weights)
    topic_df : one row per topic, one column per word (word weights)

    For a pair, only words present as a column in both frames and with a
    weight above 0 in both rows take part. The score is the mean of
    ``topic weight * mbti weight`` over those words, or 0 when there are none.

    Returns a float DataFrame indexed by ``mbti_df.index`` with
    ``topic_df.index`` as columns.
    Column labels are assumed unique in both frames — TODO confirm.
    """
    # Select words by label, not by integer position.
    shared = mbti_df.columns.intersection(topic_df.columns)
    mbti_vals = mbti_df.loc[:, shared].to_numpy(dtype=float)
    topic_vals = topic_df.loc[:, shared].to_numpy(dtype=float)

    mbti_pos = mbti_vals > 0
    topic_pos = topic_vals > 0

    # Zero out non-participating entries (this also removes NaN) so one matrix
    # product yields the sum of products for every pair.
    totals = np.where(mbti_pos, mbti_vals, 0.0) @ np.where(topic_pos, topic_vals, 0.0).T
    # Number of words that contributed to each pair.
    counts = mbti_pos.astype(float) @ topic_pos.astype(float).T

    means = np.divide(totals, counts, out=np.zeros_like(totals), where=counts > 0)
    return pd.DataFrame(means, index=mbti_df.index, columns=topic_df.index)

# 5. Similarity between every MBTI type and every job
def similarity(mbti_topic, job_topic, topic):
    """Rank all jobs for every MBTI type by cosine similarity.

    mbti_topic : DataFrame, one row per MBTI type with ``topic`` weight columns
    job_topic  : DataFrame whose first two columns are job metadata (one of
                 them 'alba_name') followed by ``topic`` weight columns
    topic      : number of topic columns; a mismatch with either frame raises
                 ValueError from the reshape

    Returns one list per MBTI row: ``[mbti_label, [job_name, score], ...]``
    with the pairs sorted by descending score. Jobs with equal scores keep
    their order from ``job_topic``.
    """
    job_names = job_topic['alba_name'].tolist()
    n_jobs = len(job_names)
    n_mbti = len(mbti_topic)

    # Everything after the two leading metadata columns is the topic vector.
    job_vectors = job_topic.iloc[:, 2:].to_numpy(dtype=float).reshape(n_jobs, topic)
    mbti_vectors = mbti_topic.to_numpy(dtype=float).reshape(n_mbti, topic)

    if n_jobs == 0 or n_mbti == 0:
        return [[mbti] for mbti in mbti_topic.index]

    # One call yields the full (MBTI x job) score matrix; the number of jobs
    # comes from the data instead of a fixed count.
    scores = cosine_similarity(mbti_vectors, job_vectors)

    result = []
    for mbti, job_scores in zip(mbti_topic.index, scores):
        pairs = [[name, score] for name, score in zip(job_names, job_scores)]
        pairs = sorted(pairs, key=lambda pair: pair[1], reverse=True)
        result.append([mbti] + pairs)

    return result

#### 3. LDA 토픽 모델링 수행 

In [9]:
iter = 1
batch = 100
topics = 29

print("==========     토픽 수 ", topics,"      =============")
topic_names = list(range(topics))

# 1. Fit the LDA model
lda_model = LDA(topics, iter, batch, tdm)

# 2. Topic-word weight matrix (components): one row per topic, one column per word
components = pd.DataFrame(lda_model.components_, columns=words)

# 2-1. Top 30 words of every topic (descending weight)
word_df = display_topic_words(lda_model, words, 30)
# print(word_df)

# 3. Per-document (per-job) topic weight matrix (job_topic):
#    job name and category code followed by one column per topic
doc_topic, topic_df = job_topic_weight(lda_model, tdm, topic_names)
job_meta = pd.DataFrame({'alba_name': alba_name, 'sub_code': sub_code})
job_topic = pd.concat([job_meta, topic_df], axis=1)
# job_topic.to_csv("job_topic.csv",encoding="utf-8-sig")

# 3-1. Job-topic matching: every job gets the topic with its highest weight
top_topic = job_top_topic(pd.DataFrame(doc_topic))

# 3-2. Print the jobs of every topic
job_by_topic(top_topic)

# 3-3. Perplexity (disabled)
# perplexity = lda_model.perplexity(tdm, [doc_topic])
# print(perplexity)


# 4-0. Min-max scaling of the MBTI word weights and the topic-word weights.
# The same scaler object is refitted for each frame.
scaler = MinMaxScaler()
mbti_scaled = pd.DataFrame(
    scaler.fit_transform(mbti_data),
    index=mbti_data.index,
    columns=mbti_data.columns,
)
components_scaled = pd.DataFrame(
    scaler.fit_transform(components),
    index=components.index,
    columns=components.columns,
)

# 4. Per-MBTI topic weights (mbti-topic)
mbti_topic_scaled = mbti_topic_weight(mbti_scaled, components_scaled)


# 5-0. Min-max scaling of the MBTI-topic weights
mbti_topic_scaled_scaled = pd.DataFrame(
    scaler.fit_transform(mbti_topic_scaled),
    index=mbti_data.index,
    columns=topic_names,
)

0 ['쇼핑몰ㆍ소셜커머스ㆍ홈쇼핑', '영화ㆍ공연', '마케팅ㆍ광고ㆍ홍보']
1 ['호프ㆍ일반주점', '급식ㆍ푸드시스템', '호텔ㆍ리조트ㆍ숙박', '렌탈관리ㆍA/S', '베이비시터ㆍ가사도우미', '기계ㆍ전자ㆍ전기', '제품ㆍ산업디자인']
2 ['경리ㆍ회계보조', '금융ㆍ보험영업', '실험ㆍ연구보조']
3 []
4 ['노래방', '볼링ㆍ당구장', '스크린?골프ㆍ야구', '오락실ㆍ게임장', '이색테마카페', '골프캐디', '보안ㆍ경비ㆍ경호']
5 ['DVDㆍ멀티방ㆍ만화카페', '정비ㆍ수리ㆍ설치ㆍA/S']
6 ['주차관리ㆍ주차도우미', '유아ㆍ유치원', '동영상촬영ㆍ편집', '신문ㆍ잡지ㆍ출판']
7 ['피부관리ㆍ마사지', '사무보조', '복사ㆍ출력ㆍ제본', '방송스텝ㆍ촬영보조', '간호조무사ㆍ간호사', '외래보조ㆍ병동보조']
8 ['결혼ㆍ연회ㆍ장례도우미', '포장ㆍ품질검사', '입시ㆍ보습학원', '외국어ㆍ어학원', '예체능?강사', '자격증ㆍ기술학원', '국비교육기관', '수의테크니션ㆍ동물보건사']
9 ['약국', 'MD', '헤어ㆍ미용ㆍ네일샵', '바이럴ㆍSNS마케팅', '레져스포츠?강사', '패션ㆍ잡화디자인']
10 ['공인중개', '고객상담ㆍ인바운드', '시스템ㆍ네트워크ㆍ보안', 'PCㆍ디지털기기?설치ㆍ관리', '컴퓨터ㆍ정보통신']
11 []
12 ['캐셔ㆍ카운터', '청소ㆍ미화', '이벤트ㆍ행사스텝', '설문조사ㆍ리서치', '웹ㆍ모바일기획', '사이트ㆍ콘텐츠?운영', '사진촬영ㆍ편집']
13 ['문서작성ㆍ자료조사', '생산ㆍ건설ㆍ노무?기타', '간병ㆍ요양보호사']
14 ['농수산ㆍ청과ㆍ축산', '키즈카페', 'QAㆍ테스터ㆍ검증']
15 ['서비스?기타', '공사ㆍ건설현장', 'PVC(닥트ㆍ배관설치)']
16 ['유통점ㆍ마트', '편의점', '유통ㆍ판매?기타', '입출고ㆍ창고관리', '상하차ㆍ소화물?분류', '웹ㆍ모바일디자인', '화물ㆍ중장비ㆍ특수차']
17 ['스터디룸ㆍ독서실ㆍ고시원', '판촉도우미', '인사ㆍ총무', '영업관리ㆍ지원']
18 ['복합쇼핑몰ㆍ아

#### MBTI 별 토픽 가중치 행렬

In [13]:
# Preview the first rows of the scaled MBTI-by-topic weight matrix.
mbti_topic_scaled_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
ENFJ,0.746947,0.183488,0.708507,0.214239,0.530991,0.379509,0.212215,0.569274,0.609707,0.50879,0.918355,0.421509,0.259565,0.325932,0.359174,0.381058,0.404715,0.695485,1.0,0.34307,0.489003,0.545174,0.11487,0.777108,0.788134,0.174581,0.554214,0.555849,0.942093
ENFP,0.325135,0.197949,0.554365,0.687384,0.299255,0.310737,0.45909,0.137206,0.308334,0.471623,0.662363,0.175029,0.24065,0.336982,0.178183,0.421654,0.600702,0.550157,0.660608,0.181302,0.188509,0.360671,0.170973,0.287377,0.367553,0.338797,0.058822,0.033482,0.347267
ENTJ,0.536452,0.270176,0.519878,0.337726,0.193067,0.612565,0.372202,0.599792,0.863618,0.0,0.851232,0.311601,0.774067,0.90587,0.430674,0.698932,0.0,0.567751,0.703655,0.150821,0.404142,0.150444,0.69138,0.834357,0.567054,0.190262,0.0,0.198057,0.62497
ENTP,0.853424,0.136849,0.604833,0.151673,0.081796,0.316348,0.253712,0.051879,0.728786,0.044768,1.0,0.074395,0.766904,0.455709,0.208064,0.509141,0.449121,0.327485,0.530651,0.200963,0.350852,0.226499,0.294935,0.25946,0.571299,0.212475,0.355728,0.040508,0.577127
ESFJ,0.824782,0.457743,0.0,0.347667,0.613095,1.0,0.503858,0.498202,0.484091,0.082362,0.58783,0.806718,0.673338,0.582655,1.0,0.833572,0.398239,1.0,0.301567,0.388158,0.477552,0.306814,0.719959,0.313634,0.251918,0.0,0.257898,0.0,0.758947


#### 아르바이트별 토픽 가중치 행렬

In [16]:
# Preview the first rows of the job-by-topic weight matrix.
job_topic.head()

Unnamed: 0,alba_name,sub_code,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28
0,일반음식점,1010,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.004809,0.86536,0.004809,0.004809
1,레스토랑,1020,0.00409,0.00409,0.00409,0.00409,0.156182,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.00409,0.575697,0.00409,0.00409,0.00409,0.00409,0.00409,0.161773,0.00409,0.00409
2,패밀리레스토랑,1030,0.004487,0.004487,0.004487,0.004487,0.185178,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.693664,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487,0.004487
3,패스트푸드점,1040,0.004669,0.004669,0.004669,0.004669,0.403431,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.470519,0.004669,0.004669
4,치킨ㆍ피자전문점,1050,0.004669,0.004669,0.004669,0.004669,0.403431,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.004669,0.470519,0.004669,0.004669


#### 4. MBTI별 아르바이트 직종 추천

In [17]:
# 5. Rank the jobs for every MBTI type by similarity
rec_job = similarity(mbti_topic_scaled_scaled, job_topic, topics)
# Each result row is [mbti, ranked job 0, ranked job 1, ...]; size the columns
# from the result itself instead of assuming a fixed number of jobs.
n_ranked = len(rec_job[0]) - 1 if rec_job else 0
rec_job_df = pd.DataFrame(data=rec_job, columns=['mbti'] + list(range(n_ranked)))

# Print the rec_num most similar jobs per MBTI type -> the recommendation result
rec_num = 5
for n, mbti in enumerate(rec_job, start=1):
    print(n, ")", mbti[0], "\t")
    # Slicing tolerates fewer than rec_num ranked jobs.
    for job in mbti[1:rec_num + 1]:
        print(job[0], end="  |  ")
    print("\n")

1 ) ENFJ 	
생동성ㆍ임상시험  |  의류ㆍ잡화매장  |  일반영업ㆍ판매  |  오락실ㆍ게임장  |  컴퓨터ㆍ정보통신  |  

2 ) ENFP 	
유통ㆍ판매?기타  |  실험ㆍ연구보조  |  일반영업ㆍ판매  |  의류ㆍ잡화매장  |  오락실ㆍ게임장  |  

3 ) ENTJ 	
문서작성ㆍ자료조사  |  컴퓨터ㆍ정보통신  |  포장ㆍ품질검사  |  미디어 전체  |  학교ㆍ도서관ㆍ교육기관  |  

4 ) ENTP 	
컴퓨터ㆍ정보통신  |  영화ㆍ공연  |  PCㆍ디지털기기?설치ㆍ관리  |  공인중개  |  고객상담ㆍ인바운드  |  

5 ) ESFJ 	
스터디룸ㆍ독서실ㆍ고시원  |  DVDㆍ멀티방ㆍ만화카페  |  정비ㆍ수리ㆍ설치ㆍA/S  |  생동성ㆍ임상시험  |  키즈카페  |  

6 ) ESFP 	
미디어 전체  |  PC방  |  문화ㆍ여가ㆍ생활?기타  |  영화ㆍ공연  |  반려동물케어  |  

7 ) ESTJ 	
학교ㆍ도서관ㆍ교육기관  |  반려동물케어  |  실험ㆍ연구보조  |  유통점ㆍ마트  |  경리ㆍ회계보조  |  

8 ) ESTP 	
베이커리ㆍ도넛ㆍ떡  |  오락실ㆍ게임장  |  PC방  |  문화ㆍ여가ㆍ생활?기타  |  미디어 전체  |  

9 ) INFJ 	
대리운전ㆍ일반운전  |  화물ㆍ중장비ㆍ특수차  |  미디어 전체  |  키즈카페  |  택시ㆍ버스운전  |  

10 ) INFP 	
신문ㆍ잡지ㆍ출판  |  영화ㆍ공연  |  동영상촬영ㆍ편집  |  유아ㆍ유치원  |  대리운전ㆍ일반운전  |  

11 ) INTJ 	
실험ㆍ연구보조  |  컴퓨터ㆍ정보통신  |  PCㆍ디지털기기?설치ㆍ관리  |  정비ㆍ수리ㆍ설치ㆍA/S  |  공인중개  |  

12 ) INTP 	
학교ㆍ도서관ㆍ교육기관  |  반려동물케어  |  서비스?기타  |  PVC(닥트ㆍ배관설치)  |  컴퓨터ㆍ정보통신  |  

13 ) ISFJ 	
DVDㆍ멀티방ㆍ만화카페  |  정비ㆍ수리ㆍ설치ㆍA/S  |  오락실ㆍ게임장  |  간병ㆍ요양보호사  |  간호조무