**9주차 실습 - 토픽 모델링**

# 1. 20 뉴스그룹 데이터 준비

- 데이터셋 설명: http://qwone.com/~jason/20Newsgroups/
- scikit-learn 문서 (20 newsgroups 데이터셋): https://scikit-learn.org/stable/datasets/real_world.html

In [4]:
import numpy as np
import pandas as pd

from sklearn.datasets import fetch_20newsgroups

# Restrict the corpus to six of the twenty newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'comp.sys.ibm.pc.hardware',
    'sci.crypt',
]

# Load the 20 Newsgroups dataset for those categories, asking scikit-learn to
# remove headers, footers and quoted replies from every post.
newsgroups = fetch_20newsgroups(
    categories=categories,
    shuffle=True,
    random_state=42,
    remove=('headers', 'footers', 'quotes'),
)

In [5]:
# Category names present in the loaded dataset.
newsgroups.target_names

['alt.atheism',
 'comp.graphics',
 'comp.sys.ibm.pc.hardware',
 'sci.crypt',
 'sci.space',
 'talk.religion.misc']

In [6]:
# Number of documents loaded.
len(newsgroups.data)

3219

# 2. TF-IDF 행렬 변환

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF vectorizer: English stop words are dropped, terms appearing
# in more than half of the documents are ignored, and the vocabulary is capped
# at 1000 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    max_features=1000,
)

# Fit the vocabulary and convert the documents into a TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(newsgroups.data)

In [11]:
# Shape of the TF-IDF matrix: (documents, vocabulary terms).
tfidf_matrix.shape

(3219, 1000)

In [12]:
# TF-IDF row of the first document (a 1 x vocabulary-size sparse row).
tfidf_matrix[0]

<1x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 22 stored elements in Compressed Sparse Row format>

# 3. LSA 실행

In [22]:
from sklearn.decomposition import TruncatedSVD

# Number of topics to extract.
num_topics = 6

# Vocabulary terms, read from the fitted vectorizer.
terms = vectorizer.get_feature_names_out()

# Run LSA: truncated SVD of the TF-IDF matrix down to `num_topics` components.
lsa = TruncatedSVD(
    n_components=num_topics,
    random_state=42,
)
lsa_matrix = lsa.fit_transform(tfidf_matrix)

In [23]:
# Shape of the LSA output: (documents, topics).
lsa_matrix.shape

(3219, 6)

In [25]:
# LSA component scores of the first document.
lsa_matrix[0]

array([ 0.16590928, -0.09624115, -0.06893864, -0.041046  ,  0.00102686,
       -0.06826118])

In [27]:
# Print the top-weighted terms of every topic
def print_top_terms(lsa, terms, n_words=10):
    """Print the `n_words` highest-weighted terms of each topic, one line per topic.

    Args:
        lsa: Fitted decomposition model exposing `components_`, iterated one
            row per topic. Despite the parameter name, the notebook also
            passes its LDA model here.
        terms: Sequence of terms indexed by position within a `components_` row.
        n_words: Number of terms to print per topic. Must be non-negative;
            0 prints each topic label with no terms.

    Raises:
        ValueError: If `n_words` is negative.
    """
    if n_words < 0:
        raise ValueError(f"n_words must be non-negative, got {n_words}")
    for i, topic in enumerate(lsa.components_):
        # Sort descending first, then truncate. Slicing with `[-n_words:]`
        # would return the whole row when n_words == 0, because -0 == 0.
        top_terms_idx = topic.argsort()[::-1][:n_words]
        # Map the indices back to the actual terms.
        top_terms = [terms[idx] for idx in top_terms_idx]
        print(f"토픽 {i + 1}: {' '.join(top_terms)}")

# Show the ten highest-weighted terms of each LSA topic.
print('# LSA 결과:')
print_top_terms(lsa, terms, 10)

# LSA 결과:
토픽 1: just don people know like think does use key god
토픽 2: drive thanks card scsi controller ide bus graphics windows pc
토픽 3: key chip encryption clipper keys government escrow algorithm security public
토픽 4: space nasa shuttle launch orbit moon program earth station lunar
토픽 5: drive scsi space ide controller bus drives disk hard think
토픽 6: god space key drive jesus edu scsi nasa ide data


# 4. LDA 실행

In [33]:
from sklearn.decomposition import LatentDirichletAllocation

# Run LDA
# NOTE(review): LDA is formulated over term counts, and scikit-learn's own
# topic-extraction example feeds it CountVectorizer output rather than TF-IDF
# weights — confirm that tfidf_matrix is the intended input here.
lda = LatentDirichletAllocation(
    n_components=num_topics,  # number of topics
    max_iter=5,  # maximum number of passes over the training data
    topic_word_prior=0.1,  # prior on each topic's word distribution (smaller -> each topic concentrates on fewer words)
    doc_topic_prior=1.0,  # prior on each document's topic distribution (smaller -> each document concentrates on fewer topics)
    learning_method='online',  # online (mini-batch) variational Bayes updates
    n_jobs=-1,  # use all available CPU cores
    random_state=0
)

# Fit the model and get the per-document topic weights.
lda_matrix = lda.fit_transform(tfidf_matrix)



In [34]:
# Shape of the LDA output: (documents, topics).
lda_matrix.shape

(3219, 6)

In [35]:
# LDA topic weights of the first document.
lda_matrix[0]

array([0.11358461, 0.39515489, 0.1045321 , 0.12187066, 0.12917787,
       0.13567986])

In [36]:
# Show the ten highest-weighted terms of each LDA topic.
print('# LDA 결과:')
print_top_terms(lda, terms, 10)

# LDA 결과:
토픽 1: key drive chip encryption scsi keys public controller clipper hard
토픽 2: god think people don say agree believe objective religion moral
토픽 3: thanks graphics image card pc bus windows program hi files
토픽 4: just know people don point going quite good hear got
토픽 5: space nasa people earth don like nsa ll sun know
토픽 6: space like long actually does interested edu write just maybe


# 5. 토픽 할당

In [37]:
# Assign each document to a topic from an LSA or LDA document-topic matrix
def assign_topics(topic_matrix, num_topics=None):
    """Return the 1-based index of the highest-scoring topic for each document.

    Args:
        topic_matrix: 2-D array-like with one row per document and one column
            per topic.
        num_topics: Unused; the topic count is implied by the number of
            columns. Kept (now optional) so existing two-argument calls
            keep working.

    Returns:
        1-D integer array; entry i is 1 + the column index of the largest
        value in row i (the first such column on ties).
    """
    # +1 so labels match the 1-based "토픽 N" numbering used by print_top_terms.
    return np.argmax(topic_matrix, axis=1) + 1

# Store each document's assigned LSA / LDA topic on the dataset object.
newsgroups['lsa_topic'] = assign_topics(lsa_matrix, num_topics)
newsgroups['lda_topic'] = assign_topics(lda_matrix, num_topics)

# Preview the first ten documents next to their assigned topics.
df = pd.DataFrame(
    {
        '문서': newsgroups.data[:10],
        'LSA 토픽': newsgroups['lsa_topic'][:10],
        'LDA 토픽': newsgroups['lda_topic'][:10],
    }
)
df

Unnamed: 0,문서,LSA 토픽,LDA 토픽
0,\nPardon me? *I* am trying to apply human te...,1,2
1,\n\nPaul-- for the same reason that many other...,1,4
2,I have a Gateway 4DX-33V with my 3.5 inch flop...,1,1
3,\n\nperhaps you can quote just a bit of her ar...,1,2
4,"\nJim, please, that's a lame explanation of th...",6,2
5,\nYou seem to be assuming that all arrests are...,1,2
6,\n\nWe have plenty of computer labs where the ...,1,3
7,Are there any TIFF to anything programs out th...,1,3
8,\n\nThis isn't true. Many people are forced t...,1,4
9,"\n\nI don't think so at first, but solid stat...",1,5
