# Latent Semantic Analysis(LSA)

    DTM의 잠재 의미를 이끌어내는 방법.
    선형대수의 특이값 분해(SVD) 를 이용한다.

### Singular Value Decomposition(SVD)
    알아야 할 것.
    SVD 란 A 가 m x n 행렬일 때, 3개의 행렬의 곱으로 분해하는 것.
    
      
   $$ A = U \sum V^T $$
       
    U = m x m 직교행렬(orthogonal)
    V = n x n 직교행렬(orthogonal)
    ∑ = m x n 직사각 대각행렬(diagonal)

    
### Truncated SVD
    
    토픽의 수를 반영한 t 값.
    t 열 까지만 남겨놓는다.

In [9]:
import pandas as pd
import numpy as np
V=np.array([[0,0,0,1,0,1,1,0,0],[0,0,0,1,1,0,1,0,0],[0,1,1,0,2,0,0,0,0],[1,0,0,0,0,0,0,1,1]])
R=np.array(['문서1','문서2','문서3','문서3'])
C=np.array(['과일이','길고','노란','먹고','바나나','사과','싶은','저는','좋아요'])

pd.DataFrame(V,R,C) # DTM 

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0,0,0,1,0,1,1,0,0
문서2,0,0,0,1,1,0,1,0,0
문서3,0,1,1,0,2,0,0,0,0
문서3,1,0,0,0,0,0,0,1,1


#### Full SVD 


In [10]:
# 변수명을 ∑ 대신 s 사용
U, s, VT = np.linalg.svd(A,full_matrices=True)

In [12]:
print(U.round(2))
print(U.shape)

[[-0.24  0.75  0.   -0.62]
 [-0.51  0.44 -0.    0.74]
 [-0.83 -0.49 -0.   -0.27]
 [-0.   -0.    1.    0.  ]]
(4, 4)


In [13]:
print(s.round(2))
print(s.shape)

[2.69 2.05 1.73 0.77]
(4,)


In [14]:
# 특이값 리스트 -> 대각 행렬

S = np.zeros((4,9))
S[:4,:4] = np.diag(s)
print(S.round(2))
print(S.shape)

[[2.69 0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.   2.05 0.   0.   0.   0.   0.   0.   0.  ]
 [0.   0.   1.73 0.   0.   0.   0.   0.   0.  ]
 [0.   0.   0.   0.77 0.   0.   0.   0.   0.  ]]
(4, 9)


In [15]:
print(VT.round(2))
print(VT.shape)

[[-0.   -0.31 -0.31 -0.28 -0.8  -0.09 -0.28 -0.   -0.  ]
 [ 0.   -0.24 -0.24  0.58 -0.26  0.37  0.58 -0.   -0.  ]
 [ 0.58 -0.    0.    0.   -0.    0.   -0.    0.58  0.58]
 [ 0.   -0.35 -0.35  0.16  0.25 -0.8   0.16 -0.   -0.  ]
 [-0.   -0.78 -0.01 -0.2   0.4   0.4  -0.2   0.    0.  ]
 [-0.29  0.31 -0.78 -0.24  0.23  0.23  0.01  0.14  0.14]
 [-0.29 -0.1   0.26 -0.59 -0.08 -0.08  0.66  0.14  0.14]
 [-0.5  -0.06  0.15  0.24 -0.05 -0.05 -0.19  0.75 -0.25]
 [-0.5  -0.06  0.15  0.24 -0.05 -0.05 -0.19 -0.25  0.75]]
(9, 9)


#### Truncated SVD
    t = 2

In [16]:
S = S[:2,:2]
print(S.round(2))

[[2.69 0.  ]
 [0.   2.05]]


In [17]:
U = U[:,:2]
print(U.round(2))

[[-0.24  0.75]
 [-0.51  0.44]
 [-0.83 -0.49]
 [-0.   -0.  ]]


In [18]:
VT = VT[:2,:]
print(VT.round(2))

[[-0.   -0.31 -0.31 -0.28 -0.8  -0.09 -0.28 -0.   -0.  ]
 [ 0.   -0.24 -0.24  0.58 -0.26  0.37  0.58 -0.   -0.  ]]


### Practice

    Twenty Newsgroups 데이터 활용
    각 토픽당 가장 중요한 단어 5개 출력하기

In [22]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

dataset = fetch_20newsgroups(shuffle=True,random_state=1,remove=('headers','footers','quotes'))
documents = dataset.data
len(documents)

11314

#### 텍스트 전처리

In [23]:
news_df = pd.DataFrame({'document':documents})
# 특수 문자 제거
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# 길이가 3이하인 단어는 제거 (길이가 짧은 단어 제거)
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# 전체 단어에 대한 소문자 변환
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")


In [25]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy your logic runs steam sorry pity sorry that have these feelings denial about faith need well just pretend that will happily ever after anyway maybe start newsgroup atheist hard bummin much forget your flintstone chewables bake timmons'

In [26]:
# 불용어 제거
from nltk.corpus import stopwords

stopWords = stopwords.words('english')


tokenized_doc = news_df['clean_doc'].apply(lambda x : x.split())
tokenized_doc = tokenized_doc.apply(lambda x : [item for item in x if item not in stopWords])


In [27]:
print(tokenized_doc[1])

['yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'sorry', 'pity', 'sorry', 'feelings', 'denial', 'faith', 'need', 'well', 'pretend', 'happily', 'ever', 'anyway', 'maybe', 'start', 'newsgroup', 'atheist', 'hard', 'bummin', 'much', 'forget', 'flintstone', 'chewables', 'bake', 'timmons']


In [28]:
# TF-IDF 행렬 만들기
# TfidfVectorizer 사용하기 위해 역토큰화 작업

detokenized_doc = []
for i in range(len(news_df)):
    t = " ".join(tokenized_doc[i])
    detokenized_doc.append(t)
    
news_df['clean_doc'] = detokenized_doc

In [29]:
news_df['clean_doc'][1]

'yeah expect people read actually accept hard atheism need little leap faith jimmy logic runs steam sorry pity sorry feelings denial faith need well pretend happily ever anyway maybe start newsgroup atheist hard bummin much forget flintstone chewables bake timmons'

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
# 1,000단어로 제한
vectorizer = TfidfVectorizer(stop_words='english',
                             max_features=1000,
                             max_df= 0.5,
                             smooth_idf=True)
X = vectorizer.fit_transform(news_df['clean_doc'])
X.shape

(11314, 1000)

In [31]:
from sklearn.decomposition import TruncatedSVD
# n_components 가 토픽 수
SVD = TruncatedSVD(n_components=20,algorithm='randomized',n_iter=100,random_state=122)
SVD.fit(X)
len(SVD.components_) # LSA 에서 VT

20

In [33]:
# 20개의 행의 각 1000개 열 중 가장 값이 큰 5개 출력.
terms = vectorizer.get_feature_names() # 단어 집합

def getTopics(components,feature_names,n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1),[(feature_names[i],topic[i].round(5)) for i in topic.argsort()[:-n-1:-1]])
getTopics(SVD.components_,terms)

Topic 1: [('like', 0.21386), ('know', 0.20046), ('people', 0.19293), ('think', 0.17805), ('good', 0.15128)]
Topic 2: [('thanks', 0.32888), ('windows', 0.29088), ('card', 0.18069), ('drive', 0.17455), ('mail', 0.15111)]
Topic 3: [('game', 0.37064), ('team', 0.32443), ('year', 0.28154), ('games', 0.2537), ('season', 0.18419)]
Topic 4: [('drive', 0.53324), ('scsi', 0.20165), ('hard', 0.15628), ('disk', 0.15578), ('card', 0.13994)]
Topic 5: [('windows', 0.40399), ('file', 0.25436), ('window', 0.18044), ('files', 0.16078), ('program', 0.13894)]
Topic 6: [('chip', 0.16114), ('government', 0.16009), ('mail', 0.15625), ('space', 0.1507), ('information', 0.13562)]
Topic 7: [('like', 0.67086), ('bike', 0.14236), ('chip', 0.11169), ('know', 0.11139), ('sounds', 0.10371)]
Topic 8: [('card', 0.46633), ('video', 0.22137), ('sale', 0.21266), ('monitor', 0.15463), ('offer', 0.14643)]
Topic 9: [('know', 0.46047), ('card', 0.33605), ('chip', 0.17558), ('government', 0.1522), ('video', 0.14356)]
Topic 10

## LSA 의 장단점
    장 : 
        쉽고 빠르게 구현이 가능하다.
        문서 유사도 계산 등에서 좋은 성능을 보인다.
    단 : 
        LSA에 새로운 데이터를 추가하여 계산하려면 처음부터 다시 해야댐.
        -> 새로운 정보 업데이트 어려움.
       => LSA 대신 Word2Vec 등 인공 신경망 기반의 방법론 각광받음.