# 잠재 의미 분석(Latent Semantic Analysis, LSA)
---

- LSA는 토픽 모델링에 아이디어를 제공한 알고리즘.
- DTM의 잠재적 의미를 이끌어내는 방법.
- 쉽고 빠르게 구현 가능하지만 새로운 정보에 대한 업데이트가 어려움
- 선형대수학의 특이값 분해(SVD)가 필요.

# 특이값 분해(Singular Value Decomposition, SVD)
---

- A가 m x n 행렬일 때, 3개 행렬의 곱으로 분해하는 것.
$$ A = U\Sigma V^T $$
> 각 3개 행렬은 다음을 만족한다.
  - U : m x m 직교 행렬 
  $$ AA^T = U( \Sigma \Sigma^T)U^T $$
  - V : n x n 직교 행렬
  $$ A^TA = V(\Sigma^T \Sigma)V^T $$
  - $\Sigma$ (코드에서는 `S`) : m x n 직사각 대각 행렬 (주대각선에 A의 특이값이 놓인다)
  - 직교 행렬(orthogonal matrix) : 자신과 자신의 전치 행렬을 곱한 결과, 그리고 곱하는 순서를 바꾼 결과가 모두 단위행렬이 되는 행렬 ($UU^T = U^TU = I$)
  - 대각 행렬(diagonal matrix) : 주대각선을 제외한 곳의 모든 원소가 0인 행렬
- LSA에 사용할 때에는 절단된 SVD(truncated SVD)가 사용된다.
  - U 행렬과 V 행렬은 t열까지만, $\Sigma$는 상위 t개의 특이값만 남기게 된다. -> t는 찾고자 하는 토픽의 수(hyperparameter)
  - 계산 비용이 낮아질 뿐만 아니라, 중요하지 않은 정보를 삭제하는 효과를 가진다.

# LSA implementation
---

- DTM이나 TF-IDF행렬에 truncated SVD를 사용해서 차원을 축소하고, 단어들의 잠재적인 의미를 이끌어낸다.

In [0]:
import numpy as np

In [0]:
# Reuse the document-term matrix (DTM) from the earlier example.
# The literal has 13 rows and 44 columns; each entry is a count
# (presumably rows are documents and columns are vocabulary terms —
# confirm against the notebook that built this matrix).
dtm = np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
                [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1]])

In [6]:
# Display the dimensions of the document-term matrix as (rows, columns).
dtm.shape

(13, 44)

In [7]:
# Full SVD: factor dtm into U, the singular values s, and V^T.
# full_matrices=True requests the square (untruncated) U and V^T factors.
U, s, VT = np.linalg.svd(dtm, full_matrices=True)
print(np.round(U, 2))
U.shape  # U is a 13 x 13 orthogonal matrix

[[-0.29 -0.49  0.35 -0.08  0.15 -0.24  0.36 -0.08  0.09  0.22  0.19  0.49
  -0.03]
 [-0.09 -0.13  0.19  0.22  0.06  0.06  0.39  0.26 -0.44  0.09 -0.62 -0.24
  -0.15]
 [-0.44 -0.08 -0.24 -0.52  0.45  0.34 -0.19  0.24  0.09  0.18 -0.1  -0.11
  -0.09]
 [-0.27  0.28  0.1   0.18  0.12  0.12 -0.21 -0.18 -0.38 -0.23  0.16  0.3
  -0.62]
 [-0.25 -0.34  0.34 -0.01  0.07 -0.04 -0.18 -0.3  -0.17 -0.33  0.23 -0.6
   0.17]
 [-0.15 -0.22  0.12  0.56 -0.06  0.55 -0.14  0.32  0.34 -0.15  0.05  0.13
   0.13]
 [-0.31 -0.13 -0.21  0.29 -0.21 -0.14 -0.44 -0.4  -0.03  0.49 -0.31  0.05
   0.09]
 [-0.28  0.23 -0.16  0.12  0.3  -0.25 -0.02  0.13 -0.26 -0.34 -0.1   0.29
   0.62]
 [-0.34 -0.16 -0.49 -0.14 -0.54  0.21  0.4  -0.13 -0.13 -0.25  0.1   0.03
   0.01]
 [-0.26  0.   -0.31  0.32  0.08 -0.55  0.08  0.33  0.3  -0.05  0.17 -0.3
  -0.31]
 [-0.19  0.34  0.16  0.07 -0.17  0.11  0.1   0.26 -0.31  0.53  0.51 -0.16
   0.21]
 [-0.29  0.51  0.21  0.06  0.11  0.12  0.38 -0.41  0.46  0.01 -0.17 -0.14
   0.05]
 [-0.28

(13, 13)

In [9]:
# np.linalg.svd returns the singular values as a 1-D array, so rebuild the
# rectangular diagonal matrix Sigma before multiplying the factors back together.
print(s.round(2))
print(np.shape(s))
# Size Sigma from the data instead of hard-coding 13 x 44, so this cell keeps
# working if dtm is replaced by a matrix of a different shape.
S = np.zeros(dtm.shape)
S[:s.size, :s.size] = np.diag(s)  # place the singular values on the main diagonal
print(S.round(2))
print(np.shape(S))

[7.12 3.88 3.45 2.86 2.38 2.22 2.15 1.97 1.69 1.61 1.46 1.23 1.  ]
(13,)
[[7.12 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   3.88 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   3.45 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   2.86 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   2.38 0.   0. 

In [10]:
# Show V^T rounded to two decimals, then display its dimensions.
print(np.round(VT, 2))
VT.shape

[[-0.05 -0.11 -0.04 ... -0.04 -0.04 -0.04]
 [-0.16 -0.3  -0.13 ...  0.04  0.04  0.04]
 [ 0.16  0.29  0.1  ...  0.12  0.12  0.12]
 ...
 [ 0.1   0.01 -0.05 ...  0.84 -0.16 -0.16]
 [ 0.1   0.01 -0.05 ... -0.16  0.84 -0.16]
 [ 0.1   0.01 -0.05 ... -0.16 -0.16  0.84]]


(44, 44)

In [11]:
# Check that the product of the three SVD factors reproduces the original dtm.
# The product is compared unrounded: np.allclose already applies a
# floating-point tolerance, and rounding to 2 decimals first would hide any
# reconstruction error smaller than 0.005.
np.allclose(dtm, np.dot(np.dot(U, S), VT))

True

In [12]:
# Truncated SVD
# Keep only the top n_topics singular values and the matching columns of U
# and rows of V^T; everything else is discarded.
# NOTE: this overwrites the full U, S and VT with their truncated versions.
n_topics = 2  # t: number of topics to keep (hyperparameter)
S = S[:n_topics, :n_topics]
print(S.round(2))
U = U[:, :n_topics]
print(U.round(2))
VT = VT[:n_topics, :]
print(VT.round(2))

[[7.12 0.  ]
 [0.   3.88]]
[[-0.29 -0.49]
 [-0.09 -0.13]
 [-0.44 -0.08]
 [-0.27  0.28]
 [-0.25 -0.34]
 [-0.15 -0.22]
 [-0.31 -0.13]
 [-0.28  0.23]
 [-0.34 -0.16]
 [-0.26  0.  ]
 [-0.19  0.34]
 [-0.29  0.51]
 [-0.28  0.16]]
[[-0.05 -0.11 -0.04 -0.25 -0.08 -0.04 -0.14 -0.18 -0.27 -0.42 -0.48 -0.01
  -0.29 -0.06 -0.33 -0.06 -0.06 -0.11 -0.23 -0.14 -0.11 -0.14 -0.14  0.
  -0.02 -0.02 -0.02 -0.04 -0.04 -0.08 -0.05 -0.05 -0.05 -0.05 -0.04 -0.04
  -0.07 -0.03 -0.04 -0.04 -0.04 -0.04 -0.04 -0.04]
 [-0.16 -0.3  -0.13 -0.28 -0.21 -0.13 -0.23 -0.19 -0.26 -0.   -0.   -0.03
   0.5  -0.02  0.26 -0.02 -0.02 -0.06 -0.03  0.11 -0.06 -0.1   0.33  0.
  -0.06 -0.06 -0.06 -0.03 -0.03  0.06 -0.04 -0.04 -0.04 -0.04  0.    0.
   0.13  0.09  0.13  0.13  0.13  0.04  0.04  0.04]]


In [14]:
# Multiply the factors back together (U, S and VT are expected to be the
# truncated versions at this point) to get the approximation of dtm.
dtm_prime = U.dot(S).dot(VT)
print(np.round(dtm_prime, 2))
np.shape(dtm_prime)

[[ 0.41  0.8   0.32  1.04  0.56  0.32  0.73  0.73  1.05  0.87  1.    0.09
  -0.36  0.17  0.2   0.17  0.17  0.34  0.54  0.08  0.34  0.48 -0.33  0.
   0.15  0.15  0.15  0.15  0.15  0.04  0.17  0.17  0.17  0.17  0.08  0.08
  -0.11 -0.11 -0.17 -0.17 -0.17  0.    0.    0.  ]
 [ 0.11  0.22  0.09  0.3   0.16  0.09  0.21  0.21  0.31  0.28  0.32  0.03
  -0.06  0.05  0.09  0.05  0.05  0.1   0.17  0.04  0.1   0.14 -0.07  0.
   0.04  0.04  0.04  0.04  0.04  0.02  0.05  0.05  0.05  0.05  0.02  0.02
  -0.02 -0.03 -0.04 -0.04 -0.04  0.    0.    0.  ]
 [ 0.22  0.44  0.17  0.87  0.31  0.17  0.51  0.62  0.92  1.33  1.52  0.05
   0.74  0.2   0.97  0.2   0.2   0.36  0.73  0.4   0.36  0.47  0.35  0.
   0.08  0.08  0.08  0.15  0.15  0.22  0.16  0.16  0.16  0.16  0.12  0.12
   0.16  0.06  0.09  0.09  0.09  0.11  0.11  0.11]
 [-0.07 -0.12 -0.06  0.19 -0.08 -0.06  0.01  0.13  0.23  0.82  0.94 -0.01
   1.11  0.1   0.93  0.1   0.1   0.15  0.41  0.39  0.15  0.16  0.65  0.
  -0.02 -0.02 -0.02  0.05  0.05  0.21  0.

(13, 44)