In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# Prevent floating-point numbers from being displayed in scientific notation
# by setting the `suppress` parameter of numpy's set_printoptions() to True.
import numpy as np
np.set_printoptions(suppress=True)

In [2]:
# Toy corpus of six space-separated documents. The first three use only
# fruit words and the last three only vegetable words, so two clearly
# separated topics are expected.
CORPUS = [
    # fruit documents
    "apple banana apple banana orange",
    "apple orange banana orange",
    "orange apple apple banana apple",
    # vegetable documents
    "carrot spinach eggplant carrot",
    "spinach carrot potato spinach",
    "carrot potato eggplant eggplant",
]

In [6]:
# Build a bag-of-words representation of CORPUS.
# `vec` keeps the fitted vocabulary (inspected later via get_feature_names_out);
# `features` holds the per-document term counts and is densified in the next cell.
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
features = vec.fit_transform(CORPUS)

In [7]:
# Dense document-term count matrix: one row per document, one column per term.
# toarray() yields a plain ndarray directly; todense() would first build an
# np.matrix, a type numpy no longer recommends, only to be converted again.
X = features.toarray()
X

array([[2, 2, 0, 0, 1, 0, 0],
       [1, 1, 0, 0, 2, 0, 0],
       [3, 1, 0, 0, 1, 0, 0],
       [0, 0, 2, 1, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 2],
       [0, 0, 1, 2, 0, 1, 0]], dtype=int64)

In [8]:
# Vocabulary learned by the vectorizer (seven terms, listed alphabetically in the output).
vec.get_feature_names_out()

array(['apple', 'banana', 'carrot', 'eggplant', 'orange', 'potato',
       'spinach'], dtype=object)

In [9]:
# Term-by-term matrix X^T X; its eigenvectors are decomposed below to obtain the topics.
XTX = X.T @ X

In [10]:
# Sanity check: X^T X is square, with one row/column per vocabulary term.
XTX.shape

(7, 7)

In [11]:
# Inspect X^T X: the displayed values are symmetric, and every fruit/vegetable cross entry is 0.
XTX

array([[14,  8,  0,  0,  7,  0,  0],
       [ 8,  6,  0,  0,  5,  0,  0],
       [ 0,  0,  6,  4,  0,  2,  4],
       [ 0,  0,  4,  5,  0,  2,  1],
       [ 7,  5,  0,  0,  6,  0,  0],
       [ 0,  0,  2,  2,  0,  2,  2],
       [ 0,  0,  4,  1,  0,  2,  5]], dtype=int64)

In [12]:
# Eigendecomposition of X^T X: eigVals[i] pairs with the column eigVecs[:, i].
# NOTE(review): np.linalg.eig does not return eigenvalues in sorted order, and each
# eigenvector is only defined up to sign. Since X^T X is symmetric, np.linalg.eigh
# would also work, but it orders the eigenpairs differently, so any positional
# indexing of eigVecs further down would have to be revisited together with it.
eigVals, eigVecs = np.linalg.eig(XTX)

In [13]:
# Inspect the eigenvalues; in the displayed output the two largest sit at positions 0 and 3.
eigVals

array([23.22497216,  2.        ,  0.77502784, 12.74456265,  4.        ,
       -0.        ,  1.25543735])

In [14]:
# Keep the eigenvectors that belong to the two largest eigenvalues (the two topics).
# np.linalg.eig does not guarantee any ordering, so rank the eigenvalues explicitly
# instead of hard-coding column positions read off a previous printout.
eig_order = np.argsort(eigVals)[::-1]  # indices from largest to smallest eigenvalue
eigvec1 = eigVecs[:, eig_order[0]]
eigvec2 = eigVecs[:, eig_order[1]]

In [15]:
# First topic direction: in the displayed output only apple, banana and orange have non-zero weight.
eigvec1

array([-0.75511545, -0.48025006,  0.        ,  0.        , -0.44627407,
        0.        ,  0.        ])

In [16]:
# Second topic direction: in the displayed output only the vegetable terms have non-zero weight.
eigvec2

array([ 0.        ,  0.        , -0.66353532, -0.48352718,  0.        ,
       -0.30351904, -0.48352718])

In [17]:
# Term-count vector of the first document (row 0 of X).
doc1 = X[0]

In [18]:
# Inspect the first document's term counts.
doc1

array([2, 2, 0, 0, 1, 0, 0], dtype=int64)

In [19]:
# Coordinate of the first document along the first topic direction.
# The sign is arbitrary (eigenvectors are defined up to sign); the gensim
# section further down reports the same magnitude with the opposite sign.
doc1.dot(eigvec1)

-2.917005078618542

In [20]:
# Coordinate of the first document along the second topic direction
# (the displayed result is 0.0: a fruit-only document has no vegetable component).
doc1.dot(eigvec2)

0.0

## gensim을 이용한 LSI

In [19]:
# Tokenise every document on whitespace; gensim expects lists of tokens.
docs_words = list(map(str.split, CORPUS))

In [20]:
# Inspect the tokenised documents.
docs_words

[['apple', 'banana', 'apple', 'banana', 'orange'],
 ['apple', 'orange', 'banana', 'orange'],
 ['orange', 'apple', 'apple', 'banana', 'apple'],
 ['carrot', 'spinach', 'eggplant', 'carrot'],
 ['spinach', 'carrot', 'potato', 'spinach'],
 ['carrot', 'potato', 'eggplant', 'eggplant']]

In [53]:
from gensim.corpora import Dictionary

In [54]:
# Build the token -> integer id mapping from the tokenised documents.
dictionary = Dictionary(docs_words)

In [55]:
# Show the token -> id mapping. Per the printed output, the ids do not follow the
# alphabetical column order used by CountVectorizer above.
print(dictionary.token2id)

{'apple': 0, 'banana': 1, 'orange': 2, 'carrot': 3, 'eggplant': 4, 'spinach': 5, 'potato': 6}


In [56]:
# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
DTM = [dictionary.doc2bow(tokens) for tokens in docs_words]

In [57]:
# Inspect the sparse (token_id, count) representation of each document.
DTM

[[(0, 2), (1, 2), (2, 1)],
 [(0, 1), (1, 1), (2, 2)],
 [(0, 3), (1, 1), (2, 1)],
 [(3, 2), (4, 1), (5, 1)],
 [(3, 1), (5, 2), (6, 1)],
 [(3, 1), (4, 2), (6, 1)]]

In [58]:
from gensim.models import LsiModel

In [66]:
# Fit a 2-topic LSI model on the bag-of-words corpus; passing id2word lets the
# model report topics with words rather than integer ids.
# NOTE(review): topic signs are arbitrary -- the topic-0 weights displayed below are
# positive while the numpy eigenvector computed earlier is negative.
model = LsiModel(DTM, num_topics=2, id2word=dictionary) 

In [67]:
# Show every topic as a weighted sum of words.
model.print_topics()

[(0,
  '0.755*"apple" + 0.480*"banana" + 0.446*"orange" + -0.000*"carrot" + -0.000*"spinach" + -0.000*"eggplant" + 0.000*"potato"'),
 (1,
  '-0.664*"carrot" + -0.484*"eggplant" + -0.484*"spinach" + -0.304*"potato" + 0.000*"banana" + -0.000*"apple" + 0.000*"orange"')]

In [68]:
# Show only the top-k highest-weighted words for each topic
k=3
model.print_topics(num_words=k)

[(0, '0.755*"apple" + 0.480*"banana" + 0.446*"orange"'),
 (1, '-0.664*"carrot" + -0.484*"eggplant" + -0.484*"spinach"')]

In [69]:
# Topic-by-term weight matrix (2 x 7 in the displayed output); columns follow the
# gensim dictionary ids, not the alphabetical CountVectorizer order.
model.get_topics()

array([[ 0.75511545,  0.48025006,  0.44627407, -0.        , -0.        ,
        -0.        ,  0.        ],
       [-0.        ,  0.        ,  0.        , -0.66353532, -0.48352718,
        -0.48352718, -0.30351904]])

In [70]:
# Project the first document into topic space as (topic_id, weight) pairs.
# Only topic 0 appears in the displayed output -- presumably the ~0 weight for
# topic 1 is omitted by gensim; verify against the LsiModel documentation.
model[DTM[0]]

[(0, 2.917005078618543)]