In [1]:
'''
[NMF 알고리즘을 이용한 유사한 문서 검색과 구현](https://bcho.tistory.com/tag/Matrix%20factorization)
'''

'\n[NMF 알고리즘을 이용한 유사한 문서 검색과 구현](https://bcho.tistory.com/tag/Matrix%20factorization)\n'

In [2]:
# 행렬 인수분해

In [8]:
from sklearn.datasets import fetch_20newsgroups
from io import StringIO
import pandas as pd

newsgroups_train = fetch_20newsgroups(subset='train')

def parseDocument(data):
    buf = StringIO(data)
    line=buf.readline()
    data=[]
    subject=''
    while line:
        if(line.startswith('Subject:')):
            subject = line[8:].strip()
        elif (line.startswith('Lines:')):
               lines = line[6:]
               while line :
                    line = buf.readline()
                    data.append(line)
        line=buf.readline()
    text = ''.join(data)
    
    return subject,text


textlist = []
df = pd.DataFrame(columns=['text'])
for data in newsgroups_train.data[0:1000]:
    subject,text = parseDocument(data)
    df.loc[subject]=text
df.head()

Unnamed: 0,text
WHAT car is this!?,\n I was wondering if anyone out there could e...
SI Clock Poll - Final Call,NNTP-Posting-Host: carson.u.washington.edu\n\n...
PB questions...,"\nwell folks, my mac plus finally gave up the ..."
Re: Weitek P9000 ?,Distribution: world\nNNTP-Posting-Host: amber....
Re: Shuttle Launch Question,\nIn article <15APR199320340428@stdvax> abdkw@...


In [10]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'].tolist())

print(vectors.shape)

(863, 29183)


In [15]:
# NMF
# NMF를 이용하여 본문에서 특성 추출
# 문서가 tf-idf를 이용하여 벡터화가 되었으면, NMF를 이용하여, 각 문서별로 특성을 추출한다.
# NMF역시 sklearn에 NMF라는 모듈로 미리 정의되어 있다. 단지, 몇개의 특징으로 압축을 해낼것인지만 지정하면 되는데, 
# 여기서는 n_components 인자를 이용하여 총 40개의 특징으로 특성을 압축하여 추출하도록 하였다.
from sklearn.decomposition import NMF

vector_array = vectors.toarray()
nmf = NMF(n_components=40)
nmf.fit(vector_array)
features = nmf.transform(vector_array)

In [17]:
print(features[0:2])

[[0.         0.         0.         0.         0.         0.02814205
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.00489334 0.27816741 0.         0.00039807 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.00191348 0.         0.
  0.         0.         0.         0.        ]
 [0.017112   0.00357931 0.         0.00105354 0.00981261 0.03423957
  0.03063488 0.00471921 0.00853192 0.         0.00052172 0.
  0.00514622 0.         0.         0.01322241 0.         0.00107501
  0.         0.         0.00048171 0.         0.         0.
  0.         0.         0.         0.         0.03547357 0.
  0.         0.         0.0091775  0.         0.         0.01668633
  0.         0.00674724 0.         0.        ]]


In [16]:
# Normalize extracted feature set
# 피쳐 정규화
# 추출된 피쳐가 피쳐마다 또는 문서마다 변화의 폭이 클 수 있기 때문에, 
# Normalizer를 이용하여 0~1사이로 스케일링을 한다. 이 정규화 역시 간단하게 아래와 같이 Normalizer 모듈을 이용하면 된다. 
from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
norm_features=normalizer.fit_transform(features)

print(norm_features[0:2])

[[0.         0.         0.         0.         0.         0.10063779
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.0174989  0.99474468 0.         0.00142351 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.00684272 0.         0.
  0.         0.         0.         0.        ]
 [0.25570293 0.05348536 0.         0.01574302 0.14662883 0.51163866
  0.45777411 0.07051868 0.12749164 0.         0.00779605 0.
  0.0768994  0.         0.         0.19758127 0.         0.0160638
  0.         0.         0.00719817 0.         0.         0.
  0.         0.         0.         0.         0.53007813 0.
  0.         0.         0.13713855 0.         0.         0.24934221
  0.         0.10082337 0.         0.        ]]


In [20]:
df_features = pd.DataFrame(norm_features,index=df.index.tolist())
df_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
WHAT car is this!?,0.0,0.0,0.0,0.0,0.0,0.100638,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.006843,0.0,0.0,0.0,0.0,0.0,0.0
SI Clock Poll - Final Call,0.255703,0.053485,0.0,0.015743,0.146629,0.511639,0.457774,0.070519,0.127492,0.0,...,0.0,0.0,0.137139,0.0,0.0,0.249342,0.0,0.100823,0.0,0.0
PB questions...,0.631929,0.062369,0.019835,0.023254,0.0,0.087839,0.0,0.007947,0.080024,0.301255,...,0.06318,0.03247,0.0,0.0,0.607835,0.111947,0.0,0.102378,0.0,0.017786
Re: Weitek P9000 ?,0.17741,0.049253,0.0,0.0,0.0,0.086839,0.665001,0.0,0.0,0.099171,...,0.040892,0.0,0.0,0.0,0.214705,0.0,0.0,0.0,0.0,0.122066
Re: Shuttle Launch Question,0.0,0.008668,0.404118,0.03023,0.0,0.0,0.0,0.0,0.0,0.00833,...,0.11529,0.12804,0.218447,0.0,0.472157,0.0,0.121696,0.348447,0.080614,0.0


In [21]:
# find similarities
article = df_features.loc['WHAT car is this!?']
print(article)

0     0.000000
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.100638
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.017499
20    0.994745
21    0.000000
22    0.001424
23    0.000000
24    0.000000
25    0.000000
26    0.000000
27    0.000000
28    0.000000
29    0.000000
30    0.000000
31    0.000000
32    0.000000
33    0.006843
34    0.000000
35    0.000000
36    0.000000
37    0.000000
38    0.000000
39    0.000000
Name: WHAT car is this!?, dtype: float64


In [22]:

similarities=df_features.dot(article)
top=similarities.nlargest()

texts = df.loc[top.index]['text'].tolist()
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    #print(text+'\n')
    i = i+1

TITLE :WHAT car is this!? Similarities:1.0
TITLE :Re: WHAT car is this!? Similarities:0.9970616525760385
TITLE :Re: New break pads & exhausts after 96K km (60K mi) on '90 Maxima? Similarities:0.9196440378877496
TITLE :Insurance Rates on Performance Cars SUMMARY Similarities:0.8546576065366513
TITLE :Re: Car Stereo Stolen? Similarities:0.820074996767362


In [23]:
i = 0
for text in texts:
    print('TITLE :'+top.index[i]+" Similarities:"+ str(top[i]))
    print(text+'\n')
    i = i+1

TITLE :WHAT car is this!? Similarities:1.0

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----






TITLE :Re: WHAT car is this!? Similarities:0.9970616525760385

In article <1993Apr20.174246.14375@wam.umd.edu> lerxst@wam.umd.edu (where's my  
thing) writes:
> 
>  I was wondering if anyone out there could enlighten me on this car I saw
> the other day. It was a 2-door sports car, looked to be from the late 60s/
> early 70s. It was called a Bricklin. The doors were really small. In  
addition,
> the