In [1]:
import numpy as np
import pandas as pd

In [2]:
# Text data (corpus).
# Each element of TEXT below represents one document.
# To use the CountVectorizer or TfidfVectorizer classes, every document
# has to be stored as a single string made up of its words.
TEXT = [
    "banana apple apple eggplant",
    "orange carrot banana eggplant",
    "apple carrot banana banana",
    "orange banana grape",
]

# CountVectorizer 이용하기

In [3]:
from sklearn.feature_extraction.text import CountVectorizer # frequency based DTM
# Build the count-based document-term matrix for TEXT.
#   ngram_range=(1, 1): single words only
#   min_df=2:           drop terms found in fewer than 2 documents
#   max_df=0.8:         drop terms found in more than 80% of the documents
tf_vectorizer = CountVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.8,
)
tf_features = tf_vectorizer.fit_transform(TEXT)

In [4]:
# Printing the sparse matrix lists its stored entries as "(row, column)  value" pairs
print(tf_features)

  (0, 0)	2
  (0, 2)	1
  (1, 2)	1
  (1, 3)	1
  (1, 1)	1
  (2, 0)	1
  (2, 1)	1
  (3, 3)	1


In [5]:
# Convert the sparse count matrix to a dense ndarray. toarray() returns the
# ndarray directly, with no intermediate np.matrix from todense().
features = tf_features.toarray()

In [6]:
# Display the dense count matrix (rows: documents, columns: terms)
features

array([[2, 0, 1, 0],
       [0, 1, 1, 1],
       [1, 1, 0, 0],
       [0, 0, 0, 1]], dtype=int64)

In [7]:
# Vector of the first document (row 0 of the count matrix)
features[0]

array([2, 0, 1, 0], dtype=int64)

In [8]:
# Vector of the second document (row 1 of the count matrix)
features[1]

array([0, 1, 1, 1], dtype=int64)

In [9]:
# Check the names of the words (the term behind each matrix column)
feature_names = tf_vectorizer.get_feature_names_out()
print(feature_names)

['apple' 'carrot' 'eggplant' 'orange']


In [10]:
# Show the count matrix as a table: one row per document, one column per term.
# pandas is already imported as `pd` in the first cell, so it is not re-imported here.
df = pd.DataFrame(data=features, columns=feature_names)
df

Unnamed: 0,apple,carrot,eggplant,orange
0,2,0,1,0
1,0,1,1,1
2,1,1,0,0
3,0,0,0,1


문서들 간의 유사도 계산해 보기

In [11]:
# Euclidean distance between the first and second documents (count vectors)
euclidean_distance = np.linalg.norm(features[1] - features[0])
print(euclidean_distance)

2.449489742783178


In [12]:
# Cosine similarity between the first and second documents:
# dot product of the two count vectors divided by the product of their lengths
dot_product = np.dot(features[0], features[1])
norm_product = np.linalg.norm(features[0]) * np.linalg.norm(features[1])
print(dot_product / norm_product)

0.2581988897471611


# TfidfVectorizer 이용하기

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer # tf-idf based DTM
# Build the TF-IDF document-term matrix for TEXT, using the same vocabulary
# filters as the count-based vectorizer (single words, min_df=2, max_df=0.8).
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.8,
)
tfidf_features = tfidf_vectorizer.fit_transform(TEXT)

In [14]:
# Printing the sparse TF-IDF matrix lists its stored entries as "(row, column)  value" pairs
print(tfidf_features)

  (0, 2)	0.4472135954999579
  (0, 0)	0.8944271909999159
  (1, 1)	0.5773502691896257
  (1, 3)	0.5773502691896257
  (1, 2)	0.5773502691896257
  (2, 1)	0.7071067811865475
  (2, 0)	0.7071067811865475
  (3, 3)	1.0


In [15]:
# Dense TF-IDF matrix (rows: documents, columns: terms).
# It is recomputed from the fitted vectorizer rather than by calling
# .todense() on `tfidf_features` itself, so re-running this cell still works
# once the name already holds a dense ndarray.
tfidf_features = tfidf_vectorizer.transform(TEXT).toarray()
tfidf_features

array([[0.89442719, 0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.57735027],
       [0.70710678, 0.70710678, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [16]:
# Vector of the first document (row 0 of the TF-IDF matrix)
tfidf_features[0]

array([0.89442719, 0.        , 0.4472136 , 0.        ])

In [17]:
# Label each TF-IDF column with its term and show the matrix as a table
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(tfidf_features, columns=tfidf_feature_names)
df

Unnamed: 0,apple,carrot,eggplant,orange
0,0.894427,0.0,0.447214,0.0
1,0.0,0.57735,0.57735,0.57735
2,0.707107,0.707107,0.0,0.0
3,0.0,0.0,0.0,1.0


In [18]:
# Length (L2 norm) of the first TF-IDF vector = 1, up to floating-point rounding
np.linalg.norm(tfidf_features[0])

0.9999999999999999

In [45]:
# Euclidean distance between the first and second documents (TF-IDF vectors)
np.linalg.norm(tfidf_features[1]-tfidf_features[0])

1.2180321098007547

### 정규화 (normalization)를 하지 않는 경우

In [19]:
# Set norm=None so the TF-IDF rows are not normalized
tfidf_vectorizer1 = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=2,
    max_df=0.8,
    norm=None,
)
tfidf_features1 = tfidf_vectorizer1.fit_transform(TEXT)

In [21]:
# Table of the un-normalized TF-IDF weights.
# Column labels are taken from tfidf_vectorizer1, the vectorizer that produced
# this matrix, so labels and columns stay aligned even if its vocabulary
# settings later differ from the normalized vectorizer's.
df1 = pd.DataFrame(
    data=tfidf_features1.toarray(),
    columns=tfidf_vectorizer1.get_feature_names_out(),
)
df1

Unnamed: 0,apple,carrot,eggplant,orange
0,3.021651,0.0,1.510826,0.0
1,0.0,1.510826,1.510826,1.510826
2,1.510826,1.510826,0.0,0.0
3,0.0,0.0,0.0,1.510826
