In [35]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taekyung\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Bag of Words

In [36]:
from konlpy.tag import Okt

okt = Okt()

In [52]:
def bag_of_words(document):
    document = document.replace(".","")
    tokenized_words = okt.morphs(document)
    word_to_index = {}
    bow = []
    
    for word in tokenized_words:
        if word not in word_to_index.keys():
            word_to_index[word] = len(word_to_index)
            # print(word_to_index)
            bow.insert(len(word_to_index)-1,1)
        else:
            index = word_to_index.get(word)
            bow[index] = bow[index]+1
            
    return word_to_index, bow
        


In [53]:
doc1 = "정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."
vocab , bow = bag_of_words(doc1)

print(vocab)
print(bow)

{'정부': 0, '가': 1, '발표': 2, '하는': 3, '물가상승률': 4, '과': 5, '소비자': 6, '느끼는': 7, '은': 8, '다르다': 9}
[1, 2, 1, 1, 2, 1, 1, 1, 1, 1]


In [55]:
doc2 = '소비자는 주로 소비하는 상품을 기준으로 물가상승률을 느낀다.'
vocab, bow = bag_of_words(doc2)
print(vocab)
print(bow)

{'소비자': 0, '는': 1, '주로': 2, '소비': 3, '하는': 4, '상품': 5, '을': 6, '기준': 7, '으로': 8, '물가상승률': 9, '느낀다': 10}
[1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1]


### CountVectorizer 클래스로 BoW 만들기

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()

# 코퍼스로부터 각 단어의 빈도수를 기록하기
print(vector.fit_transform(corpus).toarray())

# 각 단어의 인덱스가 어떻게 부여되어 있는지 찾기
print(vector.vocabulary_)

[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


- 단어 I 가 없어진 이유: countvectorizer가 기본적으로 길이가 2이상인 문자에 대해서만 토큰으로 인식하고 있기 때문
- CountVectorizer가 띄어쓰기만을 기준으로 토큰화하고 Bow를 만든다. 따라서, 영어는 괜찮으나 한국에서는 품질이 떨어진다

In [60]:
# 한국어로 테스트 해보기
corpus = ["정부가 발표하는 물가상승률과 소비자가 느끼는 물가상승률은 다르다."]
vector = CountVectorizer()

print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[1 1 1 1 1 1 1]]
{'정부가': 6, '발표하는': 4, '물가상승률과': 2, '소비자가': 5, '느끼는': 0, '물가상승률은': 3, '다르다': 1}


띄어쓰기로만 판단해서 아래처럼 물가상승률과 / 물가상승률은은 따로 인식한다

### 불용어를 제거한 BOW

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

(1) 사용자가 직접 정의한 불용어를 사용하기

In [65]:
text = ["Family is not an important thing. It's everything."]
vector = CountVectorizer(stop_words=["the",'a','an','is','not'])

print(vector.fit_transform(text).toarray())
print(vector.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


(2) 자체 내장되어 있는 불용어 사용하기

In [68]:
text = ["Family is not an important thing. It's everything."]
vector = CountVectorizer(stop_words = "english")

print(vector.fit_transform(text).toarray())
print(vector.vocabulary_)

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


(3) NLTK에서 지원하는 불용어 사용하기

In [78]:
text = ["Family is not an important thing. It's everything."]
stop_words = stopwords.words('english')
# print(stop_words)

vect = CountVectorizer(stop_words = stop_words)
print(vect.fit_transform(text).toarray())
print(vect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


### TF-IDF

tf(t,d) = 문서 d에서 t의 출현 횟수 / 문서 d의 총 단어 수

idf(t,D) = ln(문서수 / (1+ 단어 t가 나오는 문서수))

In [89]:
from math import log
import pandas as pd

docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요'
    ] 

# for doc in docs:
#     for w in doc.split(" "):
#         print(w)

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
print(vocab)
print(len(vocab))

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']
9


In [101]:
# 문서의 총수
N = len(docs)

def tf(t,d):
    return d.count(t)

def idf(t):
    # 단어 t가 나오는 문서수 찾기
    df = 0
    for doc in docs:
        # df +=t in doc
        if t in doc:
            df +=1
    return log(N/(1+df))


def tfidf(t,d):
    return tf(t,d)*idf(t)

In [103]:
result = []

for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tf(t,d))
print(result)

tf_ = pd.DataFrame(result,columns = vocab)
tf_

[[0, 0, 0, 1, 0, 1, 1, 0, 0], [0, 0, 0, 1, 1, 0, 1, 0, 0], [0, 1, 1, 0, 2, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 1, 1]]


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [106]:
result = []

for j in range(len(vocab)):
    t = vocab[j]
    result.append(idf(t))
    
print(result)

idf_ = pd.DataFrame(result, index = vocab, columns=["IDF"])
idf_

[0.6931471805599453, 0.6931471805599453, 0.6931471805599453, 0.28768207245178085, 0.28768207245178085, 0.6931471805599453, 0.28768207245178085, 0.6931471805599453, 0.6931471805599453]


Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [107]:
result = []
for i in range(N):
    result.append([])
    d = docs[i]
    for j in range(len(vocab)):
        t = vocab[j]
        result[-1].append(tfidf(t,d))
        
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


### 사이킷런을 이용한 DTM(document - term matrix) 와 TF-IDF

In [109]:
from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

vector = CountVectorizer()

print(vector.fit_transform(corpus).toarray())
print(vector.vocabulary_)

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [123]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',    
]

tfid_v = TfidfVectorizer().fit(corpus)
# print(tfid_v.transform(corpus).toarray())
tfidf_df= pd.DataFrame(tfid_v.transform(corpus).toarray(), columns = vocab)
print(tfid_v.vocabulary_)
tfidf_df

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.467351,0.0,0.467351,0.0,0.467351,0.0,0.355432,0.467351
1,0.0,0.0,0.795961,0.0,0.0,0.0,0.0,0.605349,0.0
2,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0
