# 텍스트 분석 - BOW(Bag of Words)
- 딥 러닝을 이용한 자연어 처리 입문에서 발췌

## Count Vectorizer

In [26]:
from sklearn.feature_extraction.text import CountVectorizer

# Single-document corpus used to demonstrate a basic bag-of-words count.
corpus = ['you know I want your love. because I love you']

# Learn the vocabulary from the corpus, then encode the same corpus as counts.
cv = CountVectorizer()
cv.fit(corpus)
output = cv.transform(corpus)

# Dense count matrix: one row per document, one column per vocabulary entry.
print(output.toarray())
# Token -> column index mapping. In the recorded output the single-character
# token 'I' is absent from this mapping.
print(cv.vocabulary_)

[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


## 불용어를 제거한 BoW(Bag of Words)
자주 등장하지만 분석에 큰 의미가 없는 단어(불용어) 제거

In [27]:
# Remove stop words using a hand-written list.
text = ["Family is not an import thing. It's everything."]

# Tokens listed in stop_words are excluded from the vocabulary
# ('is', 'not' and 'an' are missing from the recorded output).
cv = CountVectorizer(
    stop_words=['the', 'a', 'an', 'is', 'not']
)

# fit_transform learns the vocabulary and encodes the text in one call.
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'import': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [28]:
# Use the stop-word list provided by scikit-learn.
text = ["Family is not an import thing. It's everything."]

# The string 'english' selects the built-in list instead of a custom one.
# In the recorded output only 'family', 'import' and 'thing' survive.
cv = CountVectorizer(
    stop_words='english'
)

print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1]]
{'family': 0, 'import': 1, 'thing': 2}


In [29]:
# Use the stop-word list provided by NLTK.
from nltk.corpus import stopwords
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand -- confirm in the target environment.
sw = stopwords.words('english')
# Number of entries in the list (179 in the recorded output).
len(sw)

179

In [30]:
# Same sentence as the previous stop-word examples, for comparison.
text = ["Family is not an import thing. It's everything."]

# Pass the NLTK list `sw` as the stop words. Unlike the built-in 'english'
# result, the recorded output here keeps 'everything'.
cv = CountVectorizer(
    stop_words=sw
)

print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'import': 2, 'thing': 3, 'everything': 0}


In [31]:
def get_word(index, vocabularies):
    """Return the first key of ``vocabularies`` whose value equals ``index``.

    Args:
        index: Value to search for (e.g. a column index of a fitted vocabulary).
        vocabularies: Mapping scanned in its iteration order.

    Returns:
        The matching key, or ``None`` when no value equals ``index``.
    """
    matches = (
        word
        for word, position in vocabularies.items()
        if position == index
    )
    # next() with a default mirrors the implicit ``None`` of a loop that
    # finishes without finding a match.
    return next(matches, None)
# Look up which token is mapped to index 3 in the vocabulary of the most
# recently fitted `cv`.
get_word(3, cv.vocabulary_)

'thing'

## N-gram
- BoW의 단점을 보완한 알고리즘
- 자주 사용되는 단어 또는 스펠링 체크에서 사용

In [32]:
# Baseline for the n-gram examples: default CountVectorizer settings.
# The recorded output contains single-word entries only.
text = ["Machine learning is fun and is not boring"]
cv = CountVectorizer()
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [33]:
# ngram_range=(1,2): the vocabulary holds both single words and two-word
# sequences (e.g. 'machine learning' in the recorded output).
text = ["Machine learning is fun and is not boring"]
cv = CountVectorizer(
    ngram_range=(1,2)
)
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'learning': 8, 'is': 5, 'fun': 3, 'and': 0, 'not': 12, 'boring': 2, 'machine learning': 11, 'learning is': 9, 'is fun': 6, 'fun and': 4, 'and is': 1, 'is not': 7, 'not boring': 13}


In [34]:
# ngram_range=(1,3): additionally includes three-word sequences
# (e.g. 'machine learning is' in the recorded output).
text = ["Machine learning is fun and is not boring"]
cv = CountVectorizer(
    ngram_range=(1,3)
)
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 15, 'learning': 12, 'is': 7, 'fun': 4, 'and': 0, 'not': 18, 'boring': 3, 'machine learning': 16, 'learning is': 13, 'is fun': 8, 'fun and': 5, 'and is': 1, 'is not': 10, 'not boring': 19, 'machine learning is': 17, 'learning is fun': 14, 'is fun and': 9, 'fun and is': 6, 'and is not': 2, 'is not boring': 11}


In [35]:
# Combine n-grams with stop-word removal. In the recorded output the bigrams
# are built from the remaining tokens (e.g. 'learning fun', 'fun boring').
text = ["Machine learning is fun and is not boring"]
cv = CountVectorizer(
    ngram_range=(1,2),
    stop_words='english'
)
print(cv.fit_transform(text).toarray())
print(cv.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 6, 'learning fun': 4, 'fun boring': 2}


## CountVectorizer의 파라미터

In [36]:
# Show the parameters of the current `cv`. The recorded output reflects the
# ngram_range=(1, 2) and stop_words='english' settings used just before.
cv.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': 'english',
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

## TF-IDF Vectorizer
(단어 빈도-역 문서 빈도, Term Frequency-Inverse Document Frequency)

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three-document corpus shared by the count and TF-IDF examples.
corpus = [
    'you know I want your love',
    'I like you',
    'shat should I do'
]

# Raw term counts, shown first as a reference point for the TF-IDF weights.
cv = CountVectorizer()
print(cv.fit_transform(corpus).toarray())
print(cv.vocabulary_)

[[0 1 0 1 0 0 1 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 1 0 0 0]]
{'you': 7, 'know': 1, 'want': 6, 'your': 8, 'love': 3, 'like': 2, 'shat': 4, 'should': 5, 'do': 0}


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same three-document corpus as the count example.
corpus = [
    'you know I want your love',
    'I like you',
    'shat should I do'
]

# TF-IDF weights instead of raw counts. In the recorded output 'you', which
# occurs in two documents, gets a lower weight in the first row (0.355) than
# the words that occur only there (0.467).
tv = TfidfVectorizer()
print(tv.fit_transform(corpus).toarray())
# The token -> column index mapping is identical to the CountVectorizer one.
print(tv.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.
  0.46735098 0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.57735027
  0.         0.         0.        ]]
{'you': 7, 'know': 1, 'want': 6, 'your': 8, 'love': 3, 'like': 2, 'shat': 4, 'should': 5, 'do': 0}
