# Bag of Words

### 1. Count Vectorizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Single-document toy corpus; "you" and "love" each occur twice in it.
corpus = [
    'you know i want your love. because i love you.',
]

In [6]:
# Learn the vocabulary and build the document-term count matrix in one call,
# then display it as a dense array (one row per document, one column per token).
cvet = CountVectorizer()
output = cvet.fit_transform(corpus)
output.toarray()

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

In [7]:
# Token -> column-index mapping learned during fit; these indices label the
# columns of the count array shown by the previous cell.
cvet.vocabulary_

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

### 불용어를 제거한 BOW

In [9]:
# 1) Remove stop words using a hand-written list
text = [
    "family is not an important thing. it's everything.",
]
cvet = CountVectorizer(
    stop_words=['the', 'a', 'an', 'is', 'are', 'not'],
)

### 2) Scikit-learn에서 제공하는 불용어


In [12]:
# NOTE(review): `cvet` is whichever vectorizer was defined most recently --
# presumably the hand-written stop-word list from step 1 rather than a
# scikit-learn built-in list; confirm if cells are run out of order.
cvet.fit(text)
output = cvet.transform(text)
print(output.toarray())
print(cvet.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [15]:
# Switch to scikit-learn's built-in English stop-word list.
cvet = CountVectorizer(stop_words='english').fit(text)
output = cvet.transform(text)
print(output.toarray(), cvet.vocabulary_, sep='\n')

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


### 3) NLTK에서 제공하는 불용어 사전

In [17]:
import nltk
from nltk.corpus import stopwords

# The stop-word corpus is a separately downloaded NLTK resource. Where it has
# not been fetched yet, the lookup raises LookupError, so download it once and
# retry; this keeps the notebook runnable in a fresh environment.
try:
    sw = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    sw = stopwords.words('english')

# Number of entries in the English stop-word list.
len(sw)

179

In [20]:
# Use the NLTK English stop-word list loaded into `sw` as the vectorizer's filter.
cvet = CountVectorizer(stop_words=sw)
cvet.fit(text)
print(cvet.transform(text).toarray())
print(cvet.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


### N-gram

In [22]:
# Baseline for the n-gram examples: a vectorizer built with no arguments,
# fitted on a single sample sentence.
text = ['Machine learning is fun and is not boring.']
cvet = CountVectorizer().fit(text)
print(cvet.transform(text).toarray())
print(cvet.vocabulary_)

[[1 1 1 2 1 1 1]]
{'machine': 5, 'learning': 4, 'is': 3, 'fun': 2, 'and': 0, 'not': 6, 'boring': 1}


In [23]:
# List every vectorizer parameter with its current value; the output shows
# ngram_range at (1, 1), i.e. single words only.
cvet.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [24]:
# Use both unigrams and bigrams
cvet2 = CountVectorizer(ngram_range=(1, 2))
cvet2.fit(text)
print(cvet2.transform(text).toarray(), cvet2.vocabulary_, sep='\n')

[[1 1 1 1 1 2 1 1 1 1 1 1 1 1]]
{'machine': 10, 'learning': 8, 'is': 5, 'fun': 3, 'and': 0, 'not': 12, 'boring': 2, 'machine learning': 11, 'learning is': 9, 'is fun': 6, 'fun and': 4, 'and is': 1, 'is not': 7, 'not boring': 13}


In [25]:
# Use unigrams, bigrams and trigrams together
cvet3 = CountVectorizer(ngram_range=(1, 3)).fit(text)
print(cvet3.transform(text).toarray())
print(cvet3.vocabulary_)

[[1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1]]
{'machine': 15, 'learning': 12, 'is': 7, 'fun': 4, 'and': 0, 'not': 18, 'boring': 3, 'machine learning': 16, 'learning is': 13, 'is fun': 8, 'fun and': 5, 'and is': 1, 'is not': 10, 'not boring': 19, 'machine learning is': 17, 'learning is fun': 14, 'is fun and': 9, 'fun and is': 6, 'and is not': 2, 'is not boring': 11}


In [26]:
# Typical setup: unigrams + bigrams with English stop words removed
cvet = CountVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
)

In [27]:
# Fit on the sample sentence, then print the count matrix followed by the
# learned vocabulary.
cvet.fit(text)
print(cvet.transform(text).toarray())
print(cvet.vocabulary_)

[[1 1 1 1 1 1 1]]
{'machine': 5, 'learning': 3, 'fun': 1, 'boring': 0, 'machine learning': 6, 'learning fun': 4, 'fun boring': 2}


## TF-IDF Vector

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Three short documents used to compare raw counts with TF-IDF weights.
corpus = ['you know i want your love', 'i like you', 'what should i do']

In [31]:
# Raw term counts for the three documents, as a reference point for the
# TF-IDF weights computed in the following cells.
cvet = CountVectorizer()
cvet.fit(corpus)
print(cvet.transform(corpus).toarray(), cvet.vocabulary_, sep='\n')

[[0 1 0 1 0 1 0 1 1]
 [0 0 1 0 0 0 0 1 0]
 [1 0 0 0 1 0 1 0 0]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [34]:
# Fit the TF-IDF vectorizer on the corpus and keep the weighted matrix in
# `output`; the vocabulary is printed so it can be compared with the
# CountVectorizer one.
tvet = TfidfVectorizer()
output = tvet.fit_transform(corpus)
print(tvet.vocabulary_)

{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [37]:
# Dense view of the matrix held in `output`: one row per document, columns
# ordered by the indices in the vectorizer's vocabulary_.
output.toarray()

array([[0.        , 0.46735098, 0.        , 0.46735098, 0.        ,
        0.46735098, 0.        , 0.35543247, 0.46735098],
       [0.        , 0.        , 0.79596054, 0.        , 0.        ,
        0.        , 0.        , 0.60534851, 0.        ],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.57735027, 0.        , 0.        ]])

In [38]:
# List the TF-IDF vectorizer's parameters with their current values.
tvet.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [41]:
# Typical setup: unigrams + bigrams with English stop words removed.
# With this corpus the third document ends up with no features: none of its
# words appear in the printed vocabulary, so its row is all zeros.
tvet = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tvet.fit(corpus)
print(tvet.transform(corpus).toarray())
print(tvet.vocabulary_)

[[0.4472136 0.4472136 0.        0.4472136 0.4472136 0.4472136]
 [0.        0.        1.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.       ]]
{'know': 0, 'want': 4, 'love': 3, 'know want': 1, 'want love': 5, 'like': 2}
