### Bag of Words

##### 1. CountVectorizer

In [1]:
# Sample sentence used as the single-document corpus by the CountVectorizer cells below.
text = 'My wife likes to watch baseball games and my daughter likes to watch baseball games too.'

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words model with default settings.
cvect = CountVectorizer()
# Learn the vocabulary from the one-document corpus and count each token in one step.
output = cvect.fit_transform([text])
# Convert to a dense array so the counts are shown as the cell output.
output.toarray()

array([[1, 2, 1, 2, 2, 2, 2, 1, 2, 1]], dtype=int64)

In [3]:
# Learned mapping from each token to its column index in the count matrix.
cvect.vocabulary_

{'my': 5,
 'wife': 9,
 'likes': 4,
 'to': 6,
 'watch': 8,
 'baseball': 1,
 'games': 3,
 'and': 0,
 'daughter': 2,
 'too': 7}

- 불용어(Stopwords) 처리

In [4]:
# Specify the stop words explicitly (note: despite its name, my_set is a list).
my_set = ['to', 'and', 'my']
cvect = CountVectorizer(stop_words=my_set)
# Print the count matrix first, then the token -> column index mapping.
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[2 1 2 2 1 2 1]]
{'wife': 6, 'likes': 3, 'watch': 5, 'baseball': 0, 'games': 2, 'daughter': 1, 'too': 4}


In [5]:
# Use the English stop-word list that ships with scikit-learn.
cvect = CountVectorizer(stop_words='english')
# Print the count matrix first, then the token -> column index mapping.
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[2 1 2 2 2 1]]
{'wife': 5, 'likes': 3, 'watch': 4, 'baseball': 0, 'games': 2, 'daughter': 1}


In [6]:
# Use the English stop-word list provided by NLTK.
# NOTE(review): stopwords.words() presumably requires the NLTK 'stopwords' corpus
# to be available locally — confirm before running on a fresh environment.
from nltk.corpus import stopwords
sw = stopwords.words('english')
cvect = CountVectorizer(stop_words=sw)
# Print the count matrix first, then the token -> column index mapping.
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[2 1 2 2 2 1]]
{'wife': 5, 'likes': 3, 'watch': 4, 'baseball': 0, 'games': 2, 'daughter': 1}


- 인덱스에 해당하는 단어를 알려주는 함수

In [7]:
# Print every "word index" pair of the current vectorizer's vocabulary, one per line.
voca = cvect.vocabulary_
for entry in voca.items():
    # entry is a (word, index) tuple; unpacking prints the two values separated by a space.
    print(*entry)

wife 5
likes 3
watch 4
baseball 0
games 2
daughter 1


In [8]:
def get_word(index, voca):
    """Return the word whose vocabulary index equals ``index``.

    Args:
        index: Column index to look up.
        voca: Mapping of word -> index, such as a fitted vectorizer's
            ``vocabulary_``.

    Returns:
        The first word (in the mapping's iteration order) whose value equals
        ``index``, or ``None`` when no entry matches.
    """
    # Lazy scan: stops at the first match, exactly like an early return in a loop.
    matches = (word for word, position in voca.items() if position == index)
    return next(matches, None)
        
# Look up the word stored at column index 4 of the current vocabulary.
get_word(4, cvect.vocabulary_)

'watch'

- Cupid

In [19]:
# Read the whole text file. The context manager guarantees the file handle is
# closed, which the previous bare open(...).read() did not.
with open('data/cupid(twin_ver).txt') as cupid_file:
    cupid = cupid_file.read()

# Count the words of the text as a single document, dropping scikit-learn's
# built-in English stop words.
cvect = CountVectorizer(stop_words='english')
# Print the count matrix first, then the token -> column index mapping.
print(cvect.fit_transform([cupid]).toarray())
print(cvect.vocabulary_)

[[ 1  1  4  1  1  2  8  1  4  1  4  6  1  3  4  1  1  1  2  4  2  4  4  1
   4  1  1  8  2  1  4  1 10  4  1  2  5  4  1  1  1  2  4  1  1  1  4  2
   4  2]]
{'hopeless': 20, 'romantic': 34, 'life': 23, 'surrounded': 43, 'couples': 4, 'time': 44, 'guess': 18, 'sign': 40, 'oh': 32, 'feeling': 11, 'lonely': 24, 'wish': 49, 'lover': 28, 'hold': 19, 'crying': 5, 'room': 35, 'skeptical': 41, 'love': 27, 'say': 36, 'want': 46, 'gave': 14, 'second': 37, 'chance': 2, 'cupid': 6, 'left': 22, 'stupid': 42, 'way': 48, 'makes': 30, 'feel': 10, 'isn': 21, 'real': 33, 'dumb': 8, 'look': 25, 'arrows': 0, 'everyday': 9, 'got': 17, 'lost': 26, 'flew': 12, 'away': 1, 'waiting': 45, 'waste': 47, 'counting': 3, 'days': 7, 'november': 31, 'loving': 29, 'good': 16, 'girl': 15, 'seeking': 38, 'share': 39, 'fool': 13}


In [11]:
# vocabulary_ maps word -> column index, so its items are (word, index) tuples.
# Sorting those tuples in reverse orders them by word, descending (reverse
# alphabetical) — this is NOT a ranking by word frequency.
vocab_items = list(cvect.vocabulary_.items())
vocab_items.sort(reverse=True)
vocab_items[:5]

[('wish', 49), ('way', 48), ('waste', 47), ('want', 46), ('waiting', 45)]

##### 2. TF-IDF (Term Frequency - Inverse Document Frequency)

In [12]:
# Two-document corpus: the sentence used earlier plus a shorter second sentence.
# Note that `text` is rebound here from a single string to a list of strings.
text = ['My wife likes to watch baseball games and my daughter likes to watch baseball games too.',
        'My wife likes to play baseball.']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighting with scikit-learn's built-in English stop words removed.
tvect = TfidfVectorizer(stop_words='english')
# Dense view of the weights: one row per document, one column per vocabulary term.
tvect.fit_transform(text).toarray()

array([[0.38649245, 0.27160082, 0.54320165, 0.38649245, 0.        ,
        0.54320165, 0.19324622],
       [0.44832087, 0.        , 0.        , 0.44832087, 0.63009934,
        0.        , 0.44832087]])

In [14]:
# Token -> column index mapping learned by the TF-IDF vectorizer.
print(tvect.vocabulary_)

{'wife': 6, 'likes': 3, 'watch': 5, 'baseball': 0, 'games': 2, 'daughter': 1, 'play': 4}


In [15]:
# Raw term counts on the same two-document corpus with the same stop-word setting,
# for comparison with the TF-IDF weights.
cvect = CountVectorizer(stop_words='english')
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[2 1 2 2 0 2 1]
 [1 0 0 1 1 0 1]]
{'wife': 6, 'likes': 3, 'watch': 5, 'baseball': 0, 'games': 2, 'daughter': 1, 'play': 4}


##### 3. N-gram

In [20]:
# Two sentences made of the same words in a different order.
text = ['I work at google.', 'I google at work.']
# Default settings count single words only, so word order is not captured.
cvect = CountVectorizer()
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1]
 [1 1 1]]
{'work': 2, 'at': 0, 'google': 1}


In [21]:
# ngram_range=(1, 2) extracts both single words and two-word sequences as features.
cvect = CountVectorizer(ngram_range=(1, 2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 0 1 0 1 1]
 [1 0 1 1 1 1 0]]
{'work': 5, 'at': 0, 'google': 3, 'work at': 6, 'at google': 1, 'google at': 4, 'at work': 2}


### Hyper Parameter

In [22]:
# Show the parameters of the current CountVectorizer instance.
cvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [23]:
# Show the parameters of the TfidfVectorizer instance created earlier.
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}