# Bag of Words
### 1. CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Sample sentence; later cells wrap it in a list and vectorize it as a one-document corpus.
text = 'The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research.'

In [None]:
# Build a vectorizer with all-default settings and display those settings.
cvect = CountVectorizer()
cvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [None]:
# Fit the vectorizer on the single document and encode it in one step.
# The bare `output` on the last line displays the resulting matrix object.
output = cvect.fit_transform([text])
output

<1x15 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [None]:
# Convert to a dense array so the individual per-column counts are visible.
output.toarray()

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 2, 1, 1, 1]])

In [None]:
# Fitted mapping from each token to its column index in the count matrix.
cvect.vocabulary_

{'been': 0,
 'collected': 1,
 'collection': 2,
 'for': 3,
 'have': 4,
 'is': 5,
 'messages': 6,
 'of': 7,
 'research': 8,
 'set': 9,
 'sms': 10,
 'spam': 11,
 'tagged': 12,
 'that': 13,
 'the': 14}

- 불용어 처리

In [None]:
# Remove a hand-picked list of stop words.
# ('a' is never tokenized anyway: the default token_pattern only keeps
# tokens of two or more word characters.)
cvect = CountVectorizer(stop_words = ['is', 'a', 'the'])
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 1 1 1 1 3 2 1 1]]
{'sms': 9, 'spam': 10, 'collection': 2, 'set': 8, 'of': 6, 'tagged': 11, 'messages': 5, 'that': 12, 'have': 4, 'been': 0, 'collected': 1, 'for': 3, 'research': 7}


In [None]:
# Use the English stop-word list that ships with scikit-learn.
cvect = CountVectorizer(stop_words = 'english')
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 3 2 1]]
{'sms': 5, 'spam': 6, 'collection': 1, 'set': 4, 'tagged': 7, 'messages': 2, 'collected': 0, 'research': 3}


In [10]:
# Use NLTK's stop words: download the 'stopwords' corpus first.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
# Load NLTK's English stop words and check how many there are and what
# container type they come in.
from nltk.corpus import stopwords
sw = stopwords.words('english')
len(sw), type(sw)

(179, list)

In [13]:
# Vectorize again, this time passing the NLTK list as the stop words.
cvect = CountVectorizer(stop_words = sw)
print(cvect.fit_transform([text]).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1 3 2 1]]
{'sms': 5, 'spam': 6, 'collection': 1, 'set': 4, 'tagged': 7, 'messages': 2, 'collected': 0, 'research': 3}


- 인덱스에 해당하는 단어가 무엇인지 알려주는 함수 제작

In [14]:
# Walk the fitted vocabulary and print each token next to its column index.
voca = cvect.vocabulary_
for key, value in voca.items():
  print(key, value)

sms 5
spam 6
collection 1
set 4
tagged 7
messages 2
collected 0
research 3


In [15]:
def get_word(index, voca):
  """Return the key of `voca` whose value equals `index`.

  `voca` is scanned in its iteration order and the first matching key
  is returned. If no value equals `index`, the result is None.
  """
  matches = (word for word, position in voca.items() if position == index)
  return next(matches, None)

In [16]:
# Look up which token sits at column index 3 of the fitted vocabulary.
get_word(3, cvect.vocabulary_)

'research'

### 2. N-gram

In [18]:
# `text` is now a list of documents (it was a single string earlier), so it
# is passed to fit_transform directly. Both sentences use the same words in
# a different order, so their single-word counts come out identical.
text = ['I work at google.', 'I google at work']
cvect = CountVectorizer()
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1]
 [1 1 1]]
{'work': 2, 'at': 0, 'google': 1}


In [19]:
# Count two-word sequences in addition to single words; word order now
# shows up in the features, so the two sentences get different rows.
cvect = CountVectorizer(ngram_range = (1,2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 0 1 0 1 1]
 [1 0 1 1 1 1 0]]
{'work': 5, 'at': 0, 'google': 3, 'work at': 6, 'at google': 1, 'google at': 4, 'at work': 2}


### 3. TfidfVectorizer (Term Frequency, Inverse Document Frequency)

In [22]:
# Two-document corpus used by the cells that follow.
# NOTE: 'acording' (sic) is kept as written; it appears as a vocabulary term
# in the recorded outputs, so correcting it would change those results.
text = ['The SMS Spam Collection is a set of SMS tagged messages that have been collected for SMS Spam research.', 
        'It contains one set of SMS messages in English of 5,574 messages, tagged acording being ham (legitimate) or spam.']

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the corpus with TfidfVectorizer, using scikit-learn's built-in
# English stop words, and print the resulting weights as a dense array.
tvect = TfidfVectorizer(stop_words = 'english')
print(tvect.fit_transform(text).toarray())

[[0.         0.         0.30015142 0.30015142 0.         0.
  0.         0.         0.21356021 0.30015142 0.21356021 0.64068062
  0.42712041 0.21356021]
 [0.31544091 0.31544091 0.         0.         0.31544091 0.31544091
  0.31544091 0.31544091 0.44887761 0.         0.2244388  0.2244388
  0.2244388  0.2244388 ]]


In [24]:
# Raw counts for the same corpus and the same stop-word setting, for
# comparison with the TF-IDF weights; the vocabulary shows which column
# belongs to which token.
cvect = CountVectorizer(stop_words='english')
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[0 0 1 1 0 0 0 0 1 1 1 3 2 1]
 [1 1 0 0 1 1 1 1 2 0 1 1 1 1]]
{'sms': 11, 'spam': 12, 'collection': 3, 'set': 10, 'tagged': 13, 'messages': 8, 'collected': 2, 'research': 9, 'contains': 4, 'english': 5, '574': 0, 'acording': 1, 'ham': 6, 'legitimate': 7}
