# count vectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['you know I want your love, because I love you']

In [30]:
count_vect = CountVectorizer()
output = count_vect.fit_transform(corpus)

In [31]:
count_vect.vocabulary_ # 각 단어의 index 넘버 가르쳐줌

{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}

In [32]:
output.toarray() # 각 단어의 개수

array([[1, 1, 2, 1, 2, 1]], dtype=int64)

- 불용어 제거

In [33]:
# 자체 제거
text = ["Family is not an important thing. It's everything"]
cvect = CountVectorizer(stop_words=['the','a','an','is'])
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)


[[1 1 1 1 1 1]]
{'family': 1, 'not': 4, 'important': 2, 'thing': 5, 'it': 3, 'everything': 0}


In [34]:
# sklearn에서 불용어 사용
cvect = CountVectorizer(stop_words=['english'])
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)


[[1 1 1 1 1 1 1 1]]
{'family': 2, 'is': 4, 'not': 6, 'an': 0, 'important': 3, 'thing': 7, 'it': 5, 'everything': 1}


In [35]:
# nltk에서 제공하는 불용어사용
from nltk.corpus import stopwords
sw = stopwords.words('english')
len(sw)

179

In [36]:
cvect = CountVectorizer(stop_words=sw)
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 3, 'everything': 0}


In [37]:
def get_word(index, voca):
    for key, value in voca.items():
        if value == index :
            return key
            

In [38]:
get_word(3,cvect.vocabulary_)

'thing'

N-gram

In [39]:
text = ['I work at google']
cvect = CountVectorizer(ngram_range=(1,2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1]]
{'work': 3, 'at': 0, 'google': 2, 'work at': 4, 'at google': 1}


In [40]:
text = ['I goolge at work']
cvect = CountVectorizer(ngram_range=(1,2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 1 1 1 1]]
{'goolge': 2, 'at': 0, 'work': 4, 'goolge at': 3, 'at work': 1}


In [41]:
text = ['I goolge at work','I work at google']
cvect = CountVectorizer(ngram_range=(1,2))
print(cvect.fit_transform(text).toarray())
print(cvect.vocabulary_)

[[1 0 1 0 1 1 1 0]
 [1 1 0 1 0 0 1 1]]
{'goolge': 4, 'at': 0, 'work': 6, 'goolge at': 5, 'at work': 2, 'google': 3, 'work at': 7, 'at google': 1}


- hyper parameter

In [42]:
cvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

### TfidfVectorizer

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = ['you know I want your love.','I like you.','what should I do']

In [44]:
cvevt = CountVectorizer()
print(cvect.fit_transform(corpus).toarray())
print(cvect.vocabulary_)

[[0 1 1 0 0 1 0 0 1 1 0 0 1 1 1 1]
 [0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0]
 [1 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0]]
{'you': 12, 'know': 1, 'want': 8, 'your': 14, 'love': 5, 'you know': 13, 'know want': 2, 'want your': 9, 'your love': 15, 'like': 3, 'like you': 4, 'what': 10, 'should': 6, 'do': 0, 'what should': 11, 'should do': 7}


In [45]:
tvect = TfidfVectorizer()
print(tvect.fit_transform(corpus).toarray())
print(tvect.vocabulary_)

[[0.         0.46735098 0.         0.46735098 0.         0.46735098
  0.         0.35543247 0.46735098]
 [0.         0.         0.79596054 0.         0.         0.
  0.         0.60534851 0.        ]
 [0.57735027 0.         0.         0.         0.57735027 0.
  0.57735027 0.         0.        ]]
{'you': 7, 'know': 1, 'want': 5, 'your': 8, 'love': 3, 'like': 2, 'what': 6, 'should': 4, 'do': 0}


In [47]:
tvect = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
print(tvect.fit_transform(corpus).toarray())
print(tvect.vocabulary_)

[[0.4472136 0.4472136 0.        0.4472136 0.4472136 0.4472136]
 [0.        0.        1.        0.        0.        0.       ]
 [0.        0.        0.        0.        0.        0.       ]]
{'know': 0, 'want': 4, 'love': 3, 'know want': 1, 'want love': 5, 'like': 2}


In [48]:
tvect.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 2),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': 'english',
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}