In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
# Toy corpus: five short English documents used by the scikit-learn cells.
text = [
    "This is the bed bed.",
    "This is the second bed.",
    "And the third one.",
    "Is this the first document?",
    "How are",
]

# Learn the vocabulary from the corpus and build the sparse
# document-term count matrix in a single call.
vectorizer = CountVectorizer()
count = vectorizer.fit_transform(text)

In [2]:
# Show the repr of the sparse matrix returned by fit_transform
# (shape, dtype and number of stored elements).
count

<5x12 sparse matrix of type '<class 'numpy.int64'>'
	with 20 stored elements in Compressed Sparse Row format>

#### 查看所有文本的关键词

In [3]:
# List every term in the learned vocabulary, ordered by column index.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    # .tolist() turns the NumPy string array into plain Python strings so the
    # printed form stays a simple list of quoted words.
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

['and', 'are', 'bed', 'document', 'first', 'how', 'is', 'one', 'second', 'the', 'third', 'this']


#### 单词到编号的映射

In [5]:
# Print the term -> integer index mapping learned by the vectorizer; the
# indices line up with the column positions of the count matrix.
print(vectorizer.vocabulary_)

{'this': 11, 'is': 6, 'the': 9, 'bed': 2, 'second': 8, 'and': 0, 'third': 10, 'one': 7, 'first': 4, 'document': 3, 'how': 5, 'are': 1}


#### 词频矩阵

In [8]:
# Convert the sparse count matrix to a dense array and print it:
# one row per document, one column per vocabulary term.
print(count.toarray())

[[0 0 2 0 0 0 1 0 0 1 0 1]
 [0 0 1 0 0 0 1 0 1 1 0 1]
 [1 0 0 0 0 0 0 1 0 1 1 0]
 [0 0 0 1 1 0 1 0 0 1 0 1]
 [0 1 0 0 0 1 0 0 0 0 0 0]]


#### TF-IDF 代码示例

In [10]:
# Re-weight the raw term counts with TF-IDF: fit the transformer on the
# count matrix, then apply it to that same matrix.
transformer = TfidfTransformer()
transformer.fit(count)
tfidf_matrix = transformer.transform(count)

# Print the weights as a dense array (one row per document).
print(tfidf_matrix.toarray())

[[0.         0.         0.82578944 0.         0.         0.
  0.34273991 0.         0.         0.28832362 0.         0.34273991]
 [0.         0.         0.47662209 0.         0.         0.
  0.39563939 0.         0.59076079 0.33282432 0.         0.39563939]
 [0.54903633 0.         0.         0.         0.         0.
  0.         0.54903633 0.         0.30931749 0.54903633 0.        ]
 [0.         0.         0.         0.55776107 0.55776107 0.
  0.37353909 0.         0.         0.31423286 0.         0.37353909]
 [0.         0.70710678 0.         0.         0.         0.70710678
  0.         0.         0.         0.         0.         0.        ]]


#### Tokenizer 原理

`fit_on_texts` 只生成了一个词典（`word_index`），并统计了词频等信息（`word_counts`），并没有把文本转成需要的向量表示；要得到向量表示，还需要调用 `texts_to_matrix` 或 `texts_to_sequences`。

In [2]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
# Toy corpus for the Keras Tokenizer cells (rebinds `text`).
text = [
    "This is the bed bed.",
    "This is the second bed.",
    "And the third one.",
    "Is this the first bed?",
    "How are you",
]

# Fit a tokenizer with default settings on the corpus.
tok = Tokenizer()
tok.fit_on_texts(text)

# Display the learned word -> integer index mapping.
tok.word_index

{'the': 1,
 'bed': 2,
 'this': 3,
 'is': 4,
 'second': 5,
 'and': 6,
 'third': 7,
 'one': 8,
 'first': 9,
 'how': 10,
 'are': 11,
 'you': 12}

In [4]:
# Display how many times each word occurred across the fitted texts.
tok.word_counts

OrderedDict([('this', 3),
             ('is', 3),
             ('the', 4),
             ('bed', 4),
             ('second', 1),
             ('and', 1),
             ('third', 1),
             ('one', 1),
             ('first', 1),
             ('how', 1),
             ('are', 1),
             ('you', 1)])

In [5]:
# Encode each text as one row with one column per word index. With no mode
# given, the saved output holds 0/1 presence flags (the doubled "bed" in the
# first text still yields 1), and column 0 stays empty because word indices
# start at 1.
tok.texts_to_matrix(text)

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]])

In [7]:
# Value handed to the tokenizer as `num_words`.
max_features = 300

# Second tokenizer, lower-casing its input, fitted on the same corpus.
tokenizer = Tokenizer(
    num_words=max_features,
    lower=True,
)
tokenizer.fit_on_texts(text)

In [8]:
# Replace every text with the list of integer word indices of its words.
# NOTE(review): this rebinds `text` from raw strings to integer sequences, so
# the cell is not idempotent and earlier cells that expect strings (e.g.
# fit_on_texts / texts_to_matrix) cannot be re-run afterwards without first
# re-creating the corpus. Consider a separate name such as `sequences`.
text=tokenizer.texts_to_sequences(text)

In [9]:
# Display the integer sequences now bound to `text`.
text

[[3, 4, 1, 2, 2], [3, 4, 1, 5, 2], [6, 1, 7, 8], [4, 3, 1, 9, 2], [10, 11, 12]]

In [30]:
from keras.preprocessing.sequence import pad_sequences
# Pad every integer sequence to a common length of 100; the saved output
# shows the zeros being added in front of the word indices.
pad_sequences(text, maxlen=100)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
         3,  1,  4,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,
         3,  1,  5,  4],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0

In [31]:
# Display the word -> integer index mapping held by `tokenizer`.
# NOTE(review): the saved output for this cell contains 'document' and lacks
# 'you', which does not match the corpus the tokenizer is fitted on in this
# notebook; it looks like it came from a different kernel session. Re-run the
# notebook top to bottom to refresh it.
tokenizer.word_index

{'the': 1,
 'this': 2,
 'is': 3,
 'bed': 4,
 'second': 5,
 'and': 6,
 'third': 7,
 'one': 8,
 'first': 9,
 'document': 10,
 'how': 11,
 'are': 12}

In [12]:
from tqdm import tqdm
# Iterate over the (word, index) pairs of the fitted vocabulary behind a
# progress bar and print only the word; `i` is unpacked but not used here.
for word, i in tqdm(tokenizer.word_index.items()):
    print(word)

100%|███████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12041.06it/s]

the
bed
this
is
second
and
third
one
first
how
are
you





In [10]:
from tqdm import tqdm
# Iterate over the (word, index) pairs of the fitted vocabulary behind a
# progress bar and print only the index; `word` is unpacked but not used here.
for word, i in tqdm(tokenizer.word_index.items()):
    print(i)

100%|███████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 12015.19it/s]

1
2
3
4
5
6
7
8
9
10
11
12





In [11]:
# Display the word -> integer index mapping learned by `tokenizer`.
tokenizer.word_index

{'the': 1,
 'bed': 2,
 'this': 3,
 'is': 4,
 'second': 5,
 'and': 6,
 'third': 7,
 'one': 8,
 'first': 9,
 'how': 10,
 'are': 11,
 'you': 12}