### P053 文本数据 - CountVectorizer向量化

In [1]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Toy corpus: four short sentences, each treated as one document.
documents = [
    "python is a programming language",
    "python is popular",
    "programming in python",
    "object-oriented programming in python",
]

In [4]:
# Count-based vectorizer with all default settings (no stop-word filtering,
# single words only).
vectorizer = CountVectorizer()

In [5]:
# Learn the vocabulary from `documents` and encode them in one step.
# The result is a SciPy sparse matrix (see the repr below), not a dense array.
vectorizer.fit_transform(documents)

<4x8 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [6]:
# Build a readable document-term table: one row per document, one column per
# vocabulary term, each cell holding that term's count in the document.
# `.toarray()` densifies the sparse result so pandas can display it.
#
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `get_feature_names_out()` is its replacement (requires
# scikit-learn >= 1.0).
df = pd.DataFrame(
    vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0,1,1,0,0,0,1,1
1,0,1,0,0,0,1,0,1
2,1,0,0,0,0,0,1,1
3,1,0,0,1,1,0,1,1


### P054 文本数据 - 计数向量化并配置停用词

In [7]:
# Same count vectorizer, but filtering out scikit-learn's built-in English
# stop-word list (words such as "is" and "in" disappear from the vocabulary).
vectorizer = CountVectorizer(stop_words='english')

In [8]:
# Document-term count table for the stop-word-filtered vectorizer: one row per
# document, one column per remaining vocabulary term.
#
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `get_feature_names_out()` is its replacement (requires
# scikit-learn >= 1.0).
df = pd.DataFrame(
    vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,1,0,0,0,1,1
1,0,0,0,1,0,1
2,0,0,0,0,1,1
3,0,1,1,0,1,1


### P055 文本数据 - 计数向量化并配置n-gram

In [10]:
# ngram_range=(1, 2) extracts both single words and two-word sequences,
# with English stop words removed. As the output below shows, bigrams are
# formed from the tokens that remain after stop-word removal
# (e.g. "programming in python" yields "programming python").
vectorizer = CountVectorizer(stop_words='english', ngram_range=(1,2))

In [11]:
# Document-term count table for the unigram + bigram vectorizer: one row per
# document, one column per extracted n-gram.
#
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `get_feature_names_out()` is its replacement (requires
# scikit-learn >= 1.0).
df = pd.DataFrame(
    vectorizer.fit_transform(documents).toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,language,object,object oriented,oriented,oriented programming,popular,programming,programming language,programming python,python,python popular,python programming
0,1,0,0,0,0,0,1,1,0,1,0,1
1,0,0,0,0,0,1,0,0,0,1,1,0
2,0,0,0,0,0,0,1,0,1,1,0,0
3,0,1,1,1,1,0,1,0,1,1,0,0


### P056 文本数据 - TF-IDF实现文本向量化

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short sentences, each treated as one document.
documents = [
    "python is a programming language",
    "python is popular",
    "programming in python",
    "object-oriented programming in python",
]

In [3]:
# TF-IDF vectorizer with all default settings.
tfidf_vectorizer = TfidfVectorizer()

In [4]:
# TF-IDF table: one row per document, one column per vocabulary term, each
# cell holding the term's TF-IDF weight in that document.
#
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `get_feature_names_out()` is its replacement (requires
# scikit-learn >= 1.0).
df = pd.DataFrame(
    data=tfidf_vectorizer.fit_transform(documents).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)


In [5]:
# Display the TF-IDF table built in the previous cell.
df

Unnamed: 0,in,is,language,object,oriented,popular,programming,python
0,0.0,0.519714,0.659191,0.0,0.0,0.0,0.420753,0.343993
1,0.0,0.572892,0.0,0.0,0.0,0.726641,0.0,0.379192
2,0.691131,0.0,0.0,0.0,0.0,0.0,0.55953,0.457453
3,0.433919,0.0,0.0,0.550372,0.550372,0.0,0.351295,0.287207


### P057 文本数据 - TF-IDF向量化增加停用词

In [6]:
# Supply a custom stop-word list instead of a built-in one: only the words
# "is" and "in" are excluded from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words=["is", "in"])

In [7]:
# TF-IDF table for the vectorizer with custom stop words: one row per
# document, one column per remaining vocabulary term.
#
# NOTE: `get_feature_names()` was deprecated in scikit-learn 1.0 and removed
# in 1.2; `get_feature_names_out()` is its replacement (requires
# scikit-learn >= 1.0).
df = pd.DataFrame(
    data=tfidf_vectorizer.fit_transform(documents).toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [8]:
# Display the TF-IDF table built in the previous cell.
df

Unnamed: 0,language,object,oriented,popular,programming,python
0,0.771579,0.0,0.0,0.0,0.492489,0.402642
1,0.0,0.0,0.0,0.886548,0.0,0.462637
2,0.0,0.0,0.0,0.0,0.774191,0.632952
3,0.0,0.610878,0.610878,0.0,0.389916,0.318782
