# テキストのベクトル化

- Bag of Words (BoW)
- TF-IDF

In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import wordnet

# フィードデータの読み込み
feeds = pd.read_csv('data/output_en.csv')

# title と summary を結合して text 列を作成
feeds['text'] = feeds['title'].str.cat(feeds['summary'], sep='. ', na_rep='')

# 不要になった列を削除した処理用の DataFrame
df = feeds.drop(['title', 'summary'], axis=1)

# 確認
df

Unnamed: 0,url,text
0,http://feeds.bbci.co.uk/news/rss.xml,Conservative peer Michelle Mone to take leave ...
1,http://feeds.bbci.co.uk/news/rss.xml,Ambulance staff to strike on 21 December. Serv...
2,http://feeds.bbci.co.uk/news/rss.xml,Strep A schools may be given preventive antibi...
3,http://feeds.bbci.co.uk/news/rss.xml,Eddie Jones sacked by England after review int...
4,http://feeds.bbci.co.uk/news/rss.xml,Tattooists and beauty salons replace banks on ...
...,...,...
783,http://feeds.bbci.co.uk/news/science_and_envir...,Nasa's Orion capsule makes safe return to Eart...
784,http://feeds.bbci.co.uk/news/science_and_envir...,One of Central America's most active volcanoes...
785,http://feeds.bbci.co.uk/news/technology/rss.xml,December 2024 set as date for universal phone ...
786,http://feeds.bbci.co.uk/news/technology/rss.xml,Twitter's paid blue tick re-launches after pau...


### 英語テキストに対する前処理

以下をまとめて行う関数 preprocess() を定義
- トークン化（単語に分割）
- 小文字化
- ストップワードの除去
- ステミング
- 見出し語化

In [2]:
symbols_to_remove = r'["`,.' + r"'" + r']'
stop_words = nltk.corpus.stopwords.words('english')
stop_words += ["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '`', '•', '%']
stop_words += ['–', '—', '‘', '’', '“', '”', '…', '|', '#', '$', '&', "''", '(', ')']
stemmer = nltk.stem.porter.PorterStemmer()
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# 品詞の名称を変換
def wordnet_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    return None

def preprocess(text):
    tokens = []
    # 品詞のタグ付けをした各トークンについて
    for t in nltk.pos_tag(nltk.tokenize.word_tokenize(text.replace('-', ' '))):
        # 小文字化
        t0 = t[0].lower()
        # 不要な文字の削除
        t0 = re.sub(symbols_to_remove, '', t0)
        # 空文字列になったら次へ
        if t0 == '':
            continue
        # stop_words に含まれていないトークンのみを残す
        if t0 in stop_words:
            continue
        # カンマ区切りが入った数値からカンマを削除
        if t[1] == 'CD':
            t0 = t0.replace(',', '')
        # 見出し語化
        tag = wordnet_tag(t[1])
        if tag is None:
            t0 = lemmatizer.lemmatize(t0)
        else:
            t0 = lemmatizer.lemmatize(t0, tag)
        # ステミング
        t0 = stemmer.stem(t0)
        # リストに追加
        tokens.append(t0)
    # トークンのリストを返す
    return tokens

### テキストのベクトル化 (1)

- Bag of Words (BoW)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# 例として最初の2行のテキストを処理
text_list = [df['text'].iloc[0], df['text'].iloc[1]]

# CountVectorizer
# - tokenizer=preprocess: トークン化処理に上で定義した preprocess を使用することを指定
vectorizer = CountVectorizer(tokenizer=preprocess)

# ベクトル化
vector = vectorizer.fit_transform(text_list)

In [4]:
# 1行目
print(text_list[0])
print(vector[0])

Conservative peer Michelle Mone to take leave of absence from Lords. Baroness Mone is accused of benefitting from a company she recommended for a Covid contract.
  (0, 10)	1
  (0, 20)	1
  (0, 18)	1
  (0, 19)	2
  (0, 26)	1
  (0, 15)	1
  (0, 1)	1
  (0, 17)	1
  (0, 6)	1
  (0, 2)	1
  (0, 7)	1
  (0, 9)	1
  (0, 21)	1
  (0, 12)	1
  (0, 11)	1


In [5]:
# ベクトルの単語との対応
for i in vector[0].indices:
    # print()
    # - end=' ': 改行の代わりに空白を出力
    print(vectorizer.get_feature_names_out()[i], end=' ')

conserv peer michel mone take leav absenc lord baro accus benefit compani recommend covid contract 

In [6]:
# 単語と頻度
[[vectorizer.get_feature_names_out()[i], vector[0, i]] for i in vector[0].indices]

[['conserv', 1],
 ['peer', 1],
 ['michel', 1],
 ['mone', 2],
 ['take', 1],
 ['leav', 1],
 ['absenc', 1],
 ['lord', 1],
 ['baro', 1],
 ['accus', 1],
 ['benefit', 1],
 ['compani', 1],
 ['recommend', 1],
 ['covid', 1],
 ['contract', 1]]

In [7]:
# 2行目
print(text_list[1])
print(vector[1])

Ambulance staff to strike on 21 December. Services across England and Wales affected, but life-threatening calls will be responded to.
  (0, 5)	1
  (0, 24)	1
  (0, 25)	1
  (0, 0)	1
  (0, 13)	1
  (0, 23)	1
  (0, 3)	1
  (0, 14)	1
  (0, 28)	1
  (0, 4)	1
  (0, 16)	1
  (0, 27)	1
  (0, 8)	1
  (0, 22)	1


In [8]:
# ベクトルの単語との対応
for i in vector[1].indices:
    print(vectorizer.get_feature_names_out()[i], end=' ')

ambul staff strike 21 decemb servic across england wale affect life threaten call respond 

In [9]:
# 単語と頻度
[[vectorizer.get_feature_names_out()[i], vector[0, i]] for i in vector[0].indices]

[['conserv', 1],
 ['peer', 1],
 ['michel', 1],
 ['mone', 2],
 ['take', 1],
 ['leav', 1],
 ['absenc', 1],
 ['lord', 1],
 ['baro', 1],
 ['accus', 1],
 ['benefit', 1],
 ['compani', 1],
 ['recommend', 1],
 ['covid', 1],
 ['contract', 1]]

### テキストのベクトル化 (2)

- TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# 例として最初の2行のテキストを処理
text_list = [df['text'].iloc[0], df['text'].iloc[1]]

# TfidfVectorizer
# - tokenizer=preprocess: トークン化処理に上で定義した preprocess を使用することを指定
vectorizer = TfidfVectorizer(tokenizer=preprocess)

# ベクトル化
vector = vectorizer.fit_transform(text_list)

In [None]:
# 1行目
print(text_list[0])
print(vector[0])

In [None]:
# ベクトルの単語との対応
for i in vector[0].indices:
    print(vectorizer.get_feature_names_out()[i], end=' ')

In [None]:
# 単語とTF-IDF
[[vectorizer.get_feature_names_out()[i], vector[0, i]] for i in vector[0].indices]