# 英語テキストの前処理

### 自然言語処理用ライブラリ nltk で使用するデータのダウンロード

In [1]:
# Download the NLTK data packages used by the preprocessing steps below.
# Run this cell only once, except when you want to fetch updated packages.
# - `import nltk` is executed again later, at the point where it is needed.
import nltk
nltk.download('punkt')                       # tokenizer data
nltk.download('wordnet')                     # WordNet corpus
nltk.download('averaged_perceptron_tagger')  # part-of-speech tagger data
nltk.download('stopwords')                   # stop-word lists
nltk.download('omw-1.4')                     # Open Multilingual Wordnet data

[nltk_data] Downloading package punkt to /Users/kazuya/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /Users/kazuya/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kazuya/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kazuya/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /Users/kazuya/nltk_data...


True

### フィードデータの読み込み

- 1_feeeds で取得した output_en.csv を、コピーするなどして data フォルダに配置してください

In [2]:
import pandas as pd

# Load the feed data
# - output_en.csv obtained in 1_feeeds
feeds = pd.read_csv('data/output_en.csv')

# Check the first rows
feeds.head()

Unnamed: 0,url,title,summary
0,http://feeds.bbci.co.uk/news/rss.xml,Conservative peer Michelle Mone to take leave ...,Baroness Mone is accused of benefitting from a...
1,http://feeds.bbci.co.uk/news/rss.xml,Ambulance staff to strike on 21 December,"Services across England and Wales affected, bu..."
2,http://feeds.bbci.co.uk/news/rss.xml,Strep A schools may be given preventive antibi...,The drugs would be used to stop more cases of ...
3,http://feeds.bbci.co.uk/news/rss.xml,Eddie Jones sacked by England after review int...,"England sack Eddie Jones, leaving the team wit..."
4,http://feeds.bbci.co.uk/news/rss.xml,Tattooists and beauty salons replace banks on ...,More takeaways are also part of the changing f...


In [3]:
# Show the title and summary of the first row
print('Title:', feeds['title'].iloc[0])
print('Summary:', feeds['summary'].iloc[0])

Title: Conservative peer Michelle Mone to take leave of absence from Lords
Summary: Baroness Mone is accused of benefitting from a company she recommended for a Covid contract.


In [4]:
# Join title and summary into a single 'text' column
# Series.str.cat()
# - sep='. ': separator placed between the two strings
# - na_rep='': NaN is replaced by an empty string
#   (without it, the joined result becomes NaN)
feeds['text'] = feeds['title'].str.cat(feeds['summary'], sep='. ', na_rep='')

# Working DataFrame without the columns that are no longer needed
df = feeds.drop(columns=['title', 'summary'])

# Check the first rows
df.head()

Unnamed: 0,url,text
0,http://feeds.bbci.co.uk/news/rss.xml,Conservative peer Michelle Mone to take leave ...
1,http://feeds.bbci.co.uk/news/rss.xml,Ambulance staff to strike on 21 December. Serv...
2,http://feeds.bbci.co.uk/news/rss.xml,Strep A schools may be given preventive antibi...
3,http://feeds.bbci.co.uk/news/rss.xml,Eddie Jones sacked by England after review int...
4,http://feeds.bbci.co.uk/news/rss.xml,Tattooists and beauty salons replace banks on ...


### 英語テキストに対する前処理

- トークン化（単語に分割）
- 品詞のタグ付け（見出し語化に必要）
- 小文字化
- ストップワードの除去
- 見出し語化
- ステミング

In [5]:
# As an example, process only the text of the first row
text = df['text'].iloc[0]

# Check
text

'Conservative peer Michelle Mone to take leave of absence from Lords. Baroness Mone is accused of benefitting from a company she recommended for a Covid contract.'

In [6]:
import nltk

# Tokenization (split the text into word and punctuation tokens)
tokens = nltk.tokenize.word_tokenize(text)

# Check
tokens

['Conservative',
 'peer',
 'Michelle',
 'Mone',
 'to',
 'take',
 'leave',
 'of',
 'absence',
 'from',
 'Lords',
 '.',
 'Baroness',
 'Mone',
 'is',
 'accused',
 'of',
 'benefitting',
 'from',
 'a',
 'company',
 'she',
 'recommended',
 'for',
 'a',
 'Covid',
 'contract',
 '.']

In [7]:
# Part-of-speech tagging (required for lemmatization)
# - produces a list of (token, tag) pairs
tokens_tag = nltk.pos_tag(tokens)

# Check
tokens_tag

[('Conservative', 'JJ'),
 ('peer', 'NN'),
 ('Michelle', 'NNP'),
 ('Mone', 'NNP'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('leave', 'NN'),
 ('of', 'IN'),
 ('absence', 'NN'),
 ('from', 'IN'),
 ('Lords', 'NNP'),
 ('.', '.'),
 ('Baroness', 'NNP'),
 ('Mone', 'NNP'),
 ('is', 'VBZ'),
 ('accused', 'VBN'),
 ('of', 'IN'),
 ('benefitting', 'VBG'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('company', 'NN'),
 ('she', 'PRP'),
 ('recommended', 'VBD'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('Covid', 'NNP'),
 ('contract', 'NN'),
 ('.', '.')]

In [8]:
# Lowercasing
# - str.lower() is applied to the word only; the POS tag is kept unchanged
tokens_lower = [(word.lower(), pos) for word, pos in tokens_tag]

# Check
tokens_lower

[('conservative', 'JJ'),
 ('peer', 'NN'),
 ('michelle', 'NNP'),
 ('mone', 'NNP'),
 ('to', 'TO'),
 ('take', 'VB'),
 ('leave', 'NN'),
 ('of', 'IN'),
 ('absence', 'NN'),
 ('from', 'IN'),
 ('lords', 'NNP'),
 ('.', '.'),
 ('baroness', 'NNP'),
 ('mone', 'NNP'),
 ('is', 'VBZ'),
 ('accused', 'VBN'),
 ('of', 'IN'),
 ('benefitting', 'VBG'),
 ('from', 'IN'),
 ('a', 'DT'),
 ('company', 'NN'),
 ('she', 'PRP'),
 ('recommended', 'VBD'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('covid', 'NNP'),
 ('contract', 'NN'),
 ('.', '.')]

In [9]:
# Stop-word removal
# - start from NLTK's English stop-word list
stop_words = nltk.corpus.stopwords.words('english')

# Also treat these punctuation marks and symbols as stop words
stop_words.extend(["'", '"', ':', ';', '.', ',', '-', '!', '?', "'s", '`', '•', '%'])

# Keep only the (word, tag) pairs whose word is not a stop word
tokens_wo_stop_words = [pair for pair in tokens_lower if pair[0] not in stop_words]

# Check
tokens_wo_stop_words

[('conservative', 'JJ'),
 ('peer', 'NN'),
 ('michelle', 'NNP'),
 ('mone', 'NNP'),
 ('take', 'VB'),
 ('leave', 'NN'),
 ('absence', 'NN'),
 ('lords', 'NNP'),
 ('baroness', 'NNP'),
 ('mone', 'NNP'),
 ('accused', 'VBN'),
 ('benefitting', 'VBG'),
 ('company', 'NN'),
 ('recommended', 'VBD'),
 ('covid', 'NNP'),
 ('contract', 'NN')]

In [10]:
from nltk.corpus import wordnet

# Lemmatization
# - create NLTK's WordNet-based lemmatizer
lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()

# Convert a POS tag name to the WordNet POS constant
def wordnet_tag(tag):
    """Return the WordNet POS constant matching a POS tag string.

    The tag is matched by its first letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Args:
        tag: POS tag string, e.g. 'JJ', 'VBD', 'NNP', 'RB'.

    Returns:
        wordnet.ADJ, wordnet.VERB, wordnet.NOUN or wordnet.ADV, or None
        when the tag starts with none of the letters above.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return None

# Lemmatize every (word, tag) pair that survived stop-word removal
tokens_lemmatize = []
for word, pos in tokens_wo_stop_words:
    # Pass the WordNet POS when the tag maps to one; otherwise fall back to
    # the lemmatizer's default POS.
    tag = wordnet_tag(pos)
    if tag is None:
        lemma = lemmatizer.lemmatize(word)
    else:
        lemma = lemmatizer.lemmatize(word, tag)
    # Remove thousands separators from numbers (tag 'CD'), e.g. '1,000' -> '1000'.
    # The check must use the original POS tag: the lemma is a plain string, so
    # indexing it would compare a single character with 'CD' (never equal) and
    # would raise IndexError for one-character lemmas.
    if pos == 'CD':
        lemma = lemma.replace(',', '')
    tokens_lemmatize.append(lemma)

# Check
tokens_lemmatize

['conservative',
 'peer',
 'michelle',
 'mone',
 'take',
 'leave',
 'absence',
 'lord',
 'baroness',
 'mone',
 'accuse',
 'benefit',
 'company',
 'recommend',
 'covid',
 'contract']

In [11]:
# Stemming
# - Porter stemmer from NLTK
stemmer = nltk.stem.porter.PorterStemmer()

# Stem every lemmatized token
tokens_stem = [stemmer.stem(lemma) for lemma in tokens_lemmatize]

# Check
tokens_stem

['conserv',
 'peer',
 'michel',
 'mone',
 'take',
 'leav',
 'absenc',
 'lord',
 'baro',
 'mone',
 'accus',
 'benefit',
 'compani',
 'recommend',
 'covid',
 'contract']

### テキストのベクトル化

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Join the list of stemmed tokens back into one space-separated string
# - wrapped in a list because fit_transform is given a list of documents
#   (here a single document)
text_list = [' '.join(tokens_stem)]

# Initialize the vectorizer
vectorizer = CountVectorizer()

# Vectorize: learn the vocabulary and count the terms of each document
vector = vectorizer.fit_transform(text_list)

# Check
# - print(vector) lists (document index, feature index) and the count
# - get_feature_names_out() gives the term for each feature index
print(vector)
print(vectorizer.get_feature_names_out())

  (0, 5)	1
  (0, 12)	1
  (0, 10)	1
  (0, 11)	2
  (0, 14)	1
  (0, 8)	1
  (0, 0)	1
  (0, 9)	1
  (0, 2)	1
  (0, 1)	1
  (0, 3)	1
  (0, 4)	1
  (0, 13)	1
  (0, 7)	1
  (0, 6)	1
['absenc' 'accus' 'baro' 'benefit' 'compani' 'conserv' 'contract' 'covid'
 'leav' 'lord' 'michel' 'mone' 'peer' 'recommend' 'take']
