### 前処理
https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f

In [1]:
import numpy as np
import pandas as pd
import preprocessor as p
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [3]:
pd.options.display.float_format = '{:0.2f}'.format

In [4]:
df = pd.read_csv('corpusUSD_YPN_news.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,word
0,0,the corrective forces through dollar/yen were ...
1,1,daily market outlook on major. update time: 04...
2,2,usd/jpy had a very bearish last week. the pair...
3,3,although global risk headlines have been quite...
4,4,the usd/jpy was closed at 108.169 after placin...


In [5]:
del df['Unnamed: 0']

In [6]:
def remove_punctuation(text):
    no_punc = "".join([x for x in text if x not in string.punctuation])
    return no_punc

In [8]:
df['word'] = df['word'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,word
0,the corrective forces through dollaryen were h...
1,daily market outlook on major update time 04 n...
2,usdjpy had a very bearish last week the pair p...
3,although global risk headlines have been quite...
4,the usdjpy was closed at 108169 after placing ...


In [9]:
# instantiate tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
df['word'] = df['word'].apply(lambda x: tokenizer.tokenize(x.lower()))
df.head()

Unnamed: 0,word
0,"[the, corrective, forces, through, dollaryen, ..."
1,"[daily, market, outlook, on, major, update, ti..."
2,"[usdjpy, had, a, very, bearish, last, week, th..."
3,"[although, global, risk, headlines, have, been..."
4,"[the, usdjpy, was, closed, at, 108169, after, ..."


In [11]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [12]:
df['word'] = df['word'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,word
0,"[corrective, forces, dollaryen, held, touch, f..."
1,"[daily, market, outlook, major, update, time, ..."
2,"[usdjpy, bearish, last, week, pair, produced, ..."
3,"[although, global, risk, headlines, quite, wee..."
4,"[usdjpy, closed, 108169, placing, high, 108322..."


In [13]:
# instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(x) for x in text]
    return lem_text

In [15]:
df['word'] = df['word'].apply(lambda x: word_lemmatizer(x))
df.head()

Unnamed: 0,word
0,"[corrective, force, dollaryen, held, touch, fr..."
1,"[daily, market, outlook, major, update, time, ..."
2,"[usdjpy, bearish, last, week, pair, produced, ..."
3,"[although, global, risk, headline, quite, week..."
4,"[usdjpy, closed, 108169, placing, high, 108322..."


In [16]:
# instantiate stemmer
stemmer = PorterStemmer()

In [17]:
def word_stemmer(text):
    stem_txt = " ".join([stemmer.stem(x) for x in text])
    return stem_txt

In [18]:
df['word'] = df['word'].apply(lambda x: word_stemmer(x))
df.head()

Unnamed: 0,word
0,correct forc dollaryen held touch friday mix d...
1,daili market outlook major updat time 04 nov 2...
2,usdjpi bearish last week pair produc engulf be...
3,although global risk headlin quit weekend usdj...
4,usdjpi close 108169 place high 108322 low 1078...


In [19]:
corpus = df.word.as_matrix()
corpus = [p.clean(x).lower() for x in corpus]
corpus[:5]

  """Entry point for launching an IPython kernel.


['correct forc dollaryen held touch friday mix data point posit payrol disappoint ism',
 'daili market outlook major updat time nov gmt usdjpi dollar selloff last wednesday high',
 'usdjpi bearish last week pair produc engulf bearish weekli candl resist doubl top recent',
 'although global risk headlin quit weekend usdjpi fail extend bounc day ema amid initi',
 'usdjpi close place high low overal movement pair remain bullish']

In [20]:
# SentimentIntensityAnalyzer(pos, neg, neu)
word = []
pos = []
neg = []
neu = []
sid = SIA()
for x in corpus:
    ss = sid.polarity_scores(x)
    word.append(x)
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])

In [21]:
df = pd.DataFrame(data={'word':word,
                       'pos':pos,
                       'neg':neg,
                       'neu':neu})
df.head()

Unnamed: 0,word,pos,neg,neu
0,correct forc dollaryen held touch friday mix d...,0.0,0.18,0.82
1,daili market outlook major updat time nov gmt ...,0.0,0.0,1.0
2,usdjpi bearish last week pair produc engulf be...,0.12,0.0,0.88
3,although global risk headlin quit weekend usdj...,0.09,0.31,0.6
4,usdjpi close place high low overal movement pa...,0.0,0.19,0.81


### tf-idf
https://blog.amedama.jp/entry/tf-idf

In [22]:
# 単語の数をカウントする
count_vectorizer = CountVectorizer()
bow = count_vectorizer.fit_transform(corpus).toarray()
bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [23]:
print('--- BoW (Bag of Words) ---')
df1 = pd.DataFrame(bow,
                  columns=count_vectorizer.get_feature_names())
df1.head()

--- BoW (Bag of Words) ---


Unnamed: 0,accord,action,afternoon,ago,although,altitud,american,amid,announc,appetit,...,wed,wedg,wednesday,week,weekend,weekli,win,within,write,yield
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,0
3,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
df1.iloc[0][df1.iloc[0] == 1]

correct       1
data          1
disappoint    1
dollaryen     1
forc          1
friday        1
held          1
ism           1
mix           1
payrol        1
point         1
posit         1
touch         1
Name: 0, dtype: int64

In [28]:
# TF を計算してるところ (行方向の処理)
print('--- TF (Term Frequency) ---')
# 文書に含まれる単語の数をカウントする
number_of_words = np.sum(bow, axis=1, keepdims=True)
# 文書の中での単語の頻度を計算する
tf = bow / number_of_words
df2 = pd.DataFrame(tf,
                  columns=count_vectorizer.get_feature_names())
df2.head()

--- TF (Term Frequency) ---


Unnamed: 0,accord,action,afternoon,ago,although,altitud,american,amid,announc,appetit,...,wed,wedg,wednesday,week,weekend,weekli,win,within,write,yield
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.07,0.0,0.07,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.07,0.0,0.0,...,0.0,0.0,0.0,0.0,0.07,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
df2.iloc[0][df2.iloc[0] > 0.05]

correct      0.08
data         0.08
disappoint   0.08
dollaryen    0.08
forc         0.08
friday       0.08
held         0.08
ism          0.08
mix          0.08
payrol       0.08
point        0.08
posit        0.08
touch        0.08
Name: 0, dtype: float64

In [32]:
# IDF を計算してるところ (列方向の処理)
print('--- IDF (Inverse Document Frequency) ---')
# 文書の数をカウントする
number_of_docs = len(corpus)
# その単語が一つでも含まれる文書の数をカウントする
number_of_docs_contain_word = np.count_nonzero(bow, axis=0)
# 単語の珍しさを計算する
idf = np.log(number_of_docs / number_of_docs_contain_word)
df3 = pd.DataFrame([idf],
                columns=count_vectorizer.get_feature_names())
df3

--- IDF (Inverse Document Frequency) ---


Unnamed: 0,accord,action,afternoon,ago,although,altitud,american,amid,announc,appetit,...,wed,wedg,wednesday,week,weekend,weekli,win,within,write,yield
0,3.69,3.0,3.69,3.69,3.0,3.0,3.69,2.3,3.69,3.69,...,3.69,3.69,3.69,1.2,3.0,2.59,3.69,3.69,3.0,3.69


In [35]:
df3.iloc[0][df3.iloc[0] > 3.5]

accord      3.69
afternoon   3.69
ago         3.69
american    3.69
announc     3.69
            ... 
wedg        3.69
wednesday   3.69
win         3.69
within      3.69
yield       3.69
Name: 0, Length: 153, dtype: float64

In [36]:
# TF-IDF を計算してるところ
print('--- TF-IDF ---')
# TF と IDF をかける
tfidf = tf * idf
df4 = pd.DataFrame(tfidf,
                    columns=count_vectorizer.get_feature_names())
df4.head()

--- TF-IDF ---


Unnamed: 0,accord,action,afternoon,ago,although,altitud,american,amid,announc,appetit,...,wed,wedg,wednesday,week,weekend,weekli,win,within,write,yield
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.26,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.09,0.0,0.19,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.16,0.0,0.0,...,0.0,0.0,0.0,0.0,0.21,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df4.iloc[0][df4.iloc[0] > 0.2]

correct      0.23
data         0.23
disappoint   0.23
dollaryen    0.23
forc         0.23
held         0.23
ism          0.23
mix          0.23
payrol       0.23
touch        0.23
Name: 0, dtype: float64

In [41]:
max(df4.iloc[:])

'yield'

In [25]:
column = df3.columns

In [26]:
import time 

In [27]:
for x in column:
    print(x)

accord
action
afternoon
ago
although
altitud
american
amid
announc
appetit
asia
asian
asset
back
base
basi
bearish
bid
boost
bottom
bounc
breakout
built
bull
bullish
candl
carri
chart
close
comeback
comment
committe
consecut
continu
correct
current
daili
data
day
deal
decent
decis
despit
determin
direct
disappoint
dollar
dollaryen
domin
doubl
down
drop
ema
engulf
equiti
extend
fail
feder
fell
financi
finish
follow
fomc
forc
format
found
fresh
friday
fx
gain
global
gmt
greenback
group
head
headlin
held
high
holidaythin
hope
initi
intraday
investor
ism
larg
last
late
level
like
look
lost
low
lower
major
manag
market
mid10700
mix
modestli
monday
move
movement
multiday
narrowli
navig
near
new
news
next
nov
open
optim
outlook
overal
pair
pattern
payrol
pick
pip
place
point
polici
posit
post
potenti
press
price
print
produc
prompt
pull
quit
ralli
rate
rattl
reach
reaction
rebound
recent
recov
recoveri
remain
resist
retrac
retreat
revisit
rise
risk
riskier
robust
rose
saw
seen
selloff
session