### 前処理
https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f

In [1]:
import numpy as np
import pandas as pd
import preprocessor as p
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [3]:
# Render floats with two decimal places whenever pandas displays them.
pd.set_option("display.float_format", '{:0.2f}'.format)

In [4]:
# `coupas` is rebound again later in the notebook before it is first read,
# so this assignment only reserves the name.
coupas = []
path = "news.txt"
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding (which differs between operating systems).
with open(path, encoding="utf-8") as f:
    s = f.read()
# Bare expression: display the raw text as the cell output.
s

"the corrective forces through dollar/yen were held up a touch on friday with mixed data points positive payrolls but disappointing ism daily market outlook on major. update time: 04 nov 2019 09:30gmt. usd/jpy - 108.37. dollar's selloff from last wednesday's high at usd/jpy had a very bearish last week. the pair produced an engulfing bearish weekly candle at the resistance of a double top. the recent although global risk headlines have been quite a few over the weekend, usd/jpy fails to extend its bounce off 100-day ema amid the initial the usd/jpy was closed at 108.169 after placing a high of 108.322 and a low of 107.884. overall movement for the pair remained bullish that intra-day market moving news and views update time: 04 nov 2019 04:30gmt usd/jpy - 108.23.. dollar moves narrowly in holiday-thinned the usd/jpy pair lost 50 pips last week and started to retrace its drop on monday. as of writing, the pair was trading at fresh session highs at following an upside breakout of a large

In [5]:
def remove_punctuation(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed.

    Args:
        text: Iterable of characters, normally a ``str``.

    Returns:
        str: The remaining characters joined back together in their
        original order. Whitespace is left untouched.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

In [6]:
# Strip punctuation from the loaded text, rebinding `s` to the cleaned string.
# The bare `s` below displays the result as the cell output.
s = remove_punctuation(s)
s

'the corrective forces through dollaryen were held up a touch on friday with mixed data points positive payrolls but disappointing ism daily market outlook on major update time 04 nov 2019 0930gmt usdjpy  10837 dollars selloff from last wednesdays high at usdjpy had a very bearish last week the pair produced an engulfing bearish weekly candle at the resistance of a double top the recent although global risk headlines have been quite a few over the weekend usdjpy fails to extend its bounce off 100day ema amid the initial the usdjpy was closed at 108169 after placing a high of 108322 and a low of 107884 overall movement for the pair remained bullish that intraday market moving news and views update time 04 nov 2019 0430gmt usdjpy  10823 dollar moves narrowly in holidaythinned the usdjpy pair lost 50 pips last week and started to retrace its drop on monday as of writing the pair was trading at fresh session highs at following an upside breakout of a large head and shoulders bottom pattern

In [7]:
# Instantiate a regex tokenizer built from the pattern `\w+`
# (runs of word characters).
tokenizer = RegexpTokenizer(r'\w+')

In [8]:
# Lower-case the text and split it into tokens.
# NOTE: `s` is rebound from a string to a list of tokens here, so re-running
# this cell without re-running the cells above fails on `s.lower()`.
s = tokenizer.tokenize(s.lower())
s

['the',
 'corrective',
 'forces',
 'through',
 'dollaryen',
 'were',
 'held',
 'up',
 'a',
 'touch',
 'on',
 'friday',
 'with',
 'mixed',
 'data',
 'points',
 'positive',
 'payrolls',
 'but',
 'disappointing',
 'ism',
 'daily',
 'market',
 'outlook',
 'on',
 'major',
 'update',
 'time',
 '04',
 'nov',
 '2019',
 '0930gmt',
 'usdjpy',
 '10837',
 'dollars',
 'selloff',
 'from',
 'last',
 'wednesdays',
 'high',
 'at',
 'usdjpy',
 'had',
 'a',
 'very',
 'bearish',
 'last',
 'week',
 'the',
 'pair',
 'produced',
 'an',
 'engulfing',
 'bearish',
 'weekly',
 'candle',
 'at',
 'the',
 'resistance',
 'of',
 'a',
 'double',
 'top',
 'the',
 'recent',
 'although',
 'global',
 'risk',
 'headlines',
 'have',
 'been',
 'quite',
 'a',
 'few',
 'over',
 'the',
 'weekend',
 'usdjpy',
 'fails',
 'to',
 'extend',
 'its',
 'bounce',
 'off',
 '100day',
 'ema',
 'amid',
 'the',
 'initial',
 'the',
 'usdjpy',
 'was',
 'closed',
 'at',
 '108169',
 'after',
 'placing',
 'a',
 'high',
 'of',
 '108322',
 'and',
 

In [9]:
def remove_stopwords(text, stop_words=None):
    """Drop stop words from a sequence of tokens.

    Args:
        text: Iterable of token strings.
        stop_words: Optional collection of words to drop. When omitted,
            NLTK's English stop-word list (``stopwords.words('english')``)
            is used, which matches the previous behaviour.

    Returns:
        list: The tokens of ``text`` that are not stop words, in their
        original order (duplicates are preserved).
    """
    if stop_words is None:
        stop_words = stopwords.words('english')
    # Build the lookup once instead of once per token; a set also makes each
    # membership test constant-time rather than a scan of the whole list.
    blocked = frozenset(stop_words)
    return [w for w in text if w not in blocked]

In [10]:
# Filter stop words out of the token list, rebinding `s` to the shorter list.
s = remove_stopwords(s)
s

['corrective',
 'forces',
 'dollaryen',
 'held',
 'touch',
 'friday',
 'mixed',
 'data',
 'points',
 'positive',
 'payrolls',
 'disappointing',
 'ism',
 'daily',
 'market',
 'outlook',
 'major',
 'update',
 'time',
 '04',
 'nov',
 '2019',
 '0930gmt',
 'usdjpy',
 '10837',
 'dollars',
 'selloff',
 'last',
 'wednesdays',
 'high',
 'usdjpy',
 'bearish',
 'last',
 'week',
 'pair',
 'produced',
 'engulfing',
 'bearish',
 'weekly',
 'candle',
 'resistance',
 'double',
 'top',
 'recent',
 'although',
 'global',
 'risk',
 'headlines',
 'quite',
 'weekend',
 'usdjpy',
 'fails',
 'extend',
 'bounce',
 '100day',
 'ema',
 'amid',
 'initial',
 'usdjpy',
 'closed',
 '108169',
 'placing',
 'high',
 '108322',
 'low',
 '107884',
 'overall',
 'movement',
 'pair',
 'remained',
 'bullish',
 'intraday',
 'market',
 'moving',
 'news',
 'views',
 'update',
 'time',
 '04',
 'nov',
 '2019',
 '0430gmt',
 'usdjpy',
 '10823',
 'dollar',
 'moves',
 'narrowly',
 'holidaythinned',
 'usdjpy',
 'pair',
 'lost',
 '50',


In [11]:
# Instantiate the WordNet lemmatizer used by `word_lemmatizer`.
lemmatizer = WordNetLemmatizer()

In [12]:
def word_lemmatizer(text):
    """Lemmatize every token in ``text`` with the module-level ``lemmatizer``.

    Args:
        text: Iterable of token strings.

    Returns:
        list: One lemmatized token per input token, in the same order.
    """
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [13]:
# Lemmatize each token, rebinding `s` to the list of lemmatized tokens.
s = word_lemmatizer(s)
s

['corrective',
 'force',
 'dollaryen',
 'held',
 'touch',
 'friday',
 'mixed',
 'data',
 'point',
 'positive',
 'payroll',
 'disappointing',
 'ism',
 'daily',
 'market',
 'outlook',
 'major',
 'update',
 'time',
 '04',
 'nov',
 '2019',
 '0930gmt',
 'usdjpy',
 '10837',
 'dollar',
 'selloff',
 'last',
 'wednesday',
 'high',
 'usdjpy',
 'bearish',
 'last',
 'week',
 'pair',
 'produced',
 'engulfing',
 'bearish',
 'weekly',
 'candle',
 'resistance',
 'double',
 'top',
 'recent',
 'although',
 'global',
 'risk',
 'headline',
 'quite',
 'weekend',
 'usdjpy',
 'fails',
 'extend',
 'bounce',
 '100day',
 'ema',
 'amid',
 'initial',
 'usdjpy',
 'closed',
 '108169',
 'placing',
 'high',
 '108322',
 'low',
 '107884',
 'overall',
 'movement',
 'pair',
 'remained',
 'bullish',
 'intraday',
 'market',
 'moving',
 'news',
 'view',
 'update',
 'time',
 '04',
 'nov',
 '2019',
 '0430gmt',
 'usdjpy',
 '10823',
 'dollar',
 'move',
 'narrowly',
 'holidaythinned',
 'usdjpy',
 'pair',
 'lost',
 '50',
 'pip',


In [14]:
# Instantiate the Porter stemmer used by `word_stemmer`.
stemmer = PorterStemmer()

In [15]:
def word_stemmer(text):
    """Stem every token in ``text`` and join the stems into one string.

    Args:
        text: Iterable of token strings.

    Returns:
        str: The stems produced by the module-level ``stemmer``, in input
        order, separated by single spaces.
    """
    stems = (stemmer.stem(token) for token in text)
    return " ".join(stems)

In [16]:
# Stem the tokens and join them.
# NOTE: `s` is rebound from a list of tokens back to one space-separated string.
s = word_stemmer(s)
s

'correct forc dollaryen held touch friday mix data point posit payrol disappoint ism daili market outlook major updat time 04 nov 2019 0930gmt usdjpi 10837 dollar selloff last wednesday high usdjpi bearish last week pair produc engulf bearish weekli candl resist doubl top recent although global risk headlin quit weekend usdjpi fail extend bounc 100day ema amid initi usdjpi close 108169 place high 108322 low 107884 overal movement pair remain bullish intraday market move news view updat time 04 nov 2019 0430gmt usdjpi 10823 dollar move narrowli holidaythin usdjpi pair lost 50 pip last week start retrac drop monday write pair trade fresh session high follow upsid breakout larg head shoulder bottom pattern sever week ago usdjpi pull back test support usdjpi pair fell last week finish 10816 recov modestli friday bottom 10788 found support base last week price action close 108191 direct usdjpi week like determin trader reaction support usdjpi look new direct tumultu week saw up down next ma

In [17]:
# Wrap the processed text in a one-element list.
coupas = [s]

### Bag of Words
https://blog.amedama.jp/entry/tf-idf

In [18]:
# Count how many times each word occurs.
# fit_transform() is run on the `coupas` list and the result is converted
# to an array with toarray(); the bare `bow` displays it.
count_vectorizer = CountVectorizer()
bow = count_vectorizer.fit_transform(coupas).toarray()
bow

array([[ 2,  2,  1,  1,  1,  3,  1,  2,  1,  1,  1,  1,  1,  1,  2,  1,
         1,  1,  3,  2,  2,  1,  4,  1,  1,  1,  3,  1,  1,  1,  3,  1,
         4,  1,  1,  3,  3,  1,  1,  3,  1,  1,  2,  1,  1,  1,  1,  3,
         1,  4,  1,  1,  5,  1,  1,  1,  3,  2,  1,  1,  2,  3,  3,  1,
         1,  1,  1,  2,  2,  4,  5,  1,  1,  1,  4,  2,  8,  3,  1,  1,
         3,  4,  1,  1,  1,  9,  1,  2,  1,  1,  1,  2,  3,  4,  3,  1,
         1,  6,  2,  4,  2,  2,  1,  1,  1,  6,  2,  1,  2,  1,  1,  1,
         3,  1,  1,  8,  1,  2,  3,  1,  1,  3,  5,  1,  9,  3,  4,  2,
         1,  1,  2,  1,  5,  1,  2,  4,  2,  1,  4,  1, 10,  2,  2,  1,
         1,  2,  2,  1,  2,  3,  3,  1,  2,  2,  2,  1,  2,  2,  1,  2,
         1,  3,  1, 12,  2,  2,  3,  2,  1,  4,  2,  1,  1,  1,  1,  1,
         2,  1,  1,  4,  2,  2,  1,  1,  1,  2,  1,  7,  8,  4,  4,  1,
         3,  2,  1,  1,  3,  1,  1,  1,  4,  2,  2,  2,  9,  2,  1,  1,
         1,  1,  1,  1,  5,  2,  1,  1,  1,  5,  1,  2,  2,  2, 

In [19]:
print('--- BoW (Bag of Words) ---')
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new accessor and fall back to the old one
# so the cell runs on both recent and legacy installs.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()
# One column per vocabulary term, one row per entry in the fitted corpus.
df1 = pd.DataFrame(bow, columns=feature_names)
df1.head()

--- BoW (Bag of Words) ---


Unnamed: 0,015,04,0430gmt,05,08,0930gmt,0953,100day,10788,107884,...,weekend,weekli,weeksth,win,within,would,write,yen,yieldsusdjpi,zone
0,2,2,1,1,1,3,1,2,1,1,...,2,6,1,1,1,1,4,4,1,3


In [20]:
# From row 0 of the count table, keep only the terms that occur at least 10 times.
topWord = df1.iloc[0].loc[lambda counts: counts >= 10]
# Present the selection as a single-column table labelled "counts".
topWordf = pd.DataFrame({"counts": topWord})
topWordf

Unnamed: 0,counts
dollar,10
extend,12
last,17
move,10
pair,41
session,12
trade,20
usdjpi,63
week,16


In [21]:
# Write the frequent-term table to "bag_of_words.csv" (path is relative).
topWordf.to_csv("bag_of_words.csv")