### 前処理
https://towardsdatascience.com/nlp-for-beginners-cleaning-preprocessing-text-data-ae8e306bef0f

In [1]:
import numpy as np
import pandas as pd
import re
import preprocessor as p
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

In [3]:
pd.options.display.float_format = '{:0.2f}'.format

In [4]:
df = pd.read_csv('trumptweep.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,date,tweet
0,132,2019-07-06,RT @SteveScalise: .@realDonaldTrump continues...
1,131,2019-07-07,".....Comcast (NBC/MSNBC) Trump haters, who do..."
2,130,2019-07-08,....thought of within the U.S. We will no lon...
3,129,2019-07-09,RT @RepMarkMeadows: This was amazing news fro...
4,128,2019-07-10,Thank you Robert Johnson! https://t.co/kE4W2m...


In [5]:
del df['Unnamed: 0']

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133 entries, 0 to 132
Data columns (total 2 columns):
date     133 non-null object
tweet    133 non-null object
dtypes: object(2)
memory usage: 2.2+ KB


In [7]:
def remove_punctuation(text):
    no_punc = "".join([x for x in text if x not in string.punctuation])
    return no_punc

In [8]:
df['tweet'] = df['tweet'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,date,tweet
0,2019-07-06,RT SteveScalise realDonaldTrump continues to ...
1,2019-07-07,Comcast NBCMSNBC Trump haters who do whatever...
2,2019-07-08,thought of within the US We will no longer de...
3,2019-07-09,RT RepMarkMeadows This was amazing news from ...
4,2019-07-10,Thank you Robert Johnson httpstcokE4W2m9OoM M...


In [9]:
# instantiate tokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
df['tweet'] = df['tweet'].apply(lambda x: tokenizer.tokenize(x.lower()))
df.head()

Unnamed: 0,date,tweet
0,2019-07-06,"[rt, stevescalise, realdonaldtrump, continues,..."
1,2019-07-07,"[comcast, nbcmsnbc, trump, haters, who, do, wh..."
2,2019-07-08,"[thought, of, within, the, us, we, will, no, l..."
3,2019-07-09,"[rt, repmarkmeadows, this, was, amazing, news,..."
4,2019-07-10,"[thank, you, robert, johnson, httpstcoke4w2m9o..."


In [11]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words('english')]
    return words

In [12]:
df['tweet'] = df['tweet'].apply(lambda x: remove_stopwords(x))
df.head()

Unnamed: 0,date,tweet
0,2019-07-06,"[rt, stevescalise, realdonaldtrump, continues,..."
1,2019-07-07,"[comcast, nbcmsnbc, trump, haters, whatever, b..."
2,2019-07-08,"[thought, within, us, longer, deal, good, news..."
3,2019-07-09,"[rt, repmarkmeadows, amazing, news, weekend, 2..."
4,2019-07-10,"[thank, robert, johnson, httpstcoke4w2m9oom, a..."


In [13]:
# instantiate lemmatizer
lemmatizer = WordNetLemmatizer()

In [14]:
def word_lemmatizer(text):
    lem_text = [lemmatizer.lemmatize(x) for x in text]
    return lem_text

In [15]:
df['tweet'] = df['tweet'].apply(lambda x: word_lemmatizer(x))
df.head()

Unnamed: 0,date,tweet
0,2019-07-06,"[rt, stevescalise, realdonaldtrump, continues,..."
1,2019-07-07,"[comcast, nbcmsnbc, trump, hater, whatever, br..."
2,2019-07-08,"[thought, within, u, longer, deal, good, news,..."
3,2019-07-09,"[rt, repmarkmeadows, amazing, news, weekend, 2..."
4,2019-07-10,"[thank, robert, johnson, httpstcoke4w2m9oom, a..."


In [16]:
# instantiate stemmer
stemmer = PorterStemmer()

In [17]:
def word_stemmer(text):
    stem_txt = " ".join([stemmer.stem(x) for x in text])
    return stem_txt

In [18]:
df['tweet'] = df['tweet'].apply(lambda x: word_stemmer(x))
df.head()

Unnamed: 0,date,tweet
0,2019-07-06,rt stevescalis realdonaldtrump continu defi ex...
1,2019-07-07,comcast nbcmsnbc trump hater whatev brian amp ...
2,2019-07-08,thought within u longer deal good news wonder ...
3,2019-07-09,rt repmarkmeadow amaz news weekend 224000 job ...
4,2019-07-10,thank robert johnson httpstcoke4w2m9oom admini...


In [19]:
# SentimentIntensityAnalyzer(pos, neg, neu)
pos = []
neg = []
neu = []
sid = SIA()
for x in df.tweet:
    ss = sid.polarity_scores(x)
    pos.append(ss['pos'])
    neg.append(ss['neg'])
    neu.append(ss['neu'])

### tf-idf
https://blog.amedama.jp/entry/tf-idf

In [20]:
corpus = df.tweet.as_matrix()
corpus = [p.clean(x).lower() for x in corpus if x not in 'rt']
corpus[0]

  """Entry point for launching an IPython kernel.


'rt stevescalis realdonaldtrump continu defi eect amp deliv result american worker democrat need elain eve rt stevescalis joe biden forget obama secretari state hillari clinton offer reset russia undermin nat rt stevescalis joe biden claim russian elect interfer happen watch obama watch serious joe joe biden reclam project thing salvag china countri rip u httpstco20xgevtfd6 rt wvgovernor today lost wv superstar lost close friend famili go back begin cline empir pi wonder man great republican httpstcozgcu3ebs4j great coach great guy httpstcotwab8u9ki jerri get presidenti medal freedom well deserv httpstco45yxturlqj democrat must chang loophol amp asylum law probabl want open border mean ma httpstco3htlj8ducf rt ericbol photo via b tessler httpstcozrsolcjqlp rt cbpsouthtexa cbp offic seiz m methamphetamin cocain laredo port entri read httpstcokj rt ambjohnbolton u proud support interim presid guaido amp democrat elect nation assembl courag rt scavino45 httpstcofj4nd2mqqw countri envi wor

In [21]:
# 単語の数をカウントする
count_vectorizer = CountVectorizer()
bow = count_vectorizer.fit_transform(corpus).toarray()
bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
print('--- BoW (Bag of Words) ---')
df1 = pd.DataFrame(bow,
                  columns=count_vectorizer.get_feature_names())

df1 = df1.iloc[:,:-7]
df1.head()

--- BoW (Bag of Words) ---


Unnamed: 0,aap,abaco,abandon,abc,abcpolit,abcwashington,abcworldnew,abdo,abe,abedin,...,zaino,zelenski,zero,zimmerman,zogbi,zon,zone,ztpetrizzo,zucker,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [23]:
# TF を計算してるところ (行方向の処理)
print('--- TF (Term Frequency) ---')
# 文書に含まれる単語の数をカウントする
number_of_words = np.sum(bow, axis=1, keepdims=True)
# 文書の中での単語の頻度を計算する
tf = bow / number_of_words
df2 = pd.DataFrame(tf,
                  columns=count_vectorizer.get_feature_names())

df2 = df2.iloc[:,:-7]
df2.head()

--- TF (Term Frequency) ---


Unnamed: 0,aap,abaco,abandon,abc,abcpolit,abcwashington,abcworldnew,abdo,abe,abedin,...,zaino,zelenski,zero,zimmerman,zogbi,zon,zone,ztpetrizzo,zucker,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# IDF を計算してるところ (列方向の処理)
print('--- IDF (Inverse Document Frequency) ---')
# 文書の数をカウントする
number_of_docs = len(corpus)
# その単語が一つでも含まれる文書の数をカウントする
number_of_docs_contain_word = np.count_nonzero(bow, axis=0)
# 単語の珍しさを計算する
idf = np.log(number_of_docs / number_of_docs_contain_word)
df3 = pd.DataFrame([idf],
                columns=count_vectorizer.get_feature_names())
df3 = df3.iloc[:,:-7]
df3.head()

--- IDF (Inverse Document Frequency) ---


Unnamed: 0,aap,abaco,abandon,abc,abcpolit,abcwashington,abcworldnew,abdo,abe,abedin,...,zaino,zelenski,zero,zimmerman,zogbi,zon,zone,ztpetrizzo,zucker,zuckerberg
0,3.79,4.89,4.89,3.28,4.2,4.89,4.89,4.89,4.89,4.89,...,4.89,3.1,2.49,4.2,4.89,4.89,3.79,4.89,4.89,4.89


In [25]:
# TF-IDF を計算してるところ
print('--- TF-IDF ---')
# TF と IDF をかける
tfidf = tf * idf
df4 = pd.DataFrame(tfidf,
                    columns=count_vectorizer.get_feature_names())
df4 = df4.iloc[:,:-7]
df4.head()

--- TF-IDF ---


Unnamed: 0,aap,abaco,abandon,abc,abcpolit,abcwashington,abcworldnew,abdo,abe,abedin,...,zaino,zelenski,zero,zimmerman,zogbi,zon,zone,ztpetrizzo,zucker,zuckerberg
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
FX_word = pd.read_csv('bag_of_words.csv')
FX_word['word'] = FX_word['Unnamed: 0']
del FX_word['Unnamed: 0']
new = ['japan', 'usd', 'jpy', 'abe', 'doller', 'yen', 'fx']
fx_word = list(FX_word.word) + new
fx_word

['last',
 'move',
 'pair',
 'session',
 'trade',
 'usdjpi',
 'week',
 'japan',
 'usd',
 'jpy',
 'abe',
 'doller',
 'yen',
 'fx']

In [26]:
column = list(df3.columns)

In [27]:
for x in column:
    for y in fx_word:
        if x == y:
            df4[str(y)][df4[str(y)] != 0] = 0.7
            print(y)

abe
japan
last
trade
week


In [28]:
df4.index = df['date']
df4['positive_pct'] = pos
df4['negative_pct'] = neg
df4['neutral_pct'] = neu
df4.head()

Unnamed: 0_level_0,aap,abaco,abandon,abc,abcpolit,abcwashington,abcworldnew,abdo,abe,abedin,...,zimmerman,zogbi,zon,zone,ztpetrizzo,zucker,zuckerberg,positive_pct,negative_pct,neutral_pct
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-07-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.04,0.72
2019-07-07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13,0.11,0.76
2019-07-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.18,0.14,0.68
2019-07-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.07,0.74
2019-07-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12,0.09,0.8


In [29]:
max(df4['positive_pct']), max(df4['negative_pct'])

(0.332, 0.235)

In [36]:
len(df4['positive_pct'][df4['positive_pct'] > 0.1]), len(df4['negative_pct'][df4['negative_pct'] > 0.1])

(114, 50)