In [1]:
import numpy as np
import pandas as pd
import json
import ndjson
import os
import pdb

In [2]:
os.getcwd()

'C:\\Users\\PCCR\\SB\\Capstone'

In [4]:
# set column width because of the max characters of 140 allowed in a tweet
pd.options.display.max_colwidth=150

In [5]:
# read the entire file into a python array
with open('realdonaldtrump.ndjson', encoding='utf8') as f:
    data = ndjson.load(f)


In [7]:
#place file into a dataframe for further exploratoritive use
df = pd.DataFrame(data)

In [10]:
# creating bool series True for NaN values  
bool_series = pd.isnull(df['retweet_count'])  
    
# filtering data
df[bool_series]  

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,retweeted,retweeted_status,scopes,source,text,truncated,user,withheld_copyright,withheld_in_countries,withheld_scope



# Clean the data by:

1. Removing all irrelevant characters such as any non alphanumeric characters
2. Tokenize the text by separating it into individual words
3. Remove words that are not relevant, such as “@” twitter mentions or urls
4. Convert all characters to lowercase, in order to treat words such as “hello”, “Hello”, and “HELLO” the same
5. Considering combining misspelled or alternately spelled words to a single representation (e.g. “cool”/”kewl”/”cooool”)
6. Considering lemmatization (reduce words such as “am”, “are”, and “is” to a common form such as “be”)

In [12]:
df_new = pd.DataFrame(df, columns = ('retweet_count', 'text'))

In [13]:
df_new.iloc[37329]


retweet_count                                                                                                                                          26771
text             ....People do not yet realize how much of the Wall, including really effective renovation, has already been built.… https://t.co/F3FHCcgD7U
Name: 37329, dtype: object

In [14]:
df_new.iloc[37329].retweet_count

26771

In [15]:
text_of_tweets = df_new['text']
text_of_tweets[20340 : 20400]

20340              "@JordanWells119: I forgot how much I love #TheApprentice and Trump, so glad I'm watching this season! All hail king @realDonaldTrump"
20341                                              "@caejh: Got out of class and immediately turned Donald Trump on as loud as I could. It's the weekend"
20342                                                                         "@uf9606084212012: @realDonaldTrump please save us run #savior got my vote"
20343                                                                                         "@Jackshallis: @realDonaldTrump Donald Trump for president"
20344                                                                              "@ZyeZHE: @realDonaldTrump @MissUniverse Top 6 http://t.co/7Z5o8eCGa8"
20345        "@BJKizer74:DonaldTrump Why do we keep putting the same people on the Republican ticket for POTUS? Romney? Bush? Cruz? Paul? #YourTimeIsNow"
20346                                  "@Gilmore747: Miss Universe Top Conte

In [16]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text(txt):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    txt = txt.lower()
    txt = re.sub('\[.*?@\#]', ' ', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), '', txt)
    txt = re.sub('\w*\d\w*', ' ', txt)
    txt = re.sub('\n', ' ', txt)
    return txt

round1 = lambda x: clean_text(x)

In [18]:
# The updated text
data_clean_first = pd.DataFrame(text_of_tweets.apply(round1))
data_clean_first.text[20340:20350]
#data_clean_first.head()



20340      i forgot how much i love theapprentice and trump so glad im watching this season all hail king realdonaldtrump
20341                    caejh got out of class and immediately turned donald trump on as loud as i could its the weekend
20342                                                               realdonaldtrump please save us run savior got my vote
20343                                                              jackshallis realdonaldtrump donald trump for president
20344                                                                         zyezhe realdonaldtrump missuniverse top    
20345       why do we keep putting the same people on the republican ticket for potus romney bush cruz paul yourtimeisnow
20346                                         miss universe top contender this year y or n realdonaldtrump missuniverse  
20347                                              patricijabelous this man is our everyting realdonaldtrump trumpdoral  
20348                   

In [19]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around.'''
    text = re.sub('[‘’“”…]', ' ', text)
    text = re.sub('\n', ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [21]:
# The updated text
data_clean_second = pd.DataFrame(data_clean_first.text.apply(round2))
data_clean_second.text[40030:40035]
#data_clean_second.head()

40030        i am in south korea now president moon and i have  toasted  our new trade deal a far better one for us than that   
40031        the highly respected farm journal has just announced my approval rating with our great farmers at   and that des   
40032         the leaders of virtually every country that i met at the   congratulated me on our great economy many countries   
40033    rt thebluehousekr realdonaldtrump   hellopolicy mofakr secpompeo 오울렛 초소에서 브리핑을 받고 북측을 바라보며 대화하는 한미 정상의 모습 오울렛 초소는 한국전쟁 
40034       leaving south korea after a wonderful meeting with chairman kim jong un stood on the soil of north korea an impor   
Name: text, dtype: object

In [22]:
# Apply a third round of cleaning
def clean_text_round3(text):
    '''Get rid of the http sites.'''
    text = re.sub('http\\w*', ' ', text)
    
    return text

round3 = lambda x: clean_text_round3(x)

In [23]:
# The updated text
data_clean_third = pd.DataFrame(data_clean_second.text.apply(round3))
data_clean_third.text[34757]

'rt abeshinzo フロリダに到着し、早速トランプ大統領との首脳会談に臨みました。今日は、大半を北朝鮮問題に費やし、非常に重要な点で認識を一致させることができました。 「日本のために最善となるようベストを尽くす」 トランプ大統領は、来る米朝首脳会談で拉致問題を取り上げ '

In [24]:
# Create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import pickle

cv = CountVectorizer(stop_words='english')
data_dtm = pd.DataFrame(cv.fit_transform(data_clean_third.text).toarray(), columns=cv.get_feature_names())

In [25]:
list_for_drop = cv.get_feature_names()[-72:-1]


In [27]:
data_clean_third.text[34757]

'rt abeshinzo フロリダに到着し、早速トランプ大統領との首脳会談に臨みました。今日は、大半を北朝鮮問題に費やし、非常に重要な点で認識を一致させることができました。 「日本のために最善となるようベストを尽くす」 トランプ大統領は、来る米朝首脳会談で拉致問題を取り上げ '

In [28]:
number_for_drop = []
for i in range(len(data_clean_third.text)-1):
    for sign in list_for_drop:
        if re.search(sign, data_clean_third.text[i]):
            print(i, '', sign)
            number_for_drop.append(i)
            


33681  ありがとうございます
33697  そして
33697  アジア歴訪の大成功をお祈りしています
33697  トランプ大統領
33697  トランプ大統領による
33697  ドナルド
33697  初の
33697  日米同盟の揺るぎない絆を世界に示すことができました
33697  本当にありがとう
33697  歴史的な日本訪問は
33697  間違いなく
34757  トランプ大統領
34757  トランプ大統領は
34757  フロリダに到着し
34757  今日は
34757  大半を北朝鮮問題に費やし
34757  日本のために最善となるようベストを尽くす
34757  早速トランプ大統領との首脳会談に臨みました
34757  来る米朝首脳会談で拉致問題を取り上げ
34757  非常に重要な点で認識を一致させることができました
37239  この機会を活かし
37239  という共通の目標に向かって
37239  トランプ大統領
37239  モディ首相と
37239  初めてとなる日米印三か国による首脳会談を行いました
37239  緊密に連携していくことで一致しました
37239  自由で開かれたインド太平洋
37956  رژیم
37956  سال
37956  سرکوب
37956  شده
37956  فساد
37956  فقط
37956  مدتهاست
37956  مردم
37956  موجب
37956  چهلسالشکست
37956  که
38903  そして
38903  そして本日のゴルフと
38903  北朝鮮問題への対応
38903  昨日の首脳会談
38903  更には世界情勢に至るまで
38903  本日
38903  様々な課題についてじっくりと話をすることができました
38903  経済
39505  トランプ大統領
39505  日本へようこそ
39507  トランプ大統領
39507  令和初の国賓としてお迎えしたトランプ大統領と千葉でゴルフです
39507  初の
39507  新しい令和の時代も日米同盟をさらに揺るぎないものとしていきたいと考えています
39515  トランプ大統領
39515  トランプ大統領は
39515  トランプ大統領は安倍首相と共に皇居での歓迎式典

In [21]:
len(number_for_drop)

83