In [1]:
import pandas as pd
import json

from bs4 import BeautifulSoup
import re

In [2]:
# Location of the raw tweet dump, relative to the notebook's working directory.
# NOTE(review): `dir` shadows the Python builtin of the same name; the name is
# kept as-is here in case other cells refer to it.
dir = 'source/raw_trump_tweets_2.json'

# Read the JSON through a context manager so the file handle is closed as soon
# as pandas has finished parsing it (the bare open() call was never closed).
with open(dir, 'r') as raw_file:
    df = pd.read_json(raw_file)

In [3]:
# Preview the first five rows of the loaded tweets frame.
df.head()

Unnamed: 0,created_at,text
0,"datetime.datetime(2018, 9, 26, 10, 57, 30)",Jobless Claims fell to their lowest level in 4...
1,"datetime.datetime(2018, 9, 26, 10, 54, 59)","Consumer confidence hits an 18 year high, clos..."
2,"datetime.datetime(2018, 9, 26, 2, 55, 20)",The Democrats are playing a high level CON GAM...
3,"datetime.datetime(2018, 9, 26, 2, 38, 54)",“These law enforcement people took the law int...
4,"datetime.datetime(2018, 9, 25, 22, 6, 33)",73rd Session of the United Nations General Ass...


In [4]:
# Record the character count of every raw tweet before any cleaning happens,
# then show the first twenty counts.
df['pre_clean_len'] = df.text.map(len)
df['pre_clean_len'].head(20)

0      54
1     140
2     139
3     114
4      81
5     140
6     121
7     137
8     119
9      47
10     39
11     23
12    139
13    139
14    109
15    140
16     22
17     23
18    140
19    140
Name: pre_clean_len, dtype: int64

In [5]:
# Trying the cleaning with an example tweet first.
# The tweet stored under index label 3191 contains a mention, hashtags, a
# newline and emoji, which makes it a handy test case.
df.text[3191]

"@ricardorossello ......#FakeNews critics are working overtime, but we're getting great marks from the people that truly matter! \n#PRStrong🇵🇷"

In [6]:
# Parse the example tweet with BeautifulSoup (lxml parser) and print the plain
# text it extracts.
example1 = BeautifulSoup(df.text[3191], 'lxml')
print(example1.get_text())

@ricardorossello ......#FakeNews critics are working overtime, but we're getting great marks from the people that truly matter! 
#PRStrong🇵🇷


In [21]:
# Replacing links with "https"
# Print the raw tweet, then display the same tweet with every http(s) URL
# collapsed to the literal token "https".
print(df.text[1])
re.sub('https?://[A-Za-z0-9./]+','https',df.text[1])

Consumer confidence hits an 18 year high, close to breaking the all-time record. A big jump from last 8 years. Peop… https://t.co/ftDZQ7LWuu


'Consumer confidence hits an 18 year high, close to breaking the all-time record. A big jump from last 8 years. Peop… https'

In [25]:
# Look at ten more raw tweets (rows 20 through 29).
df.text[20:30]

20    Joint Statement on the United States-Korea Fre...
21    Brett Kavanaugh and his wife, Ashley, will be ...
22    US-Korea Free Trade Agreement Signing Ceremony...
23    It was my great honor to welcome and meet with...
24    Today, we commit to fighting the drug epidemic...
25    “Remarks by President Trump at ‘Global Call to...
26    RT @SCEMD: Now is the time to put safety first...
27    Prime Minster @AbeShinzo is coming up to Trump...
28    Going to New York. Will be with Prime Minister...
29    Tiger is playing great. Looks like a big win c...
Name: text, dtype: object

In [23]:
# Keep one truncated tweet (index label 226) around as a sample for the
# character-replacement experiment.
testing = df.text[226]
testing

'Was just briefed via phone by @DHSgov @SecNielsen and @FEMA @FEMA_Brock, along with @VP Mike Pence and Chief of Sta… https://t.co/zh5cE2rfXA'

In [24]:
# Swap the Unicode replacement character (U+FFFD) for "?". The displayed
# result is identical to the input, i.e. this sample contains no U+FFFD.
testing.replace(u"\ufffd", "?")

'Was just briefed via phone by @DHSgov @SecNielsen and @FEMA @FEMA_Brock, along with @VP Mike Pence and Chief of Sta… https://t.co/zh5cE2rfXA'

In [26]:
# Removing special characters, leaving only letters.
# Every character outside a-z / A-Z (digits, punctuation, emoji, newlines) is
# replaced by a single space, so runs of such characters become runs of spaces.
re.sub("[^a-zA-Z]", " ", df.text[3191])

' ricardorossello        FakeNews critics are working overtime  but we re getting great marks from the people that truly matter    PRStrong  '

In [33]:
# Actual method for cleaning the text
# Shared tokenizer and regular expressions used by tweet_cleaner.
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
# @mentions: handles may contain underscores (e.g. @FEMA_Brock), so "_" has to
# be part of the character class or only "@FEMA" would be matched.
pat1 = r'@[A-Za-z0-9_]+'
# http:// and https:// links.
pat2 = r'https?://[A-Za-z0-9./]+'
# Either of the two patterns above.
combined_pat = r'|'.join((pat1, pat2))


def tweet_cleaner(text):
    """Normalise one raw tweet into a lower-case, space-separated word string.

    Steps, in order:
      1. Run the text through BeautifulSoup (lxml) and keep only its text.
      2. Replace every link matched by ``pat2`` with the placeholder ``https``.
      3. Drop every @mention matched by ``pat1``.
      4. Replace every character that is not an ASCII letter with a space.
      5. Lower-case, tokenize with ``tok`` and re-join with single spaces.

    Parameters
    ----------
    text : str
        The raw tweet text.

    Returns
    -------
    str
        The cleaned tweet, with no leading or trailing whitespace.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()

    # Links keep a placeholder token so their presence stays visible.
    without_links = re.sub(pat2, 'https', souped)

    # Mentions are removed instead of being rewritten to the link placeholder;
    # a space is substituted so neighbouring words are not glued together.
    without_mentions = re.sub(pat1, ' ', without_links)

    # The former `.decode("utf-8-sig")` / U+FFFD replacement step was removed:
    # `str` has no `.decode` on Python 3, the bare `except` hid that failure,
    # and its result was never used. Any U+FFFD is dropped by the letters-only
    # substitution below anyway.
    letters_only = re.sub("[^a-zA-Z]", " ", without_mentions)

    words = tok.tokenize(letters_only.lower())
    return " ".join(words).strip()

In [34]:
# Smoke-test the cleaner on the first ten tweets and display the results.
testing = df.text[:10]
test_result = [tweet_cleaner(t) for t in testing]
test_result

['jobless claims fell to their lowest level in years',
 'consumer confidence hits an year high close to breaking the all time record a big jump from last years peop https',
 'the democrats are playing a high level con game in their vicious effort to destroy a fine person it is called the https',
 'these law enforcement people took the law into their own hands when it came to president trump https',
 'rd session of the united nations general assembly unga https',
 'consumer confidence rose in september notching its highest level in about years the consumer board s index ro https',
 'remarks by president trump at a luncheon hosted by the secretary general of the united nations https',
 'remarks by president trump to the rd session of the united nations general assembly https https',
 'rush limbaugh to republicans you can kiss the midterms goodbye if you don t get highly qualified kavanaugh approved',
 'thank you dr jeffress https']

In [35]:
# Running the tweet_cleaner for the big data
print("Cleaning and parsing the tweets...\n")
clean_tweet_texts = []
# Walk the column's values directly instead of looking each one up by label
# with df['text'][i]; this does not depend on the frame having a default
# 0..n-1 index. `count` is 1-based so the progress message is unchanged.
for count, raw_text in enumerate(df['text'], start=1):
    if count % 1000 == 0:
        print(count, " tweets has been processed")
    clean_tweet_texts.append(tweet_cleaner(raw_text))

Cleaning and parsing the tweets...



  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


1000  tweets has been processed


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


2000  tweets has been processed
3000  tweets has been processed


In [36]:
# Collect the cleaned texts in a new frame and attach the creation time of
# each source tweet (aligned on the row index) as `timestamp`.
clean_df = (
    pd.DataFrame(data=clean_tweet_texts, columns=['text'])
    .assign(timestamp=df.created_at)
)
clean_df.head()

Unnamed: 0,text,timestamp
0,jobless claims fell to their lowest level in y...,"datetime.datetime(2018, 9, 26, 10, 57, 30)"
1,consumer confidence hits an year high close to...,"datetime.datetime(2018, 9, 26, 10, 54, 59)"
2,the democrats are playing a high level con gam...,"datetime.datetime(2018, 9, 26, 2, 55, 20)"
3,these law enforcement people took the law into...,"datetime.datetime(2018, 9, 26, 2, 38, 54)"
4,rd session of the united nations general assem...,"datetime.datetime(2018, 9, 25, 22, 6, 33)"


In [37]:
# Sanity check: number of rows in the cleaned frame.
len(clean_df)

3199