In [1]:
import numpy as np
import pandas as pd
import json
import ndjson
import os
import pdb

In [2]:
# Show the current working directory; 'realdonaldtrump.ndjson' is opened
# with a relative path in the next cell, so it is resolved against this.
os.getcwd()

'C:\\Users\\PCCR\\SB'

In [3]:
# Read the entire newline-delimited JSON file into memory with ndjson.load.
# The path is relative, so the file must sit in the current working directory.
# NOTE(review): presumably `data` is a list with one parsed object per line — confirm against the ndjson docs.
with open('realdonaldtrump.ndjson', encoding='utf8') as f:
    data = ndjson.load(f)


In [4]:
# Place the loaded records into a DataFrame for further exploratory use.
df = pd.DataFrame(data)
# Disabled scratch line: per-tweet word count.
#df['text'].apply(lambda x: len(x.split()))

# Clean the data by:

1. Removing all irrelevant characters, such as any non-alphanumeric characters
2. Tokenizing the text by separating it into individual words
3. Removing words that are not relevant, such as “@” Twitter mentions or URLs
4. Converting all characters to lowercase, in order to treat words such as “hello”, “Hello”, and “HELLO” the same

5. Considering combining misspelled or alternately spelled words into a single representation (e.g. “cool”/“kewl”/“cooool”)
6. Considering lemmatization (reducing words such as “am”, “are”, and “is” to a common form such as “be”)

In [5]:
# List the column labels of the raw tweet frame.
df.columns

Index(['contributors', 'coordinates', 'created_at', 'entities',
       'extended_entities', 'favorite_count', 'favorited', 'geo', 'id',
       'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'place',
       'possibly_sensitive', 'quoted_status', 'quoted_status_id',
       'quoted_status_id_str', 'retrieved_utc', 'retweet_count', 'retweeted',
       'retweeted_status', 'scopes', 'source', 'text', 'truncated', 'user',
       'withheld_copyright', 'withheld_in_countries', 'withheld_scope'],
      dtype='object')

In [6]:
# Build a two-column frame holding only 'retweet_count' and 'text' from df.
df_new = pd.DataFrame(df, columns = ('retweet_count', 'text'))

In [7]:
# Preview the first rows of the two-column frame.
df_new.head()


Unnamed: 0,retweet_count,text
0,501,Be sure to tune in and watch Donald Trump on L...
1,33,Donald Trump will be appearing on The View tom...
2,13,Donald Trump reads Top Ten Financial Tips on L...
3,12,New Blog Post: Celebrity Apprentice Finale and...
4,1422,"""My persona will never be that of a wallflower..."


# NOTE(review): this cell cannot run as written, and it carries no execution
# count or recorded output:
#   - `file` is not defined in any cell visible in this notebook;
#   - `pickle` is only imported in a later cell (alongside CountVectorizer);
#   - iterating a DataFrame yields its column labels, so `r` takes the values
#     'retweet_count' and 'text', not one entry per tweet;
#   - the loop writes into `data` (the records loaded from the ndjson file)
#     while `data_per_tweet` is never filled.
# Left unchanged pending confirmation of the intended behaviour.
data_per_tweet = {}
for i, r in enumerate(df_new):
    data[r] = pickle.load(file)

In [8]:
# Pull the tweet bodies out of the raw frame as a Series.
text_of_tweets = df.text
# Spot-check the tweet stored under index label 2000.
text_of_tweets.loc[2000]

'V.P.....really! http://t.co/psCtMgTM'

In [9]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text(txt):
    '''Apply the first, coarse cleaning pass to a single piece of text.

    Steps, in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (on a single line);
      3. delete every ASCII punctuation character (string.punctuation);
      4. delete every run of word characters that contains a digit;
      5. delete newline characters.

    Characters are deleted rather than replaced by spaces, so words that were
    separated only by punctuation or a newline end up fused together, e.g.
    "V.P.....really!" becomes "vpreally", and a URL collapses into a single
    token starting with "http".

    Args:
        txt: The text to clean (a str).

    Returns:
        The cleaned str.
    '''
    txt = txt.lower()
    # Raw string literals keep the regex escapes (\[, \], \w, \d) from being
    # parsed as invalid string escapes, which newer Python versions warn about.
    txt = re.sub(r'\[.*?\]', '', txt)
    txt = re.sub('[%s]' % re.escape(string.punctuation), '', txt)
    txt = re.sub(r'\w*\d\w*', '', txt)
    txt = txt.replace('\n', '')
    return txt


# Alias kept so that `.apply(round1)` keeps working; no lambda wrapper needed.
round1 = clean_text

In [10]:
# Run the first cleaning pass over every tweet and wrap the result in a frame.
first_pass = text_of_tweets.apply(round1)
data_clean_first = pd.DataFrame(first_pass)
# Spot-check the tweet stored under index label 2000.
data_clean_first.loc[2000, 'text']


'vpreally httptcopsctmgtm'

In [11]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: delete curly quotes, the ellipsis character and newlines.

    Args:
        text: The text to clean (a str).

    Returns:
        The text with every ‘ ’ “ ” … and newline character removed.
    '''
    # A single translation table deletes all unwanted characters in one pass.
    unwanted = str.maketrans('', '', '‘’“”…\n')
    return text.translate(unwanted)


round2 = clean_text_round2

In [12]:
# Run the second cleaning pass over the round-one text and wrap it in a frame.
second_pass = data_clean_first['text'].apply(round2)
data_clean_second = pd.DataFrame(second_pass)
# Spot-check the tweet stored under index label 2000.
data_clean_second.loc[2000, 'text']

'vpreally httptcopsctmgtm'

In [13]:
# Apply a third round of cleaning
def clean_text_round3(text):
    '''Third cleaning pass: delete every run of word characters starting with "http".

    Args:
        text: The text to clean (a str).

    Returns:
        The text with each "http" followed by any word characters removed;
        surrounding whitespace is left in place.
    '''
    http_token = re.compile(r'http\w*')
    return http_token.sub('', text)


round3 = clean_text_round3

In [14]:
# Run the third cleaning pass over the round-two text and wrap it in a frame.
third_pass = data_clean_second['text'].apply(round3)
data_clean_third = pd.DataFrame(third_pass)
# Spot-check the tweet stored under index label 2000.
data_clean_third.loc[2000, 'text']

'vpreally '

In [15]:
# Create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix
import pickle

cv = CountVectorizer(stop_words='english')

# Fit the vocabulary on the fully cleaned tweets and count each term per tweet.
term_counts = cv.fit_transform(data_clean_third.text)

# get_feature_names() was removed from scikit-learn in favour of
# get_feature_names_out(); fall back to the old name on older installs.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per tweet (same index as the cleaned text), one column per term.
# NOTE(review): toarray() materialises the whole matrix densely; consider
# keeping `term_counts` sparse if memory becomes a problem.
data_dtm = pd.DataFrame(
    term_counts.toarray(),
    columns=feature_names,
    index=data_clean_third.index,
)


In [35]:
# Preview only the first rows: the matrix has one column per vocabulary term,
# so there is no need to render the whole frame.
data_dtm.head(10)

Unnamed: 0,aaa,aaafivediamond,aaanews,aaapgs,aaasquibby,aaceallaigh,aacrowellt,aalucero,aamp,aand,...,받고,북측을,브리핑을,오울렛,정상의,초소는,초소에서,한국전쟁,한미,ｒｔ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Print the fitted vectorizer's vocabulary_ attribute (per the scikit-learn
# docs, a mapping of terms to feature indices).
# NOTE(review): this prints the whole mapping at once; the saved notebook
# records no output for this cell.
print(cv.vocabulary_)



In [40]:
# Preview the first rows of the document-term matrix.
# NOTE(review): the recorded output includes a trailing `retweetd_count`
# column whose values match `retweet_count` in df_new.head(), but no visible
# cell adds it to data_dtm. It presumably came from a deleted or re-ordered
# cell, so a fresh Restart & Run All will not reproduce this output.
data_dtm.head()

Unnamed: 0,aaa,aaafivediamond,aaanews,aaapgs,aaasquibby,aaceallaigh,aacrowellt,aalucero,aamp,aand,...,북측을,브리핑을,오울렛,정상의,초소는,초소에서,한국전쟁,한미,ｒｔ,retweetd_count
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,501
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,33
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,13
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1422


In [35]:
# Sanity check: vectorise one ad-hoc sentence with the fitted vocabulary and
# add up the counts of the terms it recognises.
sample_row = cv.transform(['here is a thing blabla trump melania']).toarray()[0]
sample_row.sum()

3