# 2. Cleaning tweets

Daniel Ruiz, MSc in Data Science and Business Analytics (DSBA), Bocconi University

Reference codes (alphabetically):
- https://www.kaggle.com/eliasdabbas/how-to-create-a-regex-to-extract-emoji
- https://www.nltk.org/api/nltk.tokenize.html#module-nltk.tokenize
- https://www.nltk.org/howto/portuguese_en.html
- https://py-googletrans.readthedocs.io/en/latest/#googletrans.models.Detected
- https://stackoverflow.com/questions/49498801/python-googletrans-library-no-json-object-could-be-decoded
- https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed

## 2.1. Loading packages and hard data

In [1]:
# general
import pandas as pd
import numpy as np
import re
import string
import time

# language identification
from langdetect import detect

# stemmer
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize

## 2.2. Defining funtions

### Tweet Basic Information
- Number of characters: Counting tweet characters is not as immediate as one could think. Tweepy returns a string that has many special characters, including the textual representation of line breaks, emojis, etc. Counting these characters would create an upward bias to the number of characters. To count the 'true' number of characters, it is neccessary to convert everything to 'traditional' characters.

In [2]:
def extract_hashtags(tweet,not_this=''):
    # input = tweet = string
    # output = list of strings
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', r' ', tweet)
    return [i  for i in tweet.split() if i.startswith("#")]

def count_characters(tweet):
    
    # our number is biased downwards
    
    # images and URLs have the same structure (a link)
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', r' ', tweet)
    
    # RT
    tweet = re.sub('^rt @[^\s]+', r' ', tweet)
    
    # usernames -> disappear
    tweet = re.sub('@[^\s]+', r' ', tweet)

    # Emoji patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    
    tweet = re.sub(emoji_pattern,r' ',tweet)
    
    # Retain only alpha-numeric characters, underline and dash (e.g. punctuation)
    tweet = re.sub(r'[^A-Za-z0-9]+', ' ', tweet)
    
    return len(tweet)

def count_tokens(tweet):
    return len(tweet.split())

def is_retweet(tweet):
    if re.match('^rt @[^\s]+',tweet)!=None:
        return 1
    return 0

def extract_bigrams(words):
    
    # input = list of strings
    
    extracted = []
    
    if len(words)<2:
        return extracted
    else:
        for i in range(1,len(words)):
            extracted.append(words[i-1]+'_'+words[i])

    return extracted

def remove_pt_accents(tweet):

    tweet = re.sub('(À)|(à)|(Á)|(á)|(Â)|(â)|(Ã)|(ã)', 'a', tweet)
    tweet = re.sub('(É)|(é)|(Ê)|(ê)', 'e', tweet)
    tweet = re.sub('(Í)|(í)', 'i', tweet)
    tweet = re.sub('(Ó)|(ó)|(Ô)|(ô)|(Õ)|(õ)', 'o', tweet)
    tweet = re.sub('(Ú)|(ú)|(Ü)|(ü)', 'u', tweet)
    tweet = re.sub('(Ç)|(ç)', 'c', tweet)
    tweet = re.sub('(Ñ)|(ñ)', 'n', tweet)
    
    return tweet

## 2.3. Processing the raw database

1. PreCleaner
        First, we open the raw files and remove duplicates that might have been generated during the scrapping process. We know whether a tweet has been scrapped two times because each tweet has an unique ID.

2. TweetCleaner
        Second, we clean the data and apply the functions created in the previous section.

3. TweetStemmer
        Third, we reduce words to stems using the snowball stemmer. NLTK contains lemmatization in English, but not in Portuguese. Thus, we've preferred to run only the stemming (and not lemmatization), as it is available on both languages (and the other is not).

4. TweetLanguage
        Fourth, we detect the language of the tweet using a preliminary output of the Cleaner. THis function comes last because it takes more time.

5. TweetFilter
        Fifth, we filter tweets that are presumably non-informative. The filters parameters are hard-codes extracted from TweetCollectiveAnalysis. Particularly, we filter: (i) users that look fake (less than 50 followers) and (ii) tweets with 0 stems.

In [3]:
def TweetPreCleaner(company):
    
    # names columns
    # remove duplicates
    # all to lower-case
    # remove usernames, urls
    # saves in Clean_01
    
    # timing (start)
    start_time = time.time()

    # open and read file
    filename='Dataset_Twitter/'+company[0]+'.csv'
    df = pd.read_csv(filename,header=None)
    print(company[0])
    
    if len(df.columns)==5:
        df.columns = ['text','id','datetime','user_name','user_followers']
    else:
        df.columns = ['text','id','datetime','user_name','user_followers','sentiment_pos','query_used']

    # remove duplicates
    print(df.shape)
    df = df.drop_duplicates('id')
    print(df.shape)

    # this will be useful later on
    df['text']=df.apply(lambda x: x.text.lower(), axis=1)
    df['user_name']=df.apply(lambda x: x.user_name.lower(), axis=1)
    
    # save in new folder
    df.to_pickle('Dataset_Twitter_Clean_01/'+company[0]+'.pkl')
    
    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')

In [4]:
x = "boeing is building a brand new 747 air force one for future presidents, but costs are out of control, more than $4 billion. cancel order!"

In [12]:
def TweetCleaner(company):
    
    # emoticons
    # Happy emoticons -> , removed because of confusion
    emoticons_happy = set([
        ':-)', ':)', ';)', ':o)', ':]',
        ':3', ':c)', ':>', ':}','8)',
        '=)', '=]', ':^)', ':-d', ':d',
        '8-d', '8d',':b','d:',
        '=-d', '=d','=-3', '=3',
        ':-))', ":'-)", ":')", ':*', ':^*',
        '>:p', ':-p', ':p', ':-p', ':p',
        '=p', ':-b', ':b', '>:)', '>;)',
        '>:-)','<3','xp','xd','x-p','x-d'
        ])

    # Sad emoticons
    emoticons_sad = set([
        ':L', ':-/', '>:/', ':S', '>:[',
        ':@', ':-(', ':[', ':-||', '=l',
        ':<', ':-[', ':-<', '=\\', '=/',
        '>:(', ':(', '>.<', ":'-(", ":'(",
        ':\\', ':-c', ':c', ':{', '>:\\',
        ';('
        ])

    # Emoji patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    tweets = [company]
    
    for tweet in tweets:
        # urls, usernames -> disappear
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)
        tweet = re.sub('@[^\s]+', ' ', tweet)
        
        # symbols  -> ' '
        tweet = re.sub(emoji_pattern, ' ',tweet) # emojis
        tweet = re.sub('#([^\s]+)', '\\1', tweet) # hash symbol
        tweet = re.sub('via twitter', ' ', tweet) # 'via Twitter'
        tweet = re.sub('\\brt\\b', ' ', tweet) # RT
        tweet = re.sub('\\b[0-9]+\\b\\s*', ' ', tweet) # number-only words
        tweet = re.sub('‚Ä¶', ' ', tweet) # mentions
    
        # portuguese characters -> remove special signs
        tweet = remove_pt_accents(tweet)

        # Retain only alpha-numeric characters (e.g. punctuation)
        tweet = re.sub(r"'s ", ' ', tweet)
        tweet = re.sub(r"'re ", ' ', tweet)
        tweet = re.sub(r"'ve ", ' ', tweet)
        tweet = re.sub(r"'", '', tweet)
        tweet = re.sub(r'[^A-Za-z0-9]+', ' ', tweet)

        # tokenize (i.e. repeated characters)
        tweet = word_tokenize(tweet)

        # stopwords
        comp_lang = 'us'
        if comp_lang == 'br':
            stop_xx = stopwords.words('portuguese') + ['pra','voc','vc']
            stop_xx.remove('não')
            stop_xx.remove('nem')
            stop_xx = set([remove_pt_accents(word) for word in stop_xx])
            negations=set(['nao','nem','n'])

        else:
            stop_xx = stopwords.words('english')[:143] + []
            stop_xx.remove("don")
            stop_xx.remove("don't")
            stop_xx.remove("nor")
            stop_xx.remove("no")
            stop_xx.remove("not")
            stop_xx = [re.sub(r"'", '', word) for word in stop_xx]
            stop_XX = set(stop_xx)
            negations = set(stopwords.words('english')[143:]+["don","dont","not","nor","no"])
            
        tweet = [word for word in tweet if (len(word)>1 or word=='n')]
        tweet = [word for word in tweet if (word not in stop_xx)]
        
        print(tweet)
    
TweetCleaner(x)

['boeing', 'building', 'brand', 'new', 'air', 'force', 'one', 'future', 'presidents', 'costs', 'control', 'billion', 'cancel', 'order']


In [4]:
def TweetCleaner(company):
    
    # extracts info and pre-process text
    
    # timing (start)
    start_time = time.time()

    # open and read file
    filename='Dataset_Twitter_Clean_01/'+company[0]+'.pkl'
    df = pd.read_pickle(filename)
    print(company[0])

    # local variables
    comp_lang = company[0][:2]
    comp_name = company[1].lower()[1:]
    comp_hash = company[2].lower()

    # extract info
    df['hashtags']=df.apply(lambda x: extract_hashtags(x.text, comp_hash), axis=1)
    df['is_retweet']=df.apply(lambda x: is_retweet(x.text), axis=1)
    
    # emoticons
    # Happy emoticons -> , removed because of confusion
    emoticons_happy = set([
        ':-)', ':)', ';)', ':o)', ':]',
        ':3', ':c)', ':>', ':}','8)',
        '=)', '=]', ':^)', ':-d', ':d',
        '8-d', '8d',':b','d:',
        '=-d', '=d','=-3', '=3',
        ':-))', ":'-)", ":')", ':*', ':^*',
        '>:p', ':-p', ':p', ':-p', ':p',
        '=p', ':-b', ':b', '>:)', '>;)',
        '>:-)','<3','xp','xd','x-p','x-d'
        ])

    # Sad emoticons
    emoticons_sad = set([
        ':L', ':-/', '>:/', ':S', '>:[',
        ':@', ':-(', ':[', ':-||', '=l',
        ':<', ':-[', ':-<', '=\\', '=/',
        '>:(', ':(', '>.<', ":'-(", ":'(",
        ':\\', ':-c', ':c', ':{', '>:\\',
        ';('
        ])

    # Emoji patterns
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    # new variables
    has_happy = []
    has_sad = []
    text_lang_detect=[]
    text_clean = []
    
    # loop (data struture = list for speed)
    user_i=0
    user_name = df['user_name'].tolist()
    tweets = df['text'].tolist()
    
    for tweet in tweets:
        # urls, usernames -> disappear
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)
        tweet = re.sub('@[^\s]+', ' ', tweet)
        
        # emoticons -> remove (but record in variable)

        tweet_sup = []
        happy = 0
        sad = 0
        
        tweet = tweet.split()
        for token in tweet:
            if token in emoticons_happy:
                tweet_sup.append('')
                happy+=1
            elif token in emoticons_sad:
                tweet_sup.append('')
                sad+=1
            else:
                tweet_sup.append(token)
        
        tweet = " ".join(tweet_sup)
        has_happy.append(happy)
        has_sad.append(sad)
        
        # symbols  -> ' '
        tweet = re.sub(emoji_pattern, ' ',tweet) # emojis
        tweet = re.sub('#([^\s]+)', '\\1', tweet) # hash symbol
        tweet = re.sub('via twitter', ' ', tweet) # 'via Twitter'
        tweet = re.sub('\\brt\\b', ' ', tweet) # RT
        tweet = re.sub('\\b[0-9]+\\b\\s*', ' ', tweet) # number-only words
        tweet = re.sub('‚Ä¶', ' ', tweet) # mentions
        tweet = re.sub(user_name[user_i], ' ', tweet) # poster's username
        tweet = re.sub(comp_name, ' ', tweet) # company's name
        
        # create intermediary output for language detection
        text_lang_detect.append(tweet)
    
        # portuguese characters -> remove special signs
        tweet = remove_pt_accents(tweet)

        # Retain only alpha-numeric characters (e.g. punctuation)
        tweet = re.sub(r"'s ", ' ', tweet)
        tweet = re.sub(r"'re ", ' ', tweet)
        tweet = re.sub(r"'ve ", ' ', tweet)
        tweet = re.sub(r"'", '', tweet)
        tweet = re.sub(r'[^A-Za-z0-9]+', ' ', tweet)

        # tokenize (i.e. repeated characters)
        tweet = word_tokenize(tweet)

        # stopwords
        if comp_lang == 'br':
            stop_xx = stopwords.words('portuguese') + ['pra','voc','vc']
            stop_xx.remove('não')
            stop_xx.remove('nem')
            stop_xx = set([remove_pt_accents(word) for word in stop_xx])
            negations=set(['nao','nem','n'])

        else:
            stop_xx = stopwords.words('english')[:143] + []
            stop_xx.remove("don")
            stop_xx.remove("don't")
            stop_xx.remove("nor")
            stop_xx.remove("no")
            stop_xx.remove("not")
            stop_xx = [re.sub(r"'", '', word) for word in stop_xx]
            stop_XX = set(stop_xx)
            negations = set(stopwords.words('english')[143:]+["don","dont","not","nor","no"])
            
        tweet = [word for word in tweet if (len(word)>1 or word=='n')]
        tweet = [word for word in tweet if (word not in stop_xx)]
        
        # negations handling
        if (len(tweet)>=1) and (tweet[-1] in negations):
            tweet[-1]=='neg_'
            
        if len(tweet)>=2:
            for i in range(len(tweet)-2,-1,-1):
                if tweet[i] in negations:
                    tweet.pop(i)
                    tweet[i] ='neg_'+tweet[i]
                    if tweet[i][:8] == 'neg_neg_':
                        tweet[i] = tweet[i][8:]
                        
        tweet = [word for word in tweet if word!='']


        # clean list
        text_clean.append(tweet)
        
        user_i+=1
    
    # add to dataframe
    df['has_happy']=has_happy
    df['has_sad']=has_sad
    df['text_lang_detect']=text_lang_detect
    df['text_clean']=text_clean
    
    # save in new folder
    df.to_pickle('Dataset_Twitter_Clean_01/'+company[0]+'.pkl')
    
    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')

In [13]:
def TweetStemmer(company):
    
    # generate stems, create bigtams
    
    # timing (start)
    start_time = time.time()
    
    # open and read file
    filename='Dataset_Twitter_Clean_01/'+company[0]+'.pkl'
    df = pd.read_pickle(filename)
    print(company[0])

    # stemmer language
    
    if company[0][:2] == 'br':
        stemmer=SnowballStemmer('portuguese')
    else:
        stemmer=SnowballStemmer('english')
    
    tweets = df.text_clean.tolist()
    for i in range(len(tweets)):
        tweets[i]=[stemmer.stem(word) for word in tweets[i]]
    df['snowball_stems']=tweets
    
    # extract information
    
    # text
    df['count_char']=df.apply(lambda x: count_characters(x.text), axis=1)
    df['count_tokens']=df.apply(lambda x: count_characters(x.text), axis=1)
    # text_clean
    df['count_words']=df.apply(lambda x: len(x.text_clean), axis=1)
    df['count_stems']=df.apply(lambda x: len(x.snowball_stems), axis=1)
    
    # add bigrams
    bag1 = []
    bag2 = []
    
    tweets = df.text_clean.tolist()
    stems = df.snowball_stems.tolist()
    
    for i in range(len(tweets)):
        bag1.append(list(extract_bigrams(tweets[i])))
        bag2.append(list(extract_bigrams(stems[i])))
    
    df['text_clean_bigrams']=bag1
    df['stems_bigrams']=bag2
    df['words_unigrams_bigrams']=df['text_clean']+df['text_clean_bigrams']
    df['stems_unigrams_bigrams']=df['snowball_stems']+df['stems_bigrams']
    
    # save in new folder
    df.to_pickle('Dataset_Twitter_Clean_01/'+company[0]+'.pkl')
    
    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')

In [17]:
stemmer=SnowballStemmer('english')

In [18]:
y = ['building', 'brand', 'new', 'air', 'force', 'one', 'future', 'presidents', 'costs', 'control', 'billion', 'cancel', 'order']

In [19]:
[stemmer.stem(word) for word in y]

['build',
 'brand',
 'new',
 'air',
 'forc',
 'one',
 'futur',
 'presid',
 'cost',
 'control',
 'billion',
 'cancel',
 'order']

In [6]:
def TweetLanguage(company):
    
    # detects the language
    
    # timing (start)
    start_time = time.time()

    # open and read file
    filename='Dataset_Twitter_Clean_01/'+company[0]+'.pkl'
    df = pd.read_pickle(filename)
    print(company[0])

    # language
    tweet_lang = []
    tweet_en = []
    tweet_pt = []
    
    tweets = df.text_lang_detect.tolist()

    for tweet in tweets:
    
        try:
            lang = detect(tweet)
            if lang=='pt':
                pt=1
                en=0
            elif lang=='en':
                pt=0
                en=1
            else:
                pt=0
                en=0
        except:
            lang = 'xx'
            pt = 0
            en = 0
            
        tweet_pt.append(pt)
        tweet_en.append(en)
        tweet_lang.append(lang)
    
    df['tweet_lang']=tweet_lang
    df['tweet_en']=tweet_en
    df['tweet_pt']=tweet_pt
        
    # save in new folder
    #df.to_csv('Dataset_Twitter_Clean_02\\'+company[0]+'.csv',index=None)
    df.to_pickle('Dataset_Twitter_Clean_02/'+company[0]+'.pkl')
        
    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')
    
    
# alternatives packages -> overall, unavailable for large datasets
# alternative 1
#import spacy
#from spacy_langdetect import LanguageDetector

#def language_spacy(tweet):
#    nlp = spacy.load("en")
#    nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
#    doc = nlp(tweet)
#    return doc._.language['language']

#tweet = "hello my dear friend"
#print(language_spacy(tweet))

# alternative 2
#from textblob import TextBlob

#tweet = "hello dear friend"
#print(TextBlob(tweet).detect_language())

# alternative 3
#from googletrans import Translator

#translator=Translator()

#tweet = "hello my dear friend"
#detected = translator.detect(tweet)
#langu = detected.lang
#proba = detected.confidence
#print(langu,proba)

In [7]:
def TweetFilter(company):
    
    # timing (start)
    start_time = time.time()

    # open and read file
    filename='Dataset_Twitter_Clean_02/'+company[0]+'.pkl'
    df = pd.read_pickle(filename)
    print(company[0])
    
    # predominant language
    print('Filter: language == {}'.format(company[0][:2]))
    print(df.shape)
    
    if company[0][:2]=='br':
        df=df[df.tweet_pt==1]
    else:
        df=df[df.tweet_en==1]
        
    print(df.shape)
    
    # non informative tweets
    print('Filter: at least one stem')
    print(df.shape)
    df = df[df.count_stems>0]
    print(df.shape)
    
    # users that look like spam
    print('Filter: followers >= 40 (not spam-like)')
    print(df.shape)
    df = df[df.user_followers>=40]
    print(df.shape)    
    
    # save in new folder
    df.to_pickle('Dataset_Twitter_Clean_03/'+company[0]+'.pkl')
    
    # timing (end)
    t_sec = round(time.time() - start_time)
    (t_min, t_sec) = divmod(t_sec,60)
    (t_hour,t_min) = divmod(t_min,60) 
    print('Time passed: {}hour:{}min:{}sec'.format(t_hour,t_min,t_sec))
    print('------------------')

## Processing the data

In [9]:
my_companies = [['br_embraer','@EMBRAER','#EMBRAER'],
                ['br_americanas','@LOJASAMERICANAS','#LOJASAMERICANAS'],
                ['br_pontofrio','@PONTOFRIO','#PONTOFRIO'],
                ['br_petrobras','@PETROBRAS','#PETROBRAS'],
                ['br_bradesco','@BRADESCO','#BRADESCO'],
                ['br_itau','@ITAU','#ITAU'],
                ['br_renner','@LOJAS_RENNER','#RENNER'],
                ['br_gol','@VOEGOLOFICIAL','#VOEGOL'],
                ['br_magazineluiza','@MAGAZINELUIZA','#MAGALU'],
                ['br_valor','@VALORECONOMICO','#VALORECONOMICO'],
                ['us_abercrombie','@ABERCROMBIE','#ABERCROMBIE'],
                ['us_boeing','@BOEING','#BOEING'],
                ['us_beyondmeat','@BEYONDMEAT','#BEYONDMEAT'],
                ['us_morganstanley','@MORGANSTANLEY','#MORGANSTANLEY'],
                ['us_jpmorgan','@JPMORGAN','#JPMORGAN'],
                ['us_exxonmobil','@EXXONMOBIL','#EXXON'],
                ['us_americanair','@AMERICANAIR','#AMERICANAIRLINES'],
                ['us_cocacola','@COCACOLA','#COCACOLA'],
                ['us_tesla','@TESLA','#TESLA'],
                ['us_wsj','@WSJ','#WSJ']]

my_companies.append(['br_PortugueseTweets','@PORTUGUESE_TWEETS','#PORTUGUESE_TWEETS'])
my_companies.append(['us_EnglishTweets','@ENGLISH_TWEETS','#ENGLISH_TWEETS'])

################################################
print('Tweet Pre-Cleaner')
for company in my_companies:
    TweetPreCleaner(company)

# -> save in clean_01

print('Tweet Cleaner')
for company in my_companies:
    TweetCleaner(company)

# -> save in clean_01

print('Tweet Stemmer')
for company in my_companies:
    TweetStemmer(company)
    
# -> save in clean_01

print('Tweet Language')
for company in my_companies:
    TweetLanguage(company)

# -> save in clean_02

##### ANALYSIS OF 2.2 IS RUN HERE #####

print('Tweet Filter')
for company in my_companies:
    TweetFilter(company)

# -> save in clean_03

##### GRAPHS OF 2.3 ARE GENERATED HERE #####

Tweet Pre-Cleaner
us_wsj
(1962643, 5)
(1962643, 5)


AttributeError: ("'float' object has no attribute 'lower'", 'occurred at index 731864')