# Clean and Preprocess Twitter Data

In [1]:
import pandas as pd

In [2]:
# All three files are parsed with the same options: the first CSV column
# becomes the index and only '\n' is treated as a row terminator.
csv_options = dict(index_col=0, lineterminator='\n')

pre_h = pd.read_csv('twitter_data/pre_h.csv', **csv_options)
in_h = pd.read_csv('twitter_data/in_h.csv', **csv_options)
post_h = pd.read_csv('twitter_data/post_h.csv', **csv_options)

In [3]:
pre_h.shape

(111225, 8)

In [64]:
pre_h.head()

Unnamed: 0_level_0,created_at,full_text,user_id_str,geo,place,lang,retweet_count,favorite_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1431845676741234688,Sun Aug 29 05:06:26 +0000 2021,"Via @TropicalTidbits, latest RECON mission sho...",1371119451454763010,,,en,2,2
1432169290493476870,Mon Aug 30 02:32:21 +0000 2021,Update from Slidell PD back home. I'm worried ...,1125613827461087239,,,en,2,12
1431913119039860737,Sun Aug 29 09:34:25 +0000 2021,Le «dangereux» ouragan Ida est maintenant de c...,85334007,,,fr,3,5
1432059377415036940,Sun Aug 29 19:15:36 +0000 2021,When you decide to turn on some music and the ...,4867778235,,,en,0,5
1432333373578620932,Mon Aug 30 13:24:22 +0000 2021,"FEMA, Homeland Security, National Guard, Red C...",1321432089225535490,,"{'id': '4ec01c9dbc693497', 'url': 'https://api...",en,0,1


### Clean Data

In [70]:
import re
import contractions

In [71]:
def strip_url(t):
    """Return ``t`` with every ``scheme://host[/path...]`` substring deleted.

    Only the URL text itself is removed; whitespace around it is left as is.
    """
    url_regex = re.compile(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
        flags=re.MULTILINE,
    )
    return url_regex.sub('', t)

In [90]:
def clean_twitter(df):
    """Return the cleaned English-language tweet text from ``df``.

    Rows are kept only where ``df.lang == 'en'``. Their ``full_text`` is
    lower-cased, stripped of URLs (``strip_url``), has contractions expanded
    (``contractions.fix``), has every character that is not a letter replaced
    by a space, and finally has whitespace runs collapsed to a single space.
    The mean text length before and after cleaning is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``lang`` and ``full_text`` columns.

    Returns
    -------
    pandas.Series
        Cleaned text, indexed like the retained rows of ``df``. A single
        leading or trailing space left by the substitutions is not trimmed.
    """
    # select only english text; rows with missing text are dropped because
    # len() and the regex steps below cannot handle NaN
    text = df.loc[df.lang == 'en', 'full_text'].dropna()

    print('Average twitter length before cleaning: ', text.apply(len).mean())

    # lower characters
    text = text.str.lower()
    # strip urls
    text = text.apply(strip_url)
    # revert contraction
    text = text.apply(lambda x: contractions.fix(x))
    # replace every non-letter character (digits and punctuation included)
    # with a space
    text = text.str.replace('[^a-zA-Z]', ' ', regex=True)
    # collapse whitespace runs; raw string because '\s' in a plain literal is
    # an invalid escape sequence
    text = text.str.replace(r'\s+', ' ', regex=True)

    print('Average twitter length after cleaning: ', text.apply(len).mean())

    return text

In [91]:
# clean_twitter applies the lang == 'en' filter itself, so pre_h needs no
# prior filtering.
text_pre = clean_twitter(pre_h)

Average twitter length before cleaning:  159.19382073700584
Average twitter length after cleaning:  136.79279085984015


In [97]:
# clean_twitter applies the lang == 'en' filter itself, so in_h needs no
# prior filtering.
text_in = clean_twitter(in_h)

Average twitter length before cleaning:  174.71709384594848
Average twitter length after cleaning:  149.05096115338407


In [98]:
# clean_twitter applies the lang == 'en' filter itself, so post_h needs no
# prior filtering.
text_post = clean_twitter(post_h)

Average twitter length before cleaning:  197.19462227912933
Average twitter length after cleaning:  165.8894804232091


### Preprocess Data

In [81]:
from nltk.corpus import stopwords

In [85]:
from nltk.stem import WordNetLemmatizer

In [82]:
def remove_stopwords(s):
    """Drop English stopwords from a whitespace-separated string.

    Parameters
    ----------
    s : str
        Text to filter; it is split on whitespace.

    Returns
    -------
    str
        The kept words in their original order, each followed by one space
        (so a non-empty result ends with a trailing space). Returns '' when
        nothing is kept.
    """
    # Fetch the stopword list once and keep it as a frozenset on the function
    # object. Previously the list was re-requested on every call and scanned
    # linearly for every word, which is costly when applied row by row.
    stopword_set = getattr(remove_stopwords, '_stopword_set', None)
    if stopword_set is None:
        stopword_set = frozenset(stopwords.words('english'))
        remove_stopwords._stopword_set = stopword_set
    return ''.join(word + ' ' for word in s.split() if word not in stopword_set)

In [86]:
def lemmatize(s):
    """Lemmatize each whitespace-separated word of ``s`` with ``pos='v'``.

    Returns the lemmas in their original order, each followed by one space
    (so a non-empty result ends with a trailing space); an empty or
    all-whitespace input yields ''.
    """
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token, pos='v') for token in s.split()]
    return ''.join(lemma + ' ' for lemma in lemmas)

In [92]:
def preprocess_twitter(text):
    """Remove stopwords from, then lemmatize, every entry of ``text``.

    Prints the mean string length before and after the two steps and returns
    a new Series with the same index as ``text``.
    """
    def mean_length(series):
        # Average number of characters per entry.
        return series.apply(len).mean()

    print('Average twitter length before cleaning: ', mean_length(text))

    processed = text.apply(remove_stopwords).apply(lemmatize)

    print('Average twitter length after cleaning: ', mean_length(processed))

    return processed

In [93]:
# NOTE: rebinds text_pre to its own preprocessed version, so re-running this
# cell without first re-running the clean_twitter cell preprocesses twice.
text_pre = preprocess_twitter(text_pre)

Average twitter length before cleaning:  136.79279085984015
Average twitter length after cleaning:  99.08281928874108


In [99]:
# NOTE: rebinds text_in to its own preprocessed version, so re-running this
# cell without first re-running the clean_twitter cell preprocesses twice.
text_in = preprocess_twitter(text_in)

Average twitter length before cleaning:  149.05096115338407
Average twitter length after cleaning:  109.32922840742224


In [100]:
# NOTE: rebinds text_post to its own preprocessed version, so re-running this
# cell without first re-running the clean_twitter cell preprocesses twice.
text_post = preprocess_twitter(text_post)

Average twitter length before cleaning:  165.8894804232091
Average twitter length after cleaning:  123.87189163690276


### Save Preprocessed Data

In [101]:
# Written relative to the working directory (not under twitter_data/).
text_pre.to_csv('processed_pre_h.csv')

In [102]:
# Written relative to the working directory (not under twitter_data/).
text_in.to_csv('processed_in_h.csv')

In [103]:
# Written relative to the working directory (not under twitter_data/).
text_post.to_csv('processed_post_h.csv')

### (Archive) Tests

In [65]:
# select only english text
# NOTE: this rebinds pre_h to the filtered subset. By execution count it ran
# (In [65]) before the clean_twitter(pre_h) call (In [91]), so in the recorded
# session that call received the already-filtered frame.
pre_h = pre_h[pre_h.lang == 'en']

In [66]:
text = pre_h.full_text

In [67]:
# lower characters
text = text.str.lower()  

In [68]:
text.apply(lambda x: len(x)).mean()

159.19384219278012

In [69]:
text.iloc[1]

"update from slidell pd back home. i'm worried about my folks. #hurricaneida #slidell https://t.co/v8gopihvei"

In [72]:
text = text.apply(lambda x: strip_url(x))

In [73]:
text.iloc[1]

"update from slidell pd back home. i'm worried about my folks. #hurricaneida #slidell "

In [74]:
# mean length after the urls were stripped (the stripping itself happens in
# the earlier strip_url cell)
text.apply(lambda x: len(x)).mean()

145.23494072842354

In [76]:
# revert contraction
text = text.apply(lambda x: contractions.fix(x))

In [77]:
text.apply(lambda x: len(x)).mean()

145.6231293246795

In [78]:
# replace every non-letter character (digits and punctuation included) with a space
text = text.str.replace('[^a-zA-Z]', ' ', regex=True)

In [79]:
# collapse runs of whitespace into a single space
# (raw string: '\s' in a plain literal is an invalid escape sequence)
text = text.str.replace(r'\s+', ' ', regex=True)

In [80]:
text.apply(lambda x: len(x)).mean()

136.79279085984015

In [84]:
# NOTE: executed as In [84], i.e. after the stopword-removal cell shown below
# (In [83]); the value is the mean length once stopwords have been removed.
text.apply(lambda x: len(x)).mean()

102.88265837043394

In [83]:
text = text.apply(lambda x: remove_stopwords(x))

In [87]:
text = text.apply(lambda x: lemmatize(x))

In [88]:
text.apply(lambda x: len(x)).mean()

99.08281928874108

In [96]:
# Checks that the step-by-step run above reproduces the text_pre produced by
# clean_twitter + preprocess_twitter (element-wise comparison of the Series).
all(text == text_pre)

True