# Preprocess Tweets

In [1]:
import os
import re
import numpy as np
import pandas as pd

In [2]:
tweets_raw = pd.read_csv("tweets_200.csv")
tweets_raw.head(10)

Unnamed: 0,User ID,Tweet
0,44196397,@stephenpallotta @ajtourville @Teslarati Yes. ...
1,44196397,@PPathole @austinbarnard45 @LabPadre Pretty mu...
2,44196397,We have extra FDA-approved ventilators. Will s...
3,44196397,@brandonbernicky @thirdrowtesla Hoping to roll...
4,44196397,@SteveHamel16 @JordanWells33 @hereforthecom19 ...
5,44196397,@thirdrowtesla Homelink is 3rd party hardware ...
6,44196397,@NYCMayor Biggest value Tesla is providing is ...
7,44196397,@PPathole @flcnhvy @Tesla C19 testing in the U...
8,44196397,@28delayslater @thirdrowtesla Invasive ventila...
9,44196397,"@kimitalvitie Yes, v close! Nice work. Those a..."


In [3]:
def convert_string(text):
    # remove emojis
    text = re.sub(r"(\u00a9|\u00ae|[\u2000-\u3300]|\ud83c[\ud000-\udfff]|\ud83d[\ud000-\udfff]|\ud83e[\ud000-\udfff])", "", text)
    # remove non-ascii characters 
    text = re.sub(r"[^\x00-\x7f]", "", text)
    # remove @usernames
    text = re.sub(r"(@|#)([A-Z]|[a-z]|[0-9]|_)+", "", text)
    # remove urls
    text = re.sub(r"(http|https)://([A-Z]|[a-z]|[0-9]|/|\.)+", "", text)
    # remove spaces
    text = text.strip()
    # return
    return text

In [4]:
tweets = tweets_raw.copy()
tweets["Tweet"] = tweets["Tweet"].apply(convert_string)
tweets.head(10)

Unnamed: 0,User ID,Tweet
0,44196397,Yes. PCB design techniques applied to create a...
1,44196397,Pretty much. Good news is that this was a test...
2,44196397,We have extra FDA-approved ventilators. Will s...
3,44196397,Hoping to roll out traffic lights &amp; stops ...
4,44196397,"Yup, China had an oversupply, so we bought 125..."
5,44196397,Homelink is 3rd party hardware that increases ...
6,44196397,Biggest value Tesla is providing is precise de...
7,44196397,C19 testing in the US over the past week has g...
8,44196397,Invasive ventilators are for worst case patien...
9,44196397,"Yes, v close! Nice work. Those are V0.9 legs, ..."


In [5]:
tweets.shape

(313536, 2)

In [6]:
# remove empty columns and one-word columns
tweets.drop(tweets["Tweet"][tweets["Tweet"] == ""].index, inplace=True)
tweets = tweets[tweets["Tweet"].str.contains(" ")] # need at least one space for at least 2 words
tweets.drop_duplicates(inplace=True)
tweets.reset_index(drop=True)
tweets.head(5)

Unnamed: 0,User ID,Tweet
0,44196397,Yes. PCB design techniques applied to create a...
1,44196397,Pretty much. Good news is that this was a test...
2,44196397,We have extra FDA-approved ventilators. Will s...
3,44196397,Hoping to roll out traffic lights &amp; stops ...
4,44196397,"Yup, China had an oversupply, so we bought 125..."


In [7]:
tweets.shape

(300785, 2)

In [8]:
tweets.to_csv("tweets_200_processed.csv", index=False)