In [1]:
import pandas as pd
import numpy as np

import re
from string import punctuation 

import nltk
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet

[nltk_data] Downloading package words to
[nltk_data]     /Users/macbookair/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [21]:
df = pd.read_csv('data/2020-03-14 Coronavirus Tweets.CSV')

In [22]:
# df.reset_index(inplace=True)

In [23]:
df = df[df.lang == 'en']
text = df['text']

In [24]:
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])
        
    def processTweets(self, list_of_tweets):
        processedTweets=[]
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet),tweet.index))
        return processedTweets
    
    def _processTweet(self, tweet):
        emoji_pattern = re.compile(
        "(["
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "])"
        )
        tweet = emoji_pattern.sub(r'',tweet) # remove emojis
        tweet = tweet.lower() # convert text to lower-case
        tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet) # remove URLs
        tweet = re.sub('@[^\s]+', 'AT_USER', tweet) # remove usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet) # remove the # in #hashtag
        tweet = re.sub('amp','',tweet) # remove 'amp'
        tweet = word_tokenize(tweet) # remove repeated characters
        return [word for word in tweet if word not in self._stopwords]


In [25]:
prep = PreProcessTweets()
processed = prep.processTweets(text)
processed[:10]

[(['calling',
   'covid19',
   '“',
   'wuhan',
   'virus',
   '”',
   '“',
   'china',
   'virus',
   '”',
   'inaccurate',
   'xenophobic'],
  <function str.index>),
 (['world',
   'health',
   'organization',
   'declared',
   'europe',
   "'epicenter",
   'covid19',
   'pandemic',
   'recorded',
   'highest',
   'number',
   'cases',
   'deaths',
   'rest',
   'world',
   'combined',
   'aside',
   'china'],
  <function str.index>),
 (['coronavirus',
   'prevention',
   'handbook',
   '101',
   'science-based',
   'tips',
   'could',
   'save',
   'life',
   'wang',
   'zhou',
   'ad',
   'coronavirus',
   'virus',
   'coronavirusoutbreak',
   'covid2019',
   'wuhanvirus',
   'coronavirususa',
   'pandemic',
   'sarscov2',
   'china',
   'wuhan',
   'flu',
   'covid_19',
   'coronaoutbreak',
   'covid19'],
  <function str.index>),
 (['2020',
   'masters',
   'postponed',
   'due',
   'health',
   'concerns',
   'related',
   'spread',
   'coronavirus',
   'organisers',
   'confirme

In [26]:
df['tokens'] = processed
df.head()

Unnamed: 0,status_id,user_id,created_at,screen_name,text,source,reply_to_status_id,reply_to_user_id,reply_to_screen_name,is_quote,...,country_code,place_full_name,place_type,followers_count,friends_count,account_lang,account_created_at,verified,lang,tokens
0,1238615829341560832,1072590021201420294,2020-03-14T00:00:00Z,YalePediatrics,Calling #COVID19 the “Wuhan Virus” or “China V...,TweetDeck,,,,False,...,,,,512,1456,,2018-12-11T20:32:43Z,False,en,"([calling, covid19, “, wuhan, virus, ”, “, chi..."
3,1238615828691668992,15872418,2020-03-14T00:00:00Z,ABSCBNNews,The World Health Organization has declared Eur...,TweetDeck,,,,False,...,,,,6705845,1075,,2008-08-16T10:09:33Z,True,en,"([world, health, organization, declared, europ..."
5,1238615830344142848,846210311631851520,2020-03-14T00:00:00Z,mystylehfb,The Coronavirus Prevention Handbook: 101 Scien...,Buffer,,,,False,...,,,,5778,6334,,2017-03-27T04:00:34Z,False,en,"([coronavirus, prevention, handbook, 101, scie..."
7,1238615829270290434,44316192,2020-03-14T00:00:00Z,FOXSportsAsia,The 2020 Masters has been postponed due to hea...,TweetDeck,,,,False,...,,,,59831,298,,2009-06-03T07:39:43Z,True,en,"([2020, masters, postponed, due, health, conce..."
8,1238615829396123649,171548670,2020-03-14T00:00:00Z,RadioNLNews,.@wctlive in #Kamloops will restrict crowds at...,TweetDeck,,,,False,...,,,,6914,2136,,2010-07-27T16:17:02Z,False,en,"([.AT_USER, kamloops, restrict, crowds, show, ..."


In [27]:
df.drop(columns=['account_lang', 'account_created_at','reply_to_status_id', 
                 'reply_to_user_id', 'reply_to_screen_name','status_id', 'user_id', 'screen_name','source',
                'country_code','lang'], inplace=True)

In [28]:
df.reset_index(drop=True, inplace=True)

In [29]:
df.head(30)

Unnamed: 0,created_at,text,is_quote,is_retweet,favourites_count,retweet_count,place_full_name,place_type,followers_count,friends_count,verified,tokens
0,2020-03-14T00:00:00Z,Calling #COVID19 the “Wuhan Virus” or “China V...,False,False,666,0,,,512,1456,False,"([calling, covid19, “, wuhan, virus, ”, “, chi..."
1,2020-03-14T00:00:00Z,The World Health Organization has declared Eur...,False,False,1068,272,,,6705845,1075,True,"([world, health, organization, declared, europ..."
2,2020-03-14T00:00:00Z,The Coronavirus Prevention Handbook: 101 Scien...,False,False,3692,0,,,5778,6334,False,"([coronavirus, prevention, handbook, 101, scie..."
3,2020-03-14T00:00:00Z,The 2020 Masters has been postponed due to hea...,False,False,785,0,,,59831,298,True,"([2020, masters, postponed, due, health, conce..."
4,2020-03-14T00:00:00Z,.@wctlive in #Kamloops will restrict crowds at...,False,False,491,0,,,6914,2136,False,"([.AT_USER, kamloops, restrict, crowds, show, ..."
5,2020-03-14T00:00:00Z,#ICYMI \n#NSTnation: #Selangor has recorded th...,False,False,814,54,,,691224,412,False,"([icymi, nstnation, selangor, recorded, highes..."
6,2020-03-14T00:00:00Z,Coralville lab says it's creating enough mater...,False,False,599,2,,,20734,4322,False,"([coralville, lab, says, 's, creating, enough,..."
7,2020-03-14T00:00:00Z,PRO TIP: Almond milk has a very long shelf lif...,False,False,79117,0,,,770,1320,False,"([pro, tip, almond, milk, long, shelf, life, o..."
8,2020-03-14T00:00:00Z,🦠Choose precautions over panicking😌\n#coronavi...,False,False,2,0,,,8,51,False,"([choose, precautions, panicking, coronavirus,..."
9,2020-03-14T00:00:00Z,What was he thinking?\nhttps://t.co/FXvUh6xg3F...,False,False,19,0,,,87,529,False,"([thinking, coronavirus, covid19], <built-in m..."


In [30]:
df.to_csv('2020-03-14_working_updated.csv', index=False)