In [1]:
import pandas as pd
import re
from unidecode import unidecode
import datetime
from langdetect import detect

pd.set_option('max_colwidth', None)

In [2]:
df = pd.read_csv('dataset\csv\coronavirus_submissions.csv')
df.columns

Index(['id', 'title', 'author', 'created_utc', 'domain', 'full_link',
       'num_comments', 'score', 'total_awards_received', 'is_self',
       'subreddit_subscribers'],
      dtype='object')

### Remove daily/automatic posts
-----

In [4]:
# Remove daily subreddit threads
df2 = df.drop(index = df[df['title'].str.contains(r'Daily Discussion|Daily General|Science Sticky')].index)
df2.shape

(356322, 11)

### Make a dataframe copy with subset of necessary columns
-----

In [5]:
df3 = df2[['id', 'score', 'num_comments', 'created_utc', 'title']]

In [6]:
df3.head()

Unnamed: 0,id,score,num_comments,created_utc,title
0,erj61t,1,94,1579552351,When should we start to get worried about the new outbreak?
1,es0m55,1,32,1579638986,"When the first reported US case of the new Chinese Coronavirus is in your state, and really close to where you live"
2,es1pn4,1,23,1579643380,Yikes
3,es2ooq,1,1,1579647443,"Join r/China_Flu, dedicated to monitoring current China outbreak 2019-nCoV"
4,es7jnl,1,1,1579671329,#plague2020


In [9]:
# Convert:
# '&amp': &, '&lt;': <, '&gt;': >
df3['title'].replace(regex={r'&amp;': '&', r'&lt;': '<', r'&gt;': '>'}, inplace=True)

In [2]:
'''
Run cell for strip_tags function (to strip HTML tags)
'''

from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.text = StringIO()
    def handle_data(self, d):
        self.text.write(d)
    def get_data(self):
        return self.text.getvalue()

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

In [11]:
# Remove HTML tags
df3['title'] = df3['title'].map(strip_tags)
# Present unicode data in ASCII
df3['title'] = df3['title'].map(unidecode)
# Remove URLs
df3['title'].replace(regex=r'http\S+|www\S+', value='', inplace=True)
# Remove parenthesis (), brackets [], {} and the characters within
df3['title'].replace(regex=r'[([{].*[)\]}]', value='', inplace=True)
# Replace curse words
df3['title'].replace(regex={r'(s|S)[h$@#*%!]{2}(t|T)': 'shit', r'(f|F)[$@#*%!cC]{2}[$@#*%!kK]': 'fuck', r'(b|B)ulls?[^\d\s]{3,}': 'bullshit'}, inplace=True)
# Standardise 'US' and 'UK'
df3['title'].replace(regex={r'\b(USA|U.S.A|u.s|usa)\b': 'US', r'\b(uk|U.K|u.k)\b': 'UK'}, inplace=True)
# Remove email addresses
df3['title'].replace(regex={r'\S+@\w+(\.\w+)+': ''}, inplace=True)
# Remove @handles e.g. @ABC. Preserves tags like Folding@Home
df3['title'].replace(regex={r'\B@': ''}, inplace=True)
# Remove all single and double quotes, except contractions
df3['title'].replace(regex={r'[\'\"]\B|\B[\'\"]|(\d+[\'\"])+(\d*)': ''}, inplace=True)
# Replace the rest of special characters and digits with space
df3['title'].replace(regex={r'[^A-Za-z !?\']': ' '}, inplace=True)
# Remove excess whitespace
df3['title'] = df3['title'].str.strip()
df3['title'].replace(regex={r'\s{2,}': ' '}, inplace=True)

df3.head()

Unnamed: 0,id,score,num_comments,created_utc,title
0,erj61t,1,94,1579552351,When should we start to get worried about the new outbreak?
1,es0m55,1,32,1579638986,When the first reported US case of the new Chinese Coronavirus is in your state and really close to where you live
2,es1pn4,1,23,1579643380,Yikes
3,es2ooq,1,1,1579647443,Join r China Flu dedicated to monitoring current China outbreak nCoV
4,es7jnl,1,1,1579671329,plague


In [12]:
# Remove rows with empty strings
df4 = df3[df3['title'].astype(bool)]
df4.shape

(355458, 5)

## Remove submissions that are not in English
-----

In [None]:
'''
- Detect and categorise language
- Time-consuming process
- Exception will be called if 'text' only contains special characters like '!','?' which are retained for VADER
'''
def label_language(text):
    try:
        return detect(text)
    except:
        return 0

In [None]:
df4['language'] = df4['title'].map(label_language)
df4['language'].unique()

In [None]:
# retain submissions in english
df4 = df4[df4['language'] == 'en']
df4.shape

-----

In [14]:
def timestamp_to_yearmonth(ts):
    return datetime.datetime.fromtimestamp(ts).strftime('%Y-%m')
def timestamp_to_week(ts):
    return datetime.datetime.fromtimestamp(ts).strftime('%Y-Wk%W')

In [None]:
df4['month'] = df4['created_utc'].map(timestamp_to_yearmonth)
df4['week'] = df4['created_utc'].map(timestamp_to_week)
df4.head()

In [17]:
df4.to_pickle('submissions_processed_preSA.pkl')