# ETL

In [1]:
import pandas as pd
pd.options.display.max_rows = 7000
pd.options.display.max_colwidth = 250
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Test Dataset

In [2]:
df = pd.read_excel("../data/COVID19_Twitter_Dataset.xlsx")

In [3]:
df.head()

Unnamed: 0,Is_Unreliable,Category,Tweet
0,1,"1, 3, 6, 9",We are living in scary times in Canada. Gov’t refuses to protect Canadians from coronavirus
1,1,"1, 6, 8, 9","Just as bad in Canada. In fact, our government is now accusing us of racism for blaming the Chinese for coronavirus"
2,1,"1, 4, 9","It was only a matter of time before the mainstream media decided to blame the coronavirus on climate change, I suppose."
3,1,"6, 8","Russia's taking no chances: Foreigners infected with the new Chinese coronavirus will be quarantined, isolated &amp; deported, PM"
4,1,"6, 8, 9","Although there is now a presumptive confirmed case of Wuhan novel coronavirus, I want you to know that Ontario is prepared"


In [4]:
df = df[['Tweet', 'Is_Unreliable']]
df.columns = [col.lower() for col in df.columns.to_list()]

In [5]:
df.columns = ['text', 'label']

In [6]:
df.label = df.label.map({0:'true', 1:'unreliable'})

In [7]:
# lower lettering and trimming spaces for all values
text_cols = df.columns.to_list()
text_cols.remove('label')
for col in text_cols:
    df[col] = df[col].str.strip().str.lower()

In [8]:
df.head()

Unnamed: 0,text,label
0,we are living in scary times in canada. gov’t refuses to protect canadians from coronavirus,unreliable
1,"just as bad in canada. in fact, our government is now accusing us of racism for blaming the chinese for coronavirus",unreliable
2,"it was only a matter of time before the mainstream media decided to blame the coronavirus on climate change, i suppose.",unreliable
3,"russia's taking no chances: foreigners infected with the new chinese coronavirus will be quarantined, isolated &amp; deported, pm",unreliable
4,"although there is now a presumptive confirmed case of wuhan novel coronavirus, i want you to know that ontario is prepared",unreliable


In [9]:
df[df.duplicated(subset=['text'], keep=False)].sort_values('text').head(30)

Unnamed: 0,text,label
394,"according to health officials, this is what you will need for a #coronavirus home quarantine for 14 days #covid-19",True
397,"according to health officials, this is what you will need for a #coronavirus home quarantine for 14 days #covid-19",True
395,coronavirus: how does it spread and what are the symptoms? here’s everything we know about covid-19,True
557,coronavirus: how does it spread and what are the symptoms? here’s everything we know about covid-19,True
307,don't fall victim to misinformation and even worst fake news. get the facts about the #coronavirus from @topublichealth.,True
489,don't fall victim to misinformation and even worst fake news. get the facts about the #coronavirus from @topublichealth.,True


In [10]:
print('before dropping duplicates', len(df))
df = df.drop_duplicates(keep='first')
print('after', len(df))

before dropping duplicates 560
after 557


## Cleaning Text

In [11]:
# removing links
reg_url = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
df['cleaned'] = df.text.str.replace(reg_url, ' ', regex=True)

In [12]:
reg_mention = r'@\w+'
df.cleaned = df.cleaned.str.replace(reg_mention, ' ', regex=True)

In [13]:
reg_rt = r'\s+RT\s+'
df.cleaned = df.cleaned.str.replace(reg_rt, ' ', regex=True)

In [14]:
# change 5g so it doesn't get removed with alphabetic exception
reg_5g = r'5g'
df.cleaned = df.cleaned.str.replace(reg_5g, 'fiveg', regex=True)

In [15]:
# remove all non alphabetic characters
reg_non_alpha = r'[^a-z]'
df.cleaned = df.cleaned.str.replace(reg_non_alpha, ' ', regex=True)


In [16]:
# replace back fiveg to 5g
reg_fiveg = r'fiveg'
df.cleaned = df.cleaned.str.replace(reg_fiveg, '5g', regex=True)

In [17]:
reg_spaces = r'\\s{2,}'
# applying regex to remove links, non-alphas, and 2 or more spaces
df.cleaned = df.cleaned.str.replace(reg_spaces, '', regex=True)

In [18]:
# trimming leading and trailing spaces again since regex replacement creates some unwanted spaces
df.cleaned = df.cleaned.str.strip()

In [19]:
# removing stop words
df.cleaned = df.cleaned.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

In [20]:
# lemmatization only
lemmatizer = WordNetLemmatizer()

# preview
lemmatizer.lemmatize(df.cleaned[0])

'living scary times canada gov refuses protect canadians coronavirus'

In [21]:
df.cleaned = df.cleaned.apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

In [22]:
# replacing back the word u to us since lemmatization makes a mistake on US
df.cleaned = df.cleaned.str.replace(r'\su\s', ' us ', regex=True)

In [23]:
df.head(50)

Unnamed: 0,text,label,cleaned
0,we are living in scary times in canada. gov’t refuses to protect canadians from coronavirus,unreliable,living scary time canada gov refuse protect canadian coronavirus
1,"just as bad in canada. in fact, our government is now accusing us of racism for blaming the chinese for coronavirus",unreliable,bad canada fact government accusing us racism blaming chinese coronavirus
2,"it was only a matter of time before the mainstream media decided to blame the coronavirus on climate change, i suppose.",unreliable,matter time mainstream medium decided blame coronavirus climate change suppose
3,"russia's taking no chances: foreigners infected with the new chinese coronavirus will be quarantined, isolated &amp; deported, pm",unreliable,russia taking chance foreigner infected new chinese coronavirus quarantined isolated amp deported pm
4,"although there is now a presumptive confirmed case of wuhan novel coronavirus, i want you to know that ontario is prepared",unreliable,although presumptive confirmed case wuhan novel coronavirus want know ontario prepared
5,"hooray, finally there is a propaganda banner telling people they can make babies if getting bored staying home #coronavirus",unreliable,hooray finally propaganda banner telling people make baby getting bored staying home coronavirus
6,#russia is pushing propaganda claiming the #coronavirus is a us bio weapon targeting #china because of #trump’s trade,unreliable,russia pushing propaganda claiming coronavirus us bio weapon targeting china trump trade
7,who says it's not safe to travel to china? | @nytimes https://t.co/xncdnqumwi #travelmarketing #coronavirus #china #travelwarning #travel #thursdaythoughts #toronto,unreliable,say safe travel china travelmarketing coronavirus china travelwarning travel thursdaythoughts toronto
8,the best defense against disturbing new diseases like #coronavirus is bolstering public health systems of the world’s,unreliable,best defense disturbing new disease like coronavirus bolstering public health system world
9,"china, desperate to stop coronavirus, turns neighbor against neighbor “(people from wuhan) are pariahs in china, among the millions unable to go home and feared as potential carriers of the mysterious coronavirus.",unreliable,china desperate stop coronavirus turn neighbor neighbor people wuhan pariah china among million unable go home feared potential carrier mysterious coronavirus


In [24]:
df[df.duplicated(subset=['cleaned'], keep=False)].sort_values('cleaned').head(20)

Unnamed: 0,text,label,cleaned


In [25]:
print('before dropping duplicates', len(df))
df = df.drop_duplicates(subset = ['cleaned'], keep='first')
print('after', len(df))

before dropping duplicates 557
after 557


In [26]:
df['num_words'] = df.cleaned.str.split().apply(len)

In [27]:
df.head()

Unnamed: 0,text,label,cleaned,num_words
0,we are living in scary times in canada. gov’t refuses to protect canadians from coronavirus,unreliable,living scary time canada gov refuse protect canadian coronavirus,9
1,"just as bad in canada. in fact, our government is now accusing us of racism for blaming the chinese for coronavirus",unreliable,bad canada fact government accusing us racism blaming chinese coronavirus,10
2,"it was only a matter of time before the mainstream media decided to blame the coronavirus on climate change, i suppose.",unreliable,matter time mainstream medium decided blame coronavirus climate change suppose,10
3,"russia's taking no chances: foreigners infected with the new chinese coronavirus will be quarantined, isolated &amp; deported, pm",unreliable,russia taking chance foreigner infected new chinese coronavirus quarantined isolated amp deported pm,13
4,"although there is now a presumptive confirmed case of wuhan novel coronavirus, i want you to know that ontario is prepared",unreliable,although presumptive confirmed case wuhan novel coronavirus want know ontario prepared,11


In [28]:
df[df.num_words == 3]

Unnamed: 0,text,label,cleaned,num_words
243,coronavirus is fake news,unreliable,coronavirus fake news,3
247,@koshtorontosun don't lie to us #coronaviruse,unreliable,lie us coronaviruse,3
499,how to tell if a cold is covid-19,true,tell cold covid,3


In [29]:
df[df.num_words == 2]

Unnamed: 0,text,label,cleaned,num_words


In [30]:
# removal of less than 3 words
df = df[df.num_words > 2]

In [31]:
df.label.value_counts(), df.label.value_counts(normalize=True)

(unreliable    280
 true          277
 Name: label, dtype: int64,
 unreliable    0.502693
 true          0.497307
 Name: label, dtype: float64)

In [32]:
df.to_pickle('../data/cleaned_tweets_test.pkl')