## 1. Import libraries 

In [1]:
import pandas as pd
import requests 
import json 
import csv 
import time
import datetime
import numpy as np

## 2. Functions 

In [2]:
def separate_missing_values(endpoint, filename, attrs):
    """Drop records that carry no usable text and save the rest to a new CSV.

    Reads ``filename``; for the ``'submission'`` endpoint the literal
    ``'[deleted]'`` in the ``selftext`` column is treated as missing. Rows in
    which *all* of the ``attrs`` columns are missing are removed, and the
    remaining rows are written (without the index) to ``'cleanned_' + filename``.
    The record counts before and after filtering are printed.

    Parameters
    ----------
    endpoint : str
        ``'submission'`` enables the ``'[deleted]'`` handling; any other value
        skips it.
    filename : str
        CSV file to read; also used to build the output file name.
    attrs : list of str
        Columns inspected for missing values; a row is dropped only when every
        one of them is missing.

    Returns
    -------
    None
    """
    df = pd.read_csv(filename)
    print("Number of original records: ", len(df))

    # Treat '[deleted]' placeholders as missing text.
    # The result is assigned back to the column: a chained
    # ``df[col].replace(..., inplace=True)`` operates on a temporary object and
    # leaves ``df`` untouched under pandas Copy-on-Write. ``np.nan`` is used
    # because the ``np.NaN`` alias no longer exists in NumPy 2.0.
    if endpoint == 'submission':
        df['selftext'] = df['selftext'].replace('[deleted]', np.nan)

    # Keep a row as long as at least one of the requested columns has a value.
    df = df.dropna(subset=attrs, how='all')

    # The 'cleanned_' prefix (sic) is kept as-is: later cells read these names.
    df.to_csv('cleanned_' + filename, index=False)
    print('Number of valid records:', len(df))
    

## 3. Cleaning data

In [21]:
# Drop comment rows whose 'body' is missing; the result is written to
# 'cleanned_data_covid_comment.csv'.
separate_missing_values('comment', 'data_covid_comment.csv', ['body'])

Number of original records:  829
Number of valid records: 829


In [22]:
# Drop submission rows where both 'selftext' and 'title' are missing; the
# result is written to 'cleanned_data_covid_submission.csv'.
separate_missing_values('submission', 'data_covid_submission.csv', ['selftext', 'title'])

Number of original records:  105
Number of valid records: 105


## 4. Transforming data

Index(['comment_id', 'author', 'score', 'created', 'permalink', 'flair',
       'body', 'parent_id', 'subreddit', 'subreddit_id'],
      dtype='object')
Index(['submission_id', 'title', 'url', 'author', 'score', 'created',
       'num_comments', 'permalink', 'flair', 'selftext', 'subreddit',
       'subreddit_id'],
      dtype='object')


In [31]:
# Load the cleaned files written by separate_missing_values.
# The names df, df2 and ndf are kept because later cells display them.
df = pd.read_csv('cleanned_data_covid_comment.csv')
df2 = pd.read_csv('cleanned_data_covid_submission.csv')

# Comments: map onto the shared (id, type, text, created) layout.
df['type'] = 'comment'
df['id'] = df['comment_id']
df['text'] = df['body']
df = df[['id', 'type', 'text', 'created']]

# Submissions: same layout, with the text built from body and title.
df2['type'] = 'submission'
df2['id'] = df2['submission_id']
# Blank out missing values only in the two text columns (a frame-wide replace
# would also rewrite unrelated columns), so a row with just one of them still
# yields a string instead of NaN.
submission_selftext = df2['selftext'].fillna('').astype(str)
submission_title = df2['title'].fillna('').astype(str)
# Join with a space so the last word of the body is not fused with the first
# word of the title; strip removes the stray space when one side is empty.
df2['text'] = (submission_selftext + ' ' + submission_title).str.strip()
df2 = df2[['id', 'type', 'text', 'created']]

# Merge both datasets; ignore_index gives the combined frame unique row labels
# instead of repeating 0..n-1 from each input.
ndf = pd.concat([df, df2], ignore_index=True)
# NOTE(review): the '.cv' extension looks like a typo for '.csv'. It is left
# unchanged here because other code may read this exact file name — confirm.
ndf.to_csv('transform_data.cv', index=False)



In [32]:
# Preview the first rows of the reshaped comments frame.
df.head()

Unnamed: 0,id,type,text,created
0,fj83m8c,comment,The report (PDF) https://www.who.int/docs/defa...,2020-03-01 18:13:23
1,fjdvlek,comment,"Within the fever clinics in Guangdong, the per...",2020-03-03 13:28:33
2,fji62i6,comment,Yeah I've said for a while that there's basica...,2020-03-04 17:33:29
3,fkcki9q,comment,&gt; now risking a hundred people is a few hou...,2020-03-12 19:17:07
4,fkfiml1,comment,"&gt; strongly disagree with the ""and we now kn...",2020-03-13 17:33:41


In [33]:
# Preview the first rows of the reshaped submissions frame.
df2.head()

Unnamed: 0,id,type,text,created
0,fkk4tj,submission,The cumulative number of confirmed COVID-19 in...,2020-03-18 01:04:16
1,fkkcwe,submission,The cumulative number of confirmed COVID-19 in...,2020-03-18 01:23:49
2,fm2nql,submission,A Rough Guide to Getting a COVID-19 Lockdown R...,2020-03-20 17:00:12
3,fme8v1,submission,Covid-19 has caused the UK in a partial lockdown,2020-03-21 08:30:21
4,fmea7j,submission,Covid-19 has cause the UK to be in a partial l...,2020-03-21 08:33:43


In [34]:
# Preview the merged frame; comments come first because they were passed
# first to pd.concat.
ndf.head()

Unnamed: 0,id,type,text,created
0,fj83m8c,comment,The report (PDF) https://www.who.int/docs/defa...,2020-03-01 18:13:23
1,fjdvlek,comment,"Within the fever clinics in Guangdong, the per...",2020-03-03 13:28:33
2,fji62i6,comment,Yeah I've said for a while that there's basica...,2020-03-04 17:33:29
3,fkcki9q,comment,&gt; now risking a hundred people is a few hou...,2020-03-12 19:17:07
4,fkfiml1,comment,"&gt; strongly disagree with the ""and we now kn...",2020-03-13 17:33:41


In [35]:
# Total number of merged records (comments plus submissions).
len(ndf)

934

In [None]:
# NOTE(review): the path is empty, so this placeholder cannot load anything as
# written — presumably it should point at the merged file written earlier
# ('transform_data.cv'); confirm. It also rebinds `df`, replacing the comments
# frame built in the transform cell.
df = pd.read_csv('') 