## 1. Import libraries 

In [1]:
import pandas as pd
import requests 
import json 
import csv 
import time
import datetime
import numpy as np

## 2. Functions 

In [2]:
def separate_missing_values(endpoint, filename, attrs):
    """Drop records that have no usable text and save the remainder as CSV.

    Reads ``filename`` with pandas, removes every row in which all of the
    ``attrs`` columns are missing, and writes the surviving rows (without the
    index) to ``'cleanned_' + filename``. The record count is printed before
    and after filtering.

    Parameters
    ----------
    endpoint : str
        When equal to ``'submission'``, the literal ``'[deleted]'`` in the
        ``selftext`` column is treated as missing before filtering. Any other
        value skips that step.
    filename : str
        CSV file to read. The output name is built by prefixing this string,
        so it is expected to be a bare file name rather than a nested path.
    attrs : list of str
        Text columns to check. A row is dropped only when *all* of them are
        missing (``how='all'``).

    Returns
    -------
    None
        Results are written to disk and reported via ``print``.
    """
    df = pd.read_csv(filename)
    print("Number of original records: ", len(df))

    # Treat the '[deleted]' placeholder as missing text.
    # The result is assigned back instead of calling
    # ``df['selftext'].replace(..., inplace=True)``: that chained in-place call
    # operates on a temporary under pandas Copy-on-Write and would leave ``df``
    # untouched. ``np.nan`` is used because the ``np.NaN`` alias no longer
    # exists in NumPy 2.x.
    if endpoint == 'submission':
        df['selftext'] = df['selftext'].replace('[deleted]', np.nan)

    # Keep a row as long as at least one of the text columns has content.
    df = df.dropna(subset=attrs, how='all')

    df.to_csv('cleanned_' + filename, index=False)
    print('Number of valid records:', len(df))
    

## 3. Cleaning data

In [3]:
# Clean the comment export: a row is kept only if its 'body' column has a value.
separate_missing_values(
    endpoint='comment',
    filename='data_art_comment.csv',
    attrs=['body'],
)

Number of original records:  1100
Number of valid records: 1100


In [4]:
# Clean the submission export: a row is kept if 'selftext' or 'title' has a value.
separate_missing_values(
    endpoint='submission',
    filename='data_art_submission.csv',
    attrs=['selftext', 'title'],
)

Number of original records:  4902
Number of valid records: 4902


## 4. Transforming data

Index(['comment_id', 'author', 'score', 'created', 'permalink', 'flair',
       'body', 'parent_id', 'subreddit', 'subreddit_id'],
      dtype='object')
Index(['submission_id', 'title', 'url', 'author', 'score', 'created',
       'num_comments', 'permalink', 'flair', 'selftext', 'subreddit',
       'subreddit_id'],
      dtype='object')


In [6]:
# Load the cleaned exports produced in the cleaning section.
df = pd.read_csv('cleanned_data_art_comment.csv')
df2 = pd.read_csv('cleanned_data_art_submission.csv')

# Transform comments: the comment body is the text; tag each row with its kind.
df = (
    df.rename(columns={'comment_id': 'id', 'body': 'text'})
      .assign(type='comment')
)
df = df[['id', 'type', 'text', 'created']]

# Transform submissions: the text is the self-text followed by the title.
# Only the two text columns get their NaN values blanked, so string
# concatenation cannot yield NaN and the other columns keep their values.
# A space separates the two parts so the last word of the body does not fuse
# with the first word of the title; strip() removes the stray space left when
# one of the parts is empty.
selftext = df2['selftext'].fillna('').astype(str)
title = df2['title'].fillna('').astype(str)
df2 = (
    df2.rename(columns={'submission_id': 'id'})
       .assign(type='submission', text=(selftext + ' ' + title).str.strip())
)
df2 = df2[['id', 'type', 'text', 'created']]

# Merge both datasets; ignore_index gives the combined frame a unique
# 0..n-1 index instead of repeating each source frame's labels.
ndf = pd.concat([df, df2], ignore_index=True)

# NOTE(review): the '.cv' extension looks like a typo for '.csv'. It is kept
# unchanged because other code may read this exact path; confirm before
# renaming.
ndf.to_csv('art_transform_data.cv', index=False)



In [7]:
# Preview the first five transformed comment rows.
df.head(n=5)

Unnamed: 0,id,type,text,created
0,fg638ss,comment,https://www.google.com/amp/s/www.timeslive.co....,2020-02-01 05:01:31
1,fg6c0h8,comment,It’s actually an art project. I saw it in my l...,2020-02-01 08:19:18
2,fg6jyk6,comment,I don't know if they use this reddit but I not...,2020-02-01 10:13:03
3,fg6mfki,comment,Jayjel if it was possible your art is getting ...,2020-02-01 10:42:51
4,fg6n6xl,comment,You're referring to works by an artist named M...,2020-02-01 10:51:42


In [8]:
# Preview the first five transformed submission rows.
df2.head(n=5)

Unnamed: 0,id,type,text,created
0,ex50db,submission,The first question one should ask is what even...,2020-02-01 06:44:55
1,exb289,submission,Good morning Los Angeles! back in October/Nove...,2020-02-01 14:16:52
2,exdhp6,submission,@TheEconomist: An exhibition @FoundlingMuseum ...,2020-02-01 17:09:26
3,exdtao,submission,(Note: I would be very interested in replies s...,2020-02-01 17:32:15
4,exehim,submission,Friends art exhibition looked a good place to ...,2020-02-01 18:16:25


In [9]:
# Preview the first five rows of the merged dataset.
ndf.head(n=5)

Unnamed: 0,id,type,text,created
0,fg638ss,comment,https://www.google.com/amp/s/www.timeslive.co....,2020-02-01 05:01:31
1,fg6c0h8,comment,It’s actually an art project. I saw it in my l...,2020-02-01 08:19:18
2,fg6jyk6,comment,I don't know if they use this reddit but I not...,2020-02-01 10:13:03
3,fg6mfki,comment,Jayjel if it was possible your art is getting ...,2020-02-01 10:42:51
4,fg6n6xl,comment,You're referring to works by an artist named M...,2020-02-01 10:51:42


In [10]:
# Total number of rows in the merged dataset (comments + submissions).
ndf.shape[0]

6002