# Project 3: SubReddit Classifier

---

# Data Collection and Cleaning
---

## Bookclub


Scraping top 1000 posts from subreddit - 'bookclub'. Once the data is fully scrapped, the duplicate posts, null values posts and unwanted columns are removed to obtain a clean dataset and saved into a csv file.

### Data Collection

In [4]:
#Imports:
import requests
import pandas as pd
import time
import random

In [5]:
# specify the url
url = 'https://www.reddit.com/r/bookclub.json'

In [6]:
#requesting the website
res = requests.get(url)

In [7]:
#check the response status code
res.status_code

429

In [8]:
#Python has its own default user agent. 
#Since there are so many scripts out there that are already 'hitting' reddit's API,
#reddit is basically shutting down all Python scripts from accessing its API.
#We will change our request a little bit to make it not use the default user agent.
#creating useragent
res = requests.get(url, headers={'User-agent': 'books'})

In [9]:
#check the response status code
res.status_code

200

In [10]:
#Sends a JSON response composed of the specified data, stored in reddit_dict
reddit_dict = res.json()

In [11]:
posts = []
after = None

# since we need 1100 posts ,the loop runs 45 times
for a in range(45):
    if after == None:
        current_url = url
    else:
        current_url = url + '?after=' + after  # reddit_dict['data']['after'] gives the name of last post 
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'books'}) #defining the user agent
    
    if res.status_code != 200: # if reponse is not positive print status error and break -to run the program
        print('Status error', res.status_code)
        break
    
    current_dict = res.json() #storing JSON response
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)  # update in list called post
    after = current_dict['data']['after']  # update the value of last post in the current batch
    
    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,60)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/bookclub.json
22
https://www.reddit.com/r/bookclub.json?after=t3_e2bc5a
13
https://www.reddit.com/r/bookclub.json?after=t3_deawbb
53
https://www.reddit.com/r/bookclub.json?after=t3_d0abt7
36
https://www.reddit.com/r/bookclub.json?after=t3_clnw03
46
https://www.reddit.com/r/bookclub.json?after=t3_c64mug
4
https://www.reddit.com/r/bookclub.json?after=t3_bqjr2z
22
https://www.reddit.com/r/bookclub.json?after=t3_b2jslo
24
https://www.reddit.com/r/bookclub.json?after=t3_aowt30
36
https://www.reddit.com/r/bookclub.json?after=t3_aj4r56
3
https://www.reddit.com/r/bookclub.json?after=t3_a9mxk7
2
https://www.reddit.com/r/bookclub.json?after=t3_9r1z08
44
https://www.reddit.com/r/bookclub.json?after=t3_9e7s20
39
https://www.reddit.com/r/bookclub.json?after=t3_97wc7k
36
https://www.reddit.com/r/bookclub.json?after=t3_913b4e
33
https://www.reddit.com/r/bookclub.json?after=t3_8uuf7r
39
https://www.reddit.com/r/bookclub.json?after=t3_8odwcr
4
https://www.reddit.com/r/bookclub.

In [12]:
posts = []
after = None

for a in range(45):
    if after == None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'books'})
    
    if res.status_code != 200:
        print('Status error', res.status_code)
        break
    
    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']
    
     # store the information in csv file
    
    if a > 0:
        prev_posts = pd.read_csv('../data/books.csv')
        current_df = pd.DataFrame()
        
    else:
        pd.DataFrame(posts).to_csv('../data/books.csv', index = False)

    # generate a random sleep duration to look more 'natural'
    sleep_duration = random.randint(2,6)
    print(sleep_duration)
    time.sleep(sleep_duration)

https://www.reddit.com/r/bookclub.json
4
https://www.reddit.com/r/bookclub.json?after=t3_e2bc5a
3
https://www.reddit.com/r/bookclub.json?after=t3_deawbb
2
https://www.reddit.com/r/bookclub.json?after=t3_d0abt7
3
https://www.reddit.com/r/bookclub.json?after=t3_clnw03
6
https://www.reddit.com/r/bookclub.json?after=t3_c64mug
2
https://www.reddit.com/r/bookclub.json?after=t3_bqjr2z
4
https://www.reddit.com/r/bookclub.json?after=t3_b2jslo
4
https://www.reddit.com/r/bookclub.json?after=t3_aowt30
6
https://www.reddit.com/r/bookclub.json?after=t3_aj4r56
2
https://www.reddit.com/r/bookclub.json?after=t3_a9mxk7
6
https://www.reddit.com/r/bookclub.json?after=t3_9r1z08
4
https://www.reddit.com/r/bookclub.json?after=t3_9e7s20
4
https://www.reddit.com/r/bookclub.json?after=t3_97wc7k
3
https://www.reddit.com/r/bookclub.json?after=t3_913b4e
6
https://www.reddit.com/r/bookclub.json?after=t3_8uuf7r
6
https://www.reddit.com/r/bookclub.json?after=t3_8odwcr
3
https://www.reddit.com/r/bookclub.json?after=t3

In [13]:
# store the csv file
pd.DataFrame(posts).to_csv('../data/books.csv', index = False)

### Data Cleaning

In [3]:
# reading the csv file into pandas dataframes
books=pd.read_csv('../data/books.csv')

In [4]:
#displaying the first 5 rows of dataframe
books.head()

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent
0,,bookclub,Hello! \n\nWe have had an increase in posts th...,t2_51fft,False,,0,False,Appropriate Posts,[],...,all_ads,True,https://www.reddit.com/r/bookclub/comments/clq...,82239,1564883000.0,0,,False,,
1,,bookclub,Hi folks. We are starting a little later into ...,t2_7o4lq,False,,0,False,Moon of the Crusted Snow schedule,[],...,all_ads,True,https://www.reddit.com/r/bookclub/comments/ehe...,82239,1577669000.0,0,,False,,
2,,bookclub,Here's a discussion post for chapters 20-22 of...,t2_7o4lq,False,,0,False,[Scheduled] Moon of the Crusted Snow chapters ...,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/esm...,82239,1579748000.0,0,,False,,
3,,bookclub,Did anyone make a character list for Moon of t...,t2_5a7so90c,False,,0,False,Character List (Moon of the Uncrusted Snow),[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/esn...,82239,1579749000.0,0,,False,,
4,,bookclub,,t2_4noz5f81,False,,0,False,r/ThomasPynchon is hosting a 'Gravity's Rainbo...,[],...,all_ads,False,/r/ThomasPynchon/comments/erdoue/gravitys_rain...,82239,1579530000.0,0,,False,"[{'approved_at_utc': None, 'subreddit': 'Thoma...",t3_erdoue


In [5]:
# checking the number of columns and rows
books.shape

(1116, 102)

In [6]:
#checking the number of duplicate rows
books[books.duplicated(keep=False)]

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent
2,,bookclub,Here's a discussion post for chapters 20-22 of...,t2_7o4lq,False,,0,False,[Scheduled] Moon of the Crusted Snow chapters ...,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/esm...,82239,1.579748e+09,0,,False,,
10,,bookclub,I quickly read the chapters outlined in the sc...,t2_2ys69pqq,False,,0,False,Moon of the Crusted Snow: What do you think wi...,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/ek9...,82239,1.578207e+09,0,,False,,
13,,bookclub,**And now for something completely different (...,t2_w9yhm,False,,0,False,[Scheduled] The Three-Body Problem ch 21-26,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/ef3...,82239,1.577205e+09,0,,False,,
14,,bookclub,**Quick-ish Summary:** Ye Wenjie wraps up her ...,t2_w9yhm,False,,0,False,[Scheduled] The Three-Body Problem ch 14-20,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/edr...,82239,1.576945e+09,0,,False,,
17,,bookclub,"Sorry for the delay! When I made the schedule,...",t2_w9yhm,False,,0,False,[Scheduled] The Three-Body Problem ch 7-13,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/eat...,82239,1.576382e+09,0,,False,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1106,,bookclub,"Sorry this went up late, guys! It’s been a bit...",t2_7o4lq,False,,0,False,[Jane Eyre] Chapters 15-17 Discussion,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/ca1...,82239,1.562464e+09,0,,False,,
1108,,bookclub,"Hey, \n\n[July 9: Chapters 1-6](https://www.re...",t2_51fft,False,,0,False,Do Androids Dream of Electric Sheep schedule,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/c8l...,82239,1.562141e+09,0,,False,,
1111,,bookclub,There've been a few things that seem like they...,t2_hz3c9,False,,0,False,Jane Eyre -- Ch 1-9 -- unresolved items,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/c64...,82239,1.561637e+09,0,,False,,
1112,,bookclub,Hey everyone! \n\nThe book choice for July is ...,t2_51fft,False,,0,False,July's Selection,[],...,all_ads,False,https://www.reddit.com/r/bookclub/comments/c5w...,82239,1.561586e+09,0,,False,,


In [9]:
#dropping the duplicates
books.drop_duplicates(inplace=True)

In [10]:
# number of rows and columns after removing duplicates
books.shape

(1083, 102)

In [14]:
# columns in dataset
books.columns

Index(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved',
       'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext',
       ...
       'parent_whitelist_status', 'stickied', 'url', 'subreddit_subscribers',
       'created_utc', 'num_crossposts', 'media', 'is_video',
       'crosspost_parent_list', 'crosspost_parent'],
      dtype='object', length=102)

In [26]:
# storing the columns to be dropped in a variable
columns_drop=[column for column in books.columns if column !='subreddit'and column !='selftext' ]

In [28]:
#dropping unwanted columns
books.drop(columns_drop, axis=1, inplace=True)

In [33]:
# final dataframe
books.head()

Unnamed: 0,subreddit,selftext
0,bookclub,Hello! \n\nWe have had an increase in posts th...
1,bookclub,Hi folks. We are starting a little later into ...
2,bookclub,Here's a discussion post for chapters 20-22 of...
3,bookclub,Did anyone make a character list for Moon of t...
5,bookclub,I AM SO SORRY THIS IS LATE YOU GUYS I THOUGHT ...


In [30]:
#datatype of each variable in dataframe
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1083 entries, 0 to 1115
Data columns (total 2 columns):
subreddit    1083 non-null object
selftext     1074 non-null object
dtypes: object(2)
memory usage: 65.4+ KB


In [31]:
#dropping the rows with null values
books.dropna(axis = 0,inplace = True)

In [32]:
#datatype of each variable in dataframe
books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1074 entries, 0 to 1115
Data columns (total 2 columns):
subreddit    1074 non-null object
selftext     1074 non-null object
dtypes: object(2)
memory usage: 25.2+ KB


In [36]:
#saving the clean dataframe into a csv file
books.loc[ :].to_csv('../data/final_books.csv',index=False)