In [1]:
# import package
import requests 
import time
import pandas as pd
import numpy as np
from tqdm import tqdm

# Uncomment the line below to install the tqdm package if it is not already installed.
# !conda install -c conda-forge tqdm

---

## Data Gathering

The data is collected from Reddit through its public JSON API.

In [14]:
# create function for gathering data from reddit
def get_subreddit(subreddit, size):
    '''
    Download threads from a subreddit listing and return them as a dataframe.

    The function pages through https://www.reddit.com/r/<subreddit>.json,
    passing the `after` cursor of each response to the next request, and
    rewrites ../data/<subreddit>_backup.csv after every successful page.

    parameter
    ----------
    subreddit: name of the subreddit, given as a string
    size: approximate quantity of threads wanted; int(size // 25) pages are
          requested, so a size below 25 requests nothing

    return
    ----------
    threads from subreddit in dataframe form, one row per thread (the `data`
    dict of every listing child). Fewer than `size` rows are returned when
    the listing runs out or a request answers with a non-200 status; the
    threads collected up to that point are still returned.

    raise
    ----------
    exceptions raised by requests.get (connection failure, timeout) and by
    DataFrame.to_csv (e.g. missing ../data directory) are not caught
    '''
    headers = {'User-Agent': 'Test'}
    url = f'https://www.reddit.com/r/{subreddit}.json'
    # 25 is the page size assumed by the original code
    # NOTE(review): presumably Reddit's default listing length -- confirm
    n_pages = int(size // 25)

    posts = []
    after = None
    for page in tqdm(range(n_pages)):
        # the first request carries no cursor; later ones continue after the last thread seen
        params = {} if after is None else {'after': after}
        # a timeout keeps a stalled connection from blocking the notebook forever
        res = requests.get(url, params=params, headers=headers, timeout=30)
        if res.status_code != 200:
            print('Error')
            print(res.status_code)
            break

        listing = res.json()['data']
        posts.extend(listing['children'])
        after = listing['after']

        # backup data if error happen
        backup = pd.DataFrame([post['data'] for post in posts])
        backup.to_csv(f'../data/{subreddit}_backup.csv')

        if after is None:
            # without a cursor the next request would start again from the
            # first page and only add duplicated threads, so stop here
            break

        if page < n_pages - 1:
            # increase time for not trigger the reddit server
            # (randint excludes the upper bound, so this waits 1 or 2 seconds)
            time.sleep(np.random.randint(1, 3))

    # change list of data to dataframe
    df_subreddit = pd.DataFrame([post['data'] for post in posts])
    print(f'{subreddit} corpus has {df_subreddit.shape[0]} documents and has {df_subreddit.shape[1]} features.')

    return df_subreddit

### Civil Engineering Corpus

In [41]:
# request 1000 threads from the civilengineering subreddit
civil_corpus = get_subreddit(subreddit='civilengineering', size=1000)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [02:05<00:00,  3.14s/it]

civilengineering corpus has 997 documents and has 113 features.





In [46]:
# count civil engineering rows whose title repeats an earlier row
civil_corpus.duplicated(subset='title').sum()

6

In [47]:
# keep only the first row for every title in the civil engineering corpus
civil_corpus.drop_duplicates(subset=['title'], keep='first', inplace=True)

### Architecture Corpus

In [42]:
# request 1000 threads from the architecture subreddit
arch_corpus = get_subreddit(subreddit='architecture', size=1000)

100%|██████████████████████████████████████████████████████████████████████████████████| 40/40 [01:54<00:00,  2.86s/it]

architecture corpus has 983 documents and has 118 features.





In [43]:
# count architecture rows whose title repeats an earlier row
arch_corpus.duplicated(subset='title').sum()

85

In [45]:
# keep only the first row for every title in the architecture corpus
arch_corpus.drop_duplicates(subset=['title'], keep='first', inplace=True)

### Merge Civil Engineering and Architecture Corpus

In [48]:
# report the row and column counts of both corpora, one line per corpus
print(
    f'civil corpus {civil_corpus.shape[0]} rows {civil_corpus.shape[1]} columns',
    f'arch corpus {arch_corpus.shape[0]} rows {arch_corpus.shape[1]} columns',
    sep='\n',
)

civil corpus 991 rows 113 columns
arch corpus 898 rows 118 columns


In [49]:
# stack the civil engineering and architecture corpora into one dataframe;
# ignore_index=True gives the result a fresh 0..n-1 index, otherwise the row
# labels of the two source frames would repeat in the merged corpus
corpus = pd.concat([civil_corpus, arch_corpus], ignore_index=True)

In [50]:
# share of each subreddit in the merged corpus, expressed in percent
corpus['subreddit'].value_counts(normalize=True) * 100

civilengineering    52.46162
architecture        47.53838
Name: subreddit, dtype: float64

In [51]:
# write the merged corpus to csv without the dataframe index
corpus.to_csv(path_or_buf='../data/civil_arch_corpus.csv', index=False)