# Data Gathering and Cleaning

## Imports

In [150]:
import requests
import pandas as pd

In [151]:
#Pushshift API submission-search endpoint (get_posts sends its GET requests to this URL)
base = 'https://api.pushshift.io/reddit/search/submission/'

## Functions

In [152]:
#builds the query-parameter dictionary for one request (size defaults to 100, which is Pushshift API's maximum)
def set_param(sub, size = 100, date = None):
    """Return the request parameters as a dictionary.

    Args:
        sub: value stored under the 'subreddit' key.
        size: value stored under the 'size' key (defaults to 100).
        date: value stored under the 'before' key (defaults to None).

    Returns:
        A dict with the keys 'subreddit', 'size' and 'before', in that order.
    """
    return dict(subreddit = sub, size = size, before = date)

In [153]:
#sends one GET request to the API and hands back the posts it returned
def get_posts(params):
    """Request posts from the endpoint stored in `base`.

    Args:
        params: query parameters passed to requests.get.

    Returns:
        The value under the 'data' key of the JSON response when the status
        code is 200. For any other status code a message is printed and None
        is returned.
    """
    response = requests.get(base, params)
    #guard clause: anything other than HTTP 200 is reported and yields None
    if response.status_code != 200:
        print('Status Code Error')
        return None
    return response.json()['data']

In [154]:
#function to get n*100 posts with one function (finds the timestamp of the last post and uses that as the new starting point)
def get_n00_posts(subreddit, n, before = None):
    """Collect up to n pages of posts from one subreddit into a single DataFrame.

    The first page is requested with `before` as given. Every following page
    is requested with `before` set to the 'created_utc' value of the last row
    of the previous page.

    Args:
        subreddit: subreddit name passed on to set_param.
        n: number of pages to request; values below 2 fetch only the first page.
        before: 'before' value for the first request (None leaves it unset).

    Returns:
        A DataFrame with the rows of all fetched pages and a fresh 0..N-1
        index. Paging stops early when a page comes back empty or a request
        fails, so fewer than n pages may be included.
    """
    #fetch the first page once; it serves as both the first chunk and the paging cursor
    latest = pd.DataFrame(get_posts(set_param(subreddit, date = before)))
    pages = [latest]
    for _ in range(n - 1):
        #a failed request (get_posts returns None) or an exhausted listing gives an
        #empty frame without a 'created_utc' column, so there is nothing to page from
        if latest.empty or 'created_utc' not in latest.columns:
            break
        last_date = latest['created_utc'].iloc[-1]
        latest = pd.DataFrame(get_posts(set_param(subreddit, date = last_date)))
        if not latest.empty:
            pages.append(latest)
    #single concat at the end instead of re-copying the accumulated frame per page
    return pd.concat(pages, ignore_index = True)

In [155]:
#trims a dataframe down to the three columns used in this notebook
def col_keep(big_df):
    """Return only the 'subreddit', 'selftext' and 'title' columns of big_df.

    Args:
        big_df: DataFrame containing at least those three columns.

    Returns:
        A DataFrame holding the three columns in that order.
    """
    wanted = ['subreddit', 'selftext', 'title']
    return big_df[wanted]

## Data Gathering

In [135]:
#gathering data from first subreddit: 10 pages from 'linguistics', with the first
#request using before = 1648328400 (presumably a Unix epoch timestamp - confirm)
lin_df = get_n00_posts('linguistics', 10, before = 1648328400)

In [136]:
#checking data size of the linguistics dataframe as (rows, columns)
lin_df.shape

(999, 82)

In [147]:
#gathering data from second subreddit: 10 pages from 'conlangs', using the same
#before value as the linguistics pull
con_df = get_n00_posts('conlangs', 10, before = 1648328400)

In [148]:
#checking data size of the conlangs dataframe as (rows, columns)
con_df.shape

(997, 84)

In [139]:
#keeping only relevant columns ('subreddit', 'selftext', 'title') in both dataframes
lin_df = col_keep(lin_df)
con_df = col_keep(con_df)

In [161]:
#data check: display the trimmed linguistics dataframe
lin_df

Unnamed: 0,subreddit,selftext,title
0,linguistics,,The Great Law of Peace is supposed to have bee...
1,linguistics,I am 18 years old right now and I am consideri...,Is there a way I can work with language preser...
2,linguistics,[removed],Why is genitive the second case?
3,linguistics,I keep getting mixed results form just searchi...,what age can you no longer pick up a new accent
4,linguistics,[removed],"Genitives and derived adjectives, Locatives an..."
...,...,...,...
994,linguistics,Because of the former political entity Czechos...,Does Slovak descend from a common Czech/Slovak...
995,linguistics,[removed],Why is it that topics are untouched for years ...
996,linguistics,[removed],We know it's possible to reconstruct proto-lan...
997,linguistics,,About the universal contained in the Lithuania...


In [142]:
#combining dataframes from both subreddits and setting category to 1 for "linguistics" and 0 for "conlangs"
data = pd.concat([lin_df, con_df], ignore_index = True)
data['subreddit'] = data['subreddit'].map(lambda x: 1 if x == 'linguistics' else 0)
#rename returns a new dataframe rather than modifying in place, so the result must be
#assigned back; otherwise the column is still called 'subreddit' in later cells
data = data.rename(columns = {'subreddit': 'linguistics'})
#display the combined, relabelled dataframe
data

Unnamed: 0,linguistics,selftext,title
0,1,,The Great Law of Peace is supposed to have bee...
1,1,I am 18 years old right now and I am consideri...,Is there a way I can work with language preser...
2,1,[removed],Why is genitive the second case?
3,1,I keep getting mixed results form just searchi...,what age can you no longer pick up a new accent
4,1,[removed],"Genitives and derived adjectives, Locatives an..."
...,...,...,...
1991,0,So I love different bases and wanted to see wh...,Base Prime numbers with Python (Code at end of...
1992,0,"If so, why? What part of conlanging do you enj...",Are you interested in programming?
1993,0,"Australia, as with other parts of the world wi...",Australian languages?
1994,0,"Hello, I am currently getting back into conlan...",Ancestral Aunaic


In [156]:
#exporting data to CSV file (with to_csv's defaults the row index is written as the
#first column; assumes the 'data' directory already exists - confirm)
data.to_csv('data/posts.csv')

In [159]:
#checking size of the combined dataframe as (rows, columns)
data.shape

(1996, 3)