In [3]:
!pip install psaw

Collecting psaw
  Downloading psaw-0.1.0-py3-none-any.whl (15 kB)
Installing collected packages: psaw
Successfully installed psaw-0.1.0


In [2]:
import pandas as pd

### Scrape Reddit data (posts and comments) on a certain subreddit keyword search

In [4]:
from datetime import datetime
from psaw import PushshiftAPI
import json, pickle
from multiprocessing import Process, Manager
import timeit, random, bz2, lzma
from psaw import PushshiftAPI
import gzip, os


api = PushshiftAPI()

def crawl_subreddit(sub, directory, start_date, end_date, comments = False):
    #Example:  start_epoch: '20200101', end_epoch '20200303'
    start_epoch = datetime.strptime(start_date, '%Y%m%d')
    end_epoch = datetime.strptime(end_date, '%Y%m%d')
    
    prefix = "posts"
    if comments:
        prefix = 'comments'
        
    directory = "{}{}/{}/".format(directory, prefix, sub)
    
    if not os.path.exists(directory):
        os.makedirs(directory)
        
    if comments:
        gen = api.search_comments(subreddit = sub, after = start_epoch, before = end_epoch)
    else:
        gen = api.search_submissions(subreddit = sub, after = start_epoch, before = end_epoch)
        
    
    for dic in gen:
        dic = dic.d_
        created_time = datetime.fromtimestamp(float(dic['created_utc']))
        year = created_time.year
        month = created_time.month
        month_str = str(month)
        if len(month_str) < 2:
            month_str = "0{}".format(month_str)
        with gzip.open("{}{}_jsonlists.gz".format(directory, prefix), "at") as fout:
            fout.write("%s\n" % json.dumps(dic))
            
if __name__ == "__main__":
    crawl_subreddit('lactoseintolerant', 'data/', '20171025', '20221025', comments = False)
    crawl_subreddit('lactoseintolerant', 'data/', '20171025', '20221025', comments = True)

#### 1. first subreddit -'lactoseintolerance' from 10/25/2017 to 10/25/2022

In [41]:
crawl_subreddit('lactoseintolerance', 'data/', '20171025', '20221025', comments = False)
crawl_subreddit('lactoseintolerance', 'data/', '20171025', '20221025', comments = True)



##### Posts

Open as a dataframe from a saved json file for all posts under this subreddit

In [5]:
with open('data/posts/lactoseintolerance/posts_jsonlists', encoding='utf-8') as inputfile:
    df3 = pd.read_json(inputfile, lines = True)


Select columns that I'm interested in 

In [6]:
posts_df2 = df3[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]

In [7]:
posts_df2.shape

(579, 9)

##### Comments

Open as a dataframe from a saved json file for all comments under this subreddit

In [11]:
with open('data/comments/lactoseintolerance/comments_jsonlists', encoding='utf-8') as inputfile:
    df4 = pd.read_json(inputfile, lines = True)

In [12]:
comments_df2 = df4[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

In [13]:
comments_df2.shape

(1970, 8)

#### 1. Second subreddit -'lactoseintolerant' from 10/25/2017 to 10/25/2022

In [14]:
with open('data/posts/lactoseintolerant/posts_jsonlists', encoding='utf-8') as inputfile:
    df = pd.read_json(inputfile, lines = True)

#df.to_csv('csvfile.csv', encoding='utf-8', index=False)

In [15]:
df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,edited,steward_reports,removed_by,updated_utc,og_description,og_title,gilded,rte_mode,author_id,brand_safe
0,[],0.0,MyOversoul,,[],,text,t2_xqick,0.0,0.0,...,,,,,,,,,,
1,[],0.0,brainsareforlosers,,[],,text,t2_caai4dga,0.0,0.0,...,,,,,,,,,,
2,[],0.0,menickc,,[],,text,t2_10ymfk,0.0,0.0,...,,,,,,,,,,
3,[],0.0,Kenney93,,[],,text,t2_bxyt048l,0.0,0.0,...,,,,,,,,,,
4,[],0.0,MyToesAreBeans,,[],,text,t2_60ilpllr,0.0,0.0,...,,,,,,,,,,


In [16]:
posts_df = df[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]

In [17]:
posts_df.head()

Unnamed: 0,author,created_utc,id,num_comments,permalink,score,title,selftext,subreddit
0,MyOversoul,1666641624,ycl0xu,0,/r/lactoseintolerant/comments/ycl0xu/making_la...,1,making lactose free dairy products at home,So I made a big batch of ricotta cheese from g...,lactoseintolerant
1,brainsareforlosers,1666639469,yck563,0,/r/lactoseintolerant/comments/yck563/possible_...,1,possible lactose intolerance??,hi! hope this is the right place to post this!...,lactoseintolerant
2,menickc,1666582795,yc15g9,0,/r/lactoseintolerant/comments/yc15g9/chocolate...,1,Chocolate Milk Kills Me,I eat tons of cheese and dairy products but it...,lactoseintolerant
3,Kenney93,1666566345,ybvo4g,0,/r/lactoseintolerant/comments/ybvo4g/do_we_nee...,1,Do we need to take calcium with vitamin D supp...,"I am allergic to nuts, eggs, strawberries, man...",lactoseintolerant
4,MyToesAreBeans,1666563325,ybukfx,0,/r/lactoseintolerant/comments/ybukfx/milk/,1,Milk,Is white,lactoseintolerant


In [18]:
with open('data/comments/lactoseintolerant/comments_jsonlists', encoding='utf-8') as inputfile:
    df2 = pd.read_json(inputfile, lines = True)

In [19]:
comments_df = df2[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

In [20]:
comments_df.head()

Unnamed: 0,author,created_utc,body,id,parent_id,permalink,score,subreddit
0,Viking603,1666652618,"Dude. Been there. Ordered a ""hamburger ' in a ...",itnht1x,t3_ybs9yq,/r/lactoseintolerant/comments/ybs9yq/im_going_...,1,lactoseintolerant
1,MyOversoul,1666647377,Interesting thank you for the recipe share!,itn5v15,t1_itn15ih,/r/lactoseintolerant/comments/ycl0xu/making_la...,1,lactoseintolerant
2,matanuki,1666645497,"That's wild, I didn't think you could make ric...",itn15ih,t3_ycl0xu,/r/lactoseintolerant/comments/ycl0xu/making_la...,1,lactoseintolerant
3,[deleted],1666644250,[removed],itmxyeg,t3_ycl0xu,/r/lactoseintolerant/comments/ycl0xu/making_la...,1,lactoseintolerant
4,Shimerald,1666643897,Stomach and gut issues are pretty common with ...,itmx17n,t3_yck563,/r/lactoseintolerant/comments/yck563/possible_...,1,lactoseintolerant


In [21]:
comments_df.shape

(52601, 8)

In [22]:
posts_df.shape

(8743, 9)

#### 1. Third subreddit -'milkmemes' from 10/25/2017 to 10/25/2022

In [54]:
crawl_subreddit('milkmemes', 'data/', '20171025', '20221025', comments = False)
crawl_subreddit('milkmemes', 'data/', '20171025', '20221025', comments = True)

In [23]:
with open('data/posts/milkmemes/posts_jsonlists', encoding='utf-8') as inputfile:
    df5 = pd.read_json(inputfile, lines = True)

In [24]:
with open('data/comments/milkmemes/comments_jsonlists', encoding='utf-8') as inputfile:
    df6 = pd.read_json(inputfile, lines = True)

In [25]:
posts_df3 = df5[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]
comments_df3 = df6[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

#### Combine data scraped from these subreddits into one dataframe for posts and comments (separately) - lactose dataframe

In [26]:
lactose_df_posts = pd.concat([posts_df,posts_df2, posts_df3])
lactose_df_posts.shape #9362 posts in total

(9362, 9)

In [27]:
lactose_df_comments = pd.concat([comments_df, comments_df2, comments_df3])
lactose_df_comments.shape #54590 comments in total

(54590, 8)

#### Subreddit -'AncestryDNA' from 10/25/2017 to 10/25/2022

In [98]:
crawl_subreddit('AncestryDNA', 'data/', '20171025', '20221025', comments = False)
crawl_subreddit('AncestryDNA', 'data/', '20171025', '20221025', comments = True)

In [77]:
with open('data/posts/AncestryDNA/posts_jsonlists', encoding='utf-8') as inputfile:
    df7 = pd.read_json(inputfile, lines = True)
with open('data/comments/AncestryDNA/comments_jsonlists', encoding='utf-8') as inputfile:
    df8 = pd.read_json(inputfile, lines = True)

In [78]:
posts_df4 = df7[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]
comments_df4 = df8[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

#### Subreddit -'23andMe' from 10/25/2017 to 10/25/2022

In [69]:
crawl_subreddit('23andMe', 'data/', '20171025', '20221025', comments = False)
crawl_subreddit('23andMe', 'data/', '20171025', '20221025', comments = True)



In [79]:
with open('data/posts/23andMe/posts_jsonlists', encoding='utf-8') as inputfile:
    df9 = pd.read_json(inputfile, lines = True)
with open('data/comments/23andMe/comments_jsonlists', encoding='utf-8') as inputfile:
    df10 = pd.read_json(inputfile, lines = True)

In [80]:
posts_df5 = df9[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]
comments_df5 = df10[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

#### Subreddit -'MyHeritage' from 10/25/2017 to 10/25/2022

In [28]:
crawl_subreddit('MyHeritage', 'data/', '20171025', '20221025', comments = False)
crawl_subreddit('MyHeritage', 'data/', '20171025', '20221025', comments = True)

In [81]:
with open('data/posts/MyHeritage/posts_jsonlists', encoding='utf-8') as inputfile:
    df11 = pd.read_json(inputfile, lines = True)
with open('data/comments/MyHeritage/comments_jsonlists', encoding='utf-8') as inputfile:
    df12 = pd.read_json(inputfile, lines = True)

In [82]:
posts_df6 = df11[['author','created_utc','id','num_comments','permalink','score','title','selftext','subreddit']]
comments_df6 = df12[['author','created_utc','body','id','parent_id','permalink','score','subreddit']]

#### combine these three subreddits to a big dataframe (genome dataframe)

In [83]:
genome_posts_df = pd.concat([posts_df4, posts_df5, posts_df6])
genome_comments_df = pd.concat([comments_df4, comments_df5, comments_df6])

In [84]:
genome_posts_df.shape

(89101, 9)

In [85]:
genome_comments_df.shape

(1168827, 8)

#### save two dataframes into csv (for both posts and comments)

In [201]:
lactose_df_posts.to_csv('lactose_posts.csv')

In [202]:
lactose_df_comments.to_csv('lactoseintolerant_comments.csv')

In [199]:
# milk_posts_df.to_csv('milk_posts.csv')
# milk_comments_df.to_csv('milk_comments.csv')

In [86]:
genome_posts_df.to_csv('genome_posts.csv')
genome_comments_df.to_csv('genome_comments.csv')

In [89]:
lactose_df_posts = pd.read_csv('lactose_posts.csv')
lactose_df_posts.shape

(9362, 10)

In [90]:
lactose_df_comments = pd.read_csv('lactoseintolerant_comments.csv')
lactose_df_comments.shape

(54590, 9)

In [74]:
genome_posts_df = pd.read_csv('genome_posts.csv')
genome_posts_df.shape

(89101, 10)

In [75]:
genome_comments_df = pd.read_csv('genome_comments.csv')

  genome_comments_df = pd.read_csv('genome_comments.csv')


In [76]:
genome_comments_df.shape

(1206625, 9)

### basic cleaning

In [91]:
lactose_df_posts = lactose_df_posts[lactose_df_posts['author'] != '[deleted]']
lactose_df_comments = lactose_df_comments[lactose_df_comments['author'] != '[deleted]']

In [87]:
genome_posts_df = genome_posts_df[genome_posts_df['author'] != '[deleted]']
genome_comments_df = genome_comments_df[genome_comments_df['author'] != '[deleted]']

In [92]:
lactose_df_posts.shape

(9299, 10)

In [88]:
genome_posts_df.shape

(87118, 9)

In [93]:
def first_clean(text): 
    text = str(text)
    text = text.replace('\\n',' ')
    text = text.replace('&amp',' ')
    text = text.replace(';#x200B;',' ')
    text = text.replace('nbsp',' ')
    
    return text

In [94]:
lactose_df_comments['body'] = lactose_df_comments['body'].apply(lambda x : first_clean(x))
lactose_df_posts['selftext'] = lactose_df_posts['selftext'].apply(lambda x : first_clean(x))
lactose_df_posts['title'] = lactose_df_posts['title'].apply(lambda x : first_clean(x))

In [95]:
genome_comments_df['body'] = genome_comments_df['body'].apply(lambda x : first_clean(x))
genome_posts_df['selftext'] = genome_posts_df['selftext'].apply(lambda x : first_clean(x))
genome_posts_df['title'] = genome_posts_df['title'].apply(lambda x : first_clean(x))

In [96]:
lactose_df_posts['created_utc'] = pd.to_datetime(lactose_df_posts['created_utc'], unit='s')
lactose_df_comments['created_utc'] = pd.to_datetime(lactose_df_comments['created_utc'], unit='s')

In [98]:
genome_posts_df['created_utc'] = pd.to_datetime(genome_posts_df['created_utc'], unit='s')
genome_comments_df['created_utc'] = pd.to_datetime(genome_comments_df['created_utc'], unit='s')

In [99]:
#Data Manipulation and Storage
import pandas as pd
import sqlite3

# Text Cleaning
import string
import re
import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

#Generating n-grams
from gensim.models import Phrases

In [100]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jieyujiao/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [101]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/jieyujiao/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [102]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords_english = stopwords.words('english')

In [103]:
def clean_text(text): 
    #make string lowercase 
    text = str(text)
    text = text.lower()
    
    #remove links
    text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)

    #tokenize
    tokens = nltk.word_tokenize(text) 
    clean_text = []
    
    #remove stopwords, puncuation, then lemmatize
    for word in tokens:
        if (word not in stopwords_english and word not in string.punctuation): 
            token = wordnet_lemmatizer.lemmatize(word)
            clean_text.append(token)
            
    #remove words of length 3 or smaller        
    clean_text = [token for token in clean_text if len(token) > 3] 
            
    return clean_text      

In [104]:
lactose_df_comments['body_clean'] = lactose_df_comments['body'].apply(lambda x : clean_text(x))
lactose_df_posts['selftext_clean'] = lactose_df_posts['selftext'].apply(lambda x : clean_text(x))
lactose_df_posts['title_clean'] = lactose_df_posts['title'].apply(lambda x : clean_text(x))

In [106]:
genome_comments_df['body_clean'] = genome_comments_df['body'].apply(lambda x : clean_text(x))
genome_posts_df['selftext_clean'] = genome_posts_df['selftext'].apply(lambda x : clean_text(x))
genome_posts_df['title_clean'] = genome_posts_df['title'].apply(lambda x : clean_text(x))

In [110]:
genome_posts_df.head(20)

Unnamed: 0,author,created_utc,id,num_comments,permalink,score,title,selftext,subreddit,selftext_clean,title_clean
0,isaiah_45__,2022-10-24 23:14:01,ycplol,0,/r/AncestryDNA/comments/ycplol/using_wales_as_...,1,"Using Wales as an example, do the areas and ci...",,AncestryDNA,[],"[using, wale, example, area, city, highlighted..."
1,kb7773,2022-10-24 23:01:39,ycpc0y,0,/r/AncestryDNA/comments/ycpc0y/has_anyone_ever...,1,Has anyone ever had an unusual and surprising ...,send me a dm with your story pls! its for a ne...,AncestryDNA,"[send, story, newspaper, article]","[anyone, ever, unusual, surprising, ancestry/2..."
2,takisandrockstar,2022-10-24 22:51:24,ycp40n,0,/r/AncestryDNA/comments/ycp40n/dna_results_new...,1,DNA results + new community,,AncestryDNA,[],"[result, community]"
3,achkerli,2022-10-24 22:16:27,ycoc5o,0,/r/AncestryDNA/comments/ycoc5o/dad_is_100_perc...,1,Dad is 100 percent British and Irish on 23andM...,,AncestryDNA,[],"[percent, british, irish, 23andme, however, my..."
4,ForgettablePhoenix,2022-10-24 21:59:10,ycnxj6,0,/r/AncestryDNA/comments/ycnxj6/my_results_i_kn...,1,My results. I know Parent 1 is my mom and Pare...,,AncestryDNA,[],"[result, know, parent, parent]"
5,blabbedybloobla,2022-10-24 21:18:45,ycmz69,0,/r/AncestryDNA/comments/ycmz69/why_are_the_col...,1,Why are the colors switched?,,AncestryDNA,[],"[color, switched]"
6,bangtan-bot,2022-10-24 20:05:30,ycl5oj,0,/r/AncestryDNA/comments/ycl5oj/new_communities...,1,New communities update!,,AncestryDNA,[],"[community, update]"
7,WanderingWombats,2022-10-24 19:45:11,ycknf2,0,/r/AncestryDNA/comments/ycknf2/i_struggle_to_r...,1,I struggle to read cursive so I tested OCR han...,[removed],AncestryDNA,[removed],"[struggle, read, cursive, tested, handwriting,..."
8,Fragrant_Ad_7882,2022-10-24 19:01:15,ycjjyq,0,/r/AncestryDNA/comments/ycjjyq/old_results_com...,1,Old Results compared with the new ones + commu...,,AncestryDNA,[],"[result, compared, community, family, oh/nw]"
9,CNickyD,2022-10-24 18:23:10,ycil6i,0,/r/AncestryDNA/comments/ycil6i/this_is_the_pla...,1,This is the plantation my 6th great-grandmothe...,I feel surprisingly emotional about this… Weir...,AncestryDNA,"[feel, surprisingly, emotional, this…, weird, ...","[plantation, great-grandmother, mulatto, slave..."


In [112]:
lactose_df_comments['body_length'] = lactose_df_comments['body_clean'].apply(lambda x : len(x))
lactose_df_posts['selftext_length'] = lactose_df_posts['selftext_clean'].apply(lambda x : len(x))
lactose_df_posts['title_length'] = lactose_df_posts['title_clean'].apply(lambda x : len(x))

lactose_df_comments = lactose_df_comments[lactose_df_comments['body_length'] >= 5]
lactose_df_posts = lactose_df_posts[lactose_df_posts['selftext_length'] >= 5]

In [113]:
genome_comments_df['body_length'] = genome_comments_df['body_clean'].apply(lambda x : len(x))
genome_posts_df['selftext_length'] = genome_posts_df['selftext_clean'].apply(lambda x : len(x))
genome_posts_df['title_length'] = genome_posts_df['title_clean'].apply(lambda x : len(x))

genome_comments_df= genome_comments_df[genome_comments_df['body_length'] >= 5]
genome_posts_df= genome_posts_df[genome_posts_df['selftext_length'] >= 5 ]

In [114]:
lactose_df_comments = lactose_df_comments.drop(columns=['body_length'])
lactose_df_posts = lactose_df_posts.drop(columns=['title_length','selftext_length'])

In [115]:
genome_comments_df = genome_comments_df.drop(columns=['body_length'])
genome_posts_df = genome_posts_df.drop(columns=['title_length','selftext_length'])

In [118]:
docs1 = pd.concat([lactose_df_posts['title_clean'], lactose_df_posts['selftext_clean'], lactose_df_comments['body_clean']])
bigram1 = Phrases(docs1, min_count=10)
trigram1 = Phrases(bigram1[docs1])

In [119]:
docs3 = pd.concat([genome_posts_df['title_clean'], genome_posts_df['selftext_clean'], genome_comments_df['body_clean']])
bigram3 = Phrases(docs3, min_count=10)
trigram3 = Phrases(bigram3[docs3])

In [120]:
#doc is of type - list. Expecting the tokenized sentences 
def add_ngram(doc): 
    return trigram3[bigram3[doc]]

In [121]:
lactose_df_comments['body_ngrams'] = lactose_df_comments['body_clean'].apply(lambda x : add_ngram(x))
lactose_df_posts['selftext_ngrams'] = lactose_df_posts['selftext_clean'].apply(lambda x : add_ngram(x))
lactose_df_posts['title_ngrams'] = lactose_df_posts['title_clean'].apply(lambda x : add_ngram(x))

In [122]:
genome_comments_df['body_ngrams'] = genome_comments_df['body_clean'].apply(lambda x : add_ngram(x))
genome_posts_df['selftext_ngrams'] = genome_posts_df['selftext_clean'].apply(lambda x : add_ngram(x))
genome_posts_df['title_ngrams'] = genome_posts_df['title_clean'].apply(lambda x : add_ngram(x))

In [127]:
lactose_df_posts.shape

(6992, 14)

In [128]:
lactose_df_comments.shape

(41813, 11)

In [126]:
genome_posts_df.shape

(25852, 13)

In [124]:
genome_comments_df.shape

(768767, 10)

In [129]:
lactose_df_posts.to_pickle('lactose_posts')
lactose_df_comments.to_pickle('lactose_comments')

In [None]:
genome_posts_df.to_pickle('genome_posts')
genome_comments_df.to_pickle('genome_comments')