# Setup

In [227]:
import numpy as np
import pandas as pd
import requests
import time
import datetime as dt
import json
import re

# Data Collection

In [21]:
def pushshift(subreddit, post_type, loops=1, size=100):
# subreddit: name of subreddit (str)
# post_type: 'submission' or 'comment'. 
# loops: number of times to request posts (int)
# size: number of posts per request (maximum 100) (int)

    # submissions
    submission = ['id', 'title', 'selftext', 'subreddit']    
    
    #  comments
    comment = ['id', 'body', 'created_utc', 'subreddit']
    
    # instantiate list
    list_posts = [] 
    base_url = "https://api.pushshift.io/reddit/search/{}/?subreddit={}&size={}".format(post_type, subreddit, size)

    # error exception
    if post_type not in ['submission', 'comment']:
        return None
    
    for i in range(loops):
        url = '{}&after={}d'.format(base_url, i) 
        print(i, url)
        res = requests.get(url)
        list_posts.extend(res.json()['data']) 
        time.sleep(1) 
    
    #turn list_posts into a dataframe. each row corresponds to a post
    df_posts = pd.DataFrame.from_dict(list_posts)

    # filter fields
    if post_type == 'submission':
        df_posts = df_posts[submission]
    elif post_type == 'comment':
        df_posts = df_posts[comment]  

    # drop duplicates
    df_posts.drop_duplicates(inplace=True)
    
    # identify post type
    df_posts['post_type'] = post_type
    
    return df_posts

In [22]:
apple = pushshift('apple', post_type='submission', loops=50)
print('shape', apple.shape)
apple.to_csv('apple.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=0d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=1d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=2d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=3d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=4d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=5d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=6d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=7d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=8d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=9d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=apple&size=100&after=10d
11 https://api.pushshift.io/reddit/search

In [23]:
microsoft = pushshift('microsoft', post_type='submission', loops=50)
print('shape', microsoft.shape)
apple.to_csv('microsoft.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=0d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=1d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=2d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=3d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=4d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=5d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=6d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=7d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=8d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=9d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=microsoft&size=100&after=1

In [31]:
facebook = pushshift('facebook', post_type='submission', loops=50)
print('shape', facebook.shape)
facebook.to_csv('facebook.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=0d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=1d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=2d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=3d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=4d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=5d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=6d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=7d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=8d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=9d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=facebook&size=100&after=10d
11 https

In [33]:
amazon = pushshift('amazon', post_type='submission', loops=90)
print('shape', amazon.shape)
amazon.to_csv('amazon.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=0d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=1d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=2d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=3d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=4d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=5d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=6d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=7d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=8d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=9d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=amazon&size=100&after=10d
11 https://api.pushshift.io/re

In [32]:
google = pushshift('google', post_type='submission', loops=70)
print('shape', google.shape)
google.to_csv('google.csv')

0 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=0d
1 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=1d
2 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=2d
3 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=3d
4 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=4d
5 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=5d
6 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=6d
7 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=7d
8 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=8d
9 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=9d
10 https://api.pushshift.io/reddit/search/submission/?subreddit=google&size=100&after=10d
11 https://api.pushshift.io/re

In [241]:
df = pd.concat([facebook[['title', 'selftext', 'subreddit']], 
                apple[['title', 'selftext', 'subreddit']],
                amazon[['title', 'selftext', 'subreddit']],
                google[['title', 'selftext', 'subreddit']],
                microsoft[['title', 'selftext', 'subreddit']]]
               ,ignore_index=True)
df.to_csv('data.csv', index=False)

In [242]:
df

Unnamed: 0,title,selftext,subreddit
0,Account Disabled by Facebook - what might be t...,Hello to everyone around.\n\n&amp;#x200B;\n\nI...,facebook
1,Facebook apologises for Plymouth Hoe 'error',,facebook
2,Can't log in to access code generator,Hi. My FB account got hacked 3 days ago and af...,facebook
3,"Facebook- Deleted old account, created new. No...","Yes, I had some account suspensions with my ol...",facebook
4,Why all countries should ban Facebook and Twitter,,facebook
...,...,...,...
23646,These Roborock Robot Vacuum Deals Will Suck Up...,,microsoft
23647,Save a ton of dough on Certified Refurbished A...,,microsoft
23648,How can I contact Microsoft regarding my email?,[removed],microsoft
23649,Save Some Coin on Your Smart Home with Google ...,,microsoft


In [243]:
df['subreddit'].value_counts()

facebook     4932
apple        4851
microsoft    4809
google       4688
amazon       4371
Name: subreddit, dtype: int64

# Cleaning + EDA

In [244]:
df.isnull().sum()

title          0
selftext     301
subreddit      0
dtype: int64

In [245]:
df.duplicated().sum()

1063

In [246]:
df.drop_duplicates(inplace = True)

In [247]:
df = df.fillna('[removed]')

In [248]:
df['subreddit'].value_counts()

facebook     4813
apple        4770
microsoft    4461
google       4428
amazon       4116
Name: subreddit, dtype: int64

### Map results to binary values

In [249]:
df['target'] = df['subreddit'].map({'facebook': 0, 
                                    'apple': 1,
                                    'amazon' : 2,
                                    'google' : 3,
                                    'microsoft' : 4
                                   })
df.drop(['subreddit'], axis=1, inplace=True)
df

Unnamed: 0,title,selftext,target
0,Account Disabled by Facebook - what might be t...,Hello to everyone around.\n\n&amp;#x200B;\n\nI...,0
1,Facebook apologises for Plymouth Hoe 'error',,0
2,Can't log in to access code generator,Hi. My FB account got hacked 3 days ago and af...,0
3,"Facebook- Deleted old account, created new. No...","Yes, I had some account suspensions with my ol...",0
4,Why all countries should ban Facebook and Twitter,,0
...,...,...,...
23646,These Roborock Robot Vacuum Deals Will Suck Up...,,4
23647,Save a ton of dough on Certified Refurbished A...,,4
23648,How can I contact Microsoft regarding my email?,[removed],4
23649,Save Some Coin on Your Smart Home with Google ...,,4


### redditcleaner, tokenization, and lemmatizer

In [250]:
import redditcleaner
df['selftext'] = df['selftext'].map(redditcleaner.clean)
#redditcleaner is an open source python module for cleaning reddit data.
'''
parameters = {newline=True, quote=True, bullet_point=True, 
          link=True, strikethrough=True, spoiler=True,
          code=True, superscript=True, table=True, heading=True}
'''

'\nparameters = {newline=True, quote=True, bullet_point=True, \n          link=True, strikethrough=True, spoiler=True,\n          code=True, superscript=True, table=True, heading=True}\n'

In [251]:
df

Unnamed: 0,title,selftext,target
0,Account Disabled by Facebook - what might be t...,Hello to everyone around. I'm facing this pro...,0
1,Facebook apologises for Plymouth Hoe 'error',,0
2,Can't log in to access code generator,Hi. My FB account got hacked 3 days ago and af...,0
3,"Facebook- Deleted old account, created new. No...","Yes, I had some account suspensions with my ol...",0
4,Why all countries should ban Facebook and Twitter,,0
...,...,...,...
23646,These Roborock Robot Vacuum Deals Will Suck Up...,,4
23647,Save a ton of dough on Certified Refurbished A...,,4
23648,How can I contact Microsoft regarding my email?,[removed],4
23649,Save Some Coin on Your Smart Home with Google ...,,4


In [252]:
#tokenize data
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [253]:
df['title'] = df['title'].apply(lambda x: tokenizer.tokenize(x.lower()))
df['selftext'] = df['selftext'].apply(lambda x: tokenizer.tokenize(x.lower()))

In [254]:
#remove any text under 2 letters
def rm_len_2(text):
    rm_text = ' '.join([w for w in text.split() if len(w)>2])
    return rm_text

#remove numbers

def rm_num(text):
    rm_num = re.sub(r'\d+', '', text)
    return rm_num

In [255]:
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

lemmatizer = WordNetLemmatizer()

def word_lemmatizer(text):
    lem_text = " ".join([lemmatizer.lemmatize(i) for i in text])
    return lem_text

In [256]:
df['title'] = df['title'].apply(lambda x : word_lemmatizer(x))
df['selftext'] = df['selftext'].apply(lambda x : word_lemmatizer(x))

In [257]:
df['title'] = df['title'].apply(lambda x : rm_len_2(x))
df['selftext'] = df['selftext'].apply(lambda x : rm_len_2(x))

In [258]:
df['title'] = df['title'].apply(lambda x : rm_num(x))
df['selftext'] = df['selftext'].apply(lambda x : rm_num(x))

In [259]:
df

Unnamed: 0,title,selftext,target
0,account disabled facebook what might the reaso...,hello everyone around facing this problem and ...,0
1,facebook apologises for plymouth hoe error,,0
2,can log access code generator,account got hacked day ago and after finding o...,0
3,facebook deleted old account created new now c...,yes had some account suspension with old accou...,0
4,why all country should ban facebook and twitter,,0
...,...,...,...
23646,these roborock robot vacuum deal will suck dir...,,4
23647,save ton dough certified refurbished acer prod...,,4
23648,how can contact microsoft regarding email,removed,4
23649,save some coin your smart home with google nes...,,4


In [260]:
#note: add stopwords: {removed, deleted}

In [261]:
df['text'] = df['title'] + '' + df['selftext']
df.drop(['title', 'selftext'], axis=1, inplace=True)
df = df[['text', 'target']]

In [262]:
df.to_csv('data_clean.csv', index=False)

# Most frequent words

In [25]:
def most_freq_words(num):
    
    count_vect = CountVectorizer(analyzer = "word", 
                             tokenizer = None, 
                             preprocessor = None,
                             stop_words = "english", 
                             max_features = 35)
    
    # input for CountVectorizer is an array of strings
    vector_input = df[df['target'] == num]['text']

    # fit_transform the vectorizer
    words = count_vect.fit_transform(vector_input)

    # convert output to a Numpy array
    words = words.toarray()
    
    # get the words
    word_list = count_vect.get_feature_names()
    print(word_list)
    
    #save matrix
    matrix = pd.DataFrame(words, columns=count_vect.get_feature_names())
    print('\n')
    print('Top 15 Most Frequent Words')
    print(matrix.sum().sort_values(ascending=False).head(15))

In [26]:
#facebook
most_freq_words(0)

['account', 'app', 'business', 'day', 'disabled', 'doe', 'don', 'email', 'facebook', 'friend', 'got', 'group', 'help', 'just', 'know', 'like', 'message', 'messenger', 'new', 'notification', 'old', 'page', 'password', 'people', 'phone', 'photo', 'post', 'profile', 'say', 'time', 'tried', 'use', 'video', 'want', 'way']


Top 15 Most Frequent Words
facebook     3276
account      2122
page         1042
post          933
friend        686
messenger     610
just          607
message       573
help          559
don           535
email         535
know          522
group         514
like          514
new           508
dtype: int64


In [27]:
#apple
most_freq_words(1)

['air', 'airpods', 'amp', 'app', 'apple', 'apps', 'best', 'doe', 'free', 'help', 'icloud', 'ipad', 'iphone', 'iphoneremoved', 'just', 'like', 'mac', 'macbook', 'max', 'mini', 'need', 'new', 'phone', 'photo', 'pro', 'question', 'screen', 'store', 'support', 'tech', 'thread', 'time', 'use', 'using', 'watch']


Top 15 Most Frequent Words
apple      1508
iphone      618
pro         438
app         436
macbook     398
new         333
airpods     312
ipad        227
mac         227
max         217
support     192
help        162
time        152
air         150
doe         132
dtype: int64


In [28]:
#amazon
most_freq_words(2)

['account', 'amazon', 'amazonremoved', 'best', 'buy', 'card', 'cardremoved', 'customer', 'day', 'deal', 'delivery', 'doe', 'free', 'gift', 'got', 'help', 'item', 'just', 'know', 'need', 'new', 'order', 'orderremoved', 'package', 'prime', 'product', 'question', 'refund', 'return', 'review', 'seller', 'shipping', 'time', 'want', 'way']


Top 15 Most Frequent Words
amazon           1894
gift              284
card              213
prime             197
order             196
amazonremoved     192
account           190
item              172
doe               153
free              150
delivery          150
help              145
question          136
return            135
package           129
dtype: int64


In [29]:
#google
most_freq_words(3)

['account', 'amp', 'android', 'app', 'change', 'chrome', 'day', 'doe', 'don', 'drive', 'email', 'gmail', 'google', 'got', 'help', 'just', 'know', 'like', 'make', 'need', 'new', 'phone', 'photo', 'play', 'really', 'search', 'time', 'use', 'using', 'video', 'want', 'way', 'work', 'year', 'youtube']


Top 15 Most Frequent Words
google     2890
search      361
account     351
just        302
new         302
like        249
help        238
phone       236
app         226
youtube     212
know        210
amp         188
time        181
doe         181
chrome      180
dtype: int64


In [30]:
#microsoft
most_freq_words(4)

['account', 'airpods', 'app', 'apple', 'best', 'bitcoin', 'business', 'buy', 'crypto', 'game', 'geek', 'google', 'help', 'high', 'hit', 'iphone', 'launch', 'mac', 'market', 'microsoft', 'new', 'news', 'price', 'pro', 'review', 'say', 'stock', 'store', 'support', 'time', 'use', 'watch', 'window', 'xbox', 'year']


Top 15 Most Frequent Words
apple        1068
bitcoin      1031
microsoft     556
new           338
iphone        223
price         192
review        186
best          172
window        156
pro           147
geek          132
time          128
watch         126
say           123
year          118
dtype: int64
