## Project 3: Web APIs & Classification

### Description

In week four we learned about a few different classifiers. In week five we'll learn about web scraping, APIs, and Natural Language Processing (NLP). Now we're going to put those skills to the test.

For project 3, your goal is two-fold:
1. Using Reddit's API, you'll collect posts from two subreddits of your choosing.
2. You'll then use NLP to train a classifier that predicts which subreddit a given post came from. This is a binary classification problem.

Reddit's API is fairly straightforward. For example, if I want the posts from [`/r/boardgames`](https://www.reddit.com/r/boardgames), all I have to do is add `.json` to the end of the url: https://www.reddit.com/r/boardgames.json

---

In [2]:
import pandas as pd
import numpy as np

from tqdm import tqdm 

import re
import requests

import time
from bs4 import BeautifulSoup

In [3]:
# JSON listing endpoints for the two subreddits: appending ".json" to a
# subreddit URL returns its listing as JSON instead of HTML.
url1, url2 = (
    'https://www.reddit.com/r/HIMYM.json',
    'https://www.reddit.com/r/bigbangtheory.json',
)

In [4]:
# Request headers carrying a custom User-agent string.
header = {
    'User-agent': 'Mild API Request',
}

In [5]:
# Request the r/HIMYM listing with the custom headers, then show the HTTP
# status code to confirm the request succeeded.
res1 = requests.get(url=url1, headers=header)
res1.status_code

200

In [6]:
# Parse the response body as JSON and display it to inspect its structure.
himym_json = res1.json()
himym_json

{'kind': 'Listing',
 'data': {'modhash': '',
  'dist': 25,
  'children': [{'kind': 't3',
    'data': {'approved_at_utc': None,
     'subreddit': 'HIMYM',
     'selftext': '',
     'author_fullname': 't2_8as7ubb5',
     'saved': False,
     'mod_reason_title': None,
     'gilded': 0,
     'clicked': False,
     'title': '"But I... look, I don\'t, I don\'t even know if I\'ll like bacon"',
     'link_flair_richtext': [],
     'subreddit_name_prefixed': 'r/HIMYM',
     'hidden': False,
     'pwls': 0,
     'link_flair_css_class': None,
     'downs': 0,
     'thumbnail_height': 140,
     'top_awarded_type': None,
     'hide_score': False,
     'name': 't3_m5dk47',
     'quarantine': False,
     'link_flair_text_color': 'dark',
     'upvote_ratio': 0.99,
     'author_flair_background_color': None,
     'subreddit_type': 'public',
     'ups': 242,
     'total_awards_received': 0,
     'media_embed': {},
     'thumbnail_width': 140,
     'author_flair_template_id': None,
     'is_original_cont

In [7]:
# Top-level keys of the parsed response.
himym_json.keys()

dict_keys(['kind', 'data'])

In [8]:
# Keys nested under 'data': the posts ('children') plus listing metadata.
himym_json['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [9]:
len(himym_json['data']['children']) # number of posts returned by one pull

25

In [10]:
# Full field dictionary of the first post in the listing.
himym_json['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'HIMYM',
 'selftext': '',
 'author_fullname': 't2_8as7ubb5',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': '"But I... look, I don\'t, I don\'t even know if I\'ll like bacon"',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/HIMYM',
 'hidden': False,
 'pwls': 0,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': 140,
 'top_awarded_type': None,
 'hide_score': False,
 'name': 't3_m5dk47',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'upvote_ratio': 0.99,
 'author_flair_background_color': None,
 'subreddit_type': 'public',
 'ups': 242,
 'total_awards_received': 0,
 'media_embed': {},
 'thumbnail_width': 140,
 'author_flair_template_id': None,
 'is_original_content': False,
 'user_reports': [],
 'secure_media': None,
 'is_reddit_media_domain': True,
 'is_meta': False,
 'category': None,
 'secure_media_embed': {},
 'link_flair_text': None,
 'can_mod_post': False,
 'score': 24

In [11]:
# 'modhash' comes back as an empty string for this request.
himym_json['data']['modhash']

''

In [12]:
# 'dist' equals the number of children returned on this page (25).
himym_json['data']['dist']

25

In [13]:
# 'before' produces no cell output here, i.e. it is None on this first page.
himym_json['data']['before']

In [14]:
himym_json['data']['after'] # fullname of the last post on this page (pagination cursor)

't3_m504zl'

In [15]:
# Fullname ('name' field) of every post on the page, in listing order.
[
    child['data']['name']
    for child in himym_json['data']['children']
]

['t3_m5dk47',
 't3_m4xbqg',
 't3_m50wtk',
 't3_m5ggeo',
 't3_m4nauj',
 't3_m5gzf8',
 't3_m529a6',
 't3_m5cxat',
 't3_m4gi9v',
 't3_m50inv',
 't3_m516gi',
 't3_m5cly3',
 't3_m54x6u',
 't3_m55qw4',
 't3_m5dvdx',
 't3_m46ils',
 't3_m58q07',
 't3_m57gzw',
 't3_m52k0f',
 't3_m576sc',
 't3_m59snu',
 't3_m4od13',
 't3_m4szuc',
 't3_m56119',
 't3_m504zl']

In [None]:
# Getting ~1000 posts from r/HIMYM: up to 40 pages of 25 posts each.

url = 'https://www.reddit.com/r/HIMYM.json'

posts = []
after = None

for a in range(40):
    print(a)
    # The first request has no cursor; every later request passes the
    # previous page's `after` fullname so the API returns the next page.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'Mild'})

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # A null `after` means there are no further pages. Stop here: looping
    # again with `after` unset would re-request the first page and append
    # duplicate posts.
    if after is None:
        break

    # Fixed one-second pause between requests to avoid hammering the API.
    sleep_duration = 1
    time.sleep(sleep_duration)

In [17]:
# One row per collected post, one column per field found in the post dicts.
himym = pd.DataFrame(data=posts)
himym

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,num_crossposts,media,is_video,crosspost_parent_list,crosspost_parent,media_metadata,poll_data,is_gallery,gallery_data,author_cakeday
0,,HIMYM,,t2_8as7ubb5,False,,0,False,"""But I... look, I don't, I don't even know if ...",[],...,0,,False,,,,,,,
1,,HIMYM,,t2_7uxl8ll6,False,,0,False,Farhampton,[],...,0,,False,,,,,,,
2,,HIMYM,,t2_afp3wlbm,False,,0,False,"barney, what do you do for living? tell us 😡",[],...,0,,False,,,,,,,
3,,HIMYM,,t2_7uxl8ll6,False,,0,False,Marshall is so lame sometimes and yet so funny 🙈🙈,[],...,0,,False,,,,,,,
4,,HIMYM,,t2_f6huuo,False,,0,False,This submission on a BuzzFeed article called “...,[],...,0,,False,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
974,,HIMYM,,t2_4cswx,False,,0,False,Looks like Becky is making a killing selling B...,[],...,0,,False,,,,,,,
975,,HIMYM,,t2_14bt1d,False,,0,False,Can we just have a round of applause for Stell...,[],...,0,,False,,,,,,,
976,,HIMYM,,t2_7gsksxvt,False,,0,False,Made me think of Barney,[],...,0,,False,,,,,,,
977,,HIMYM,,t2_2romg182,False,,0,False,Found this little detail while rewatching the ...,[],...,0,,False,,,,,,,


In [18]:
# Persist the scraped r/HIMYM posts so the scrape does not need to be repeated.
himym.to_csv(path_or_buf='himym_test.csv')

In [None]:
# Collect up to 40 pages of posts from r/bigbangtheory.

url = 'https://www.reddit.com/r/bigbangtheory.json'

posts = []
after = None

for a in range(40):
    print(a)
    # The first request has no cursor; every later request passes the
    # previous page's `after` fullname so the API returns the next page.
    if after is None:
        current_url = url
    else:
        current_url = url + '?after=' + after
    print(current_url)
    res = requests.get(current_url, headers={'User-agent': 'Mild'})

    if res.status_code != 200:
        print('Status error', res.status_code)
        break

    current_dict = res.json()
    current_posts = [p['data'] for p in current_dict['data']['children']]
    posts.extend(current_posts)
    after = current_dict['data']['after']

    # A null `after` means there are no further pages. Stop here: looping
    # again with `after` unset would re-request the first page and append
    # duplicate posts.
    if after is None:
        break

    # Fixed one-second pause between requests to avoid hammering the API.
    sleep_duration = 1
    time.sleep(sleep_duration)

In [22]:
# One row per collected post, one column per field found in the post dicts.
bigbang = pd.DataFrame(data=posts)
bigbang

Unnamed: 0,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,link_flair_richtext,...,is_video,url_overridden_by_dest,link_flair_template_id,media_metadata,crosspost_parent_list,crosspost_parent,poll_data,is_gallery,gallery_data,author_cakeday
0,,bigbangtheory,Hi all! \nI made a The Big Bang Theory Discor...,t2_5fhfdvh8,False,,0,False,Official Discord Server for r/bigbangtheory!,[],...,False,,,,,,,,,
1,,bigbangtheory,,t2_6ab53k40,False,,0,False,Just Leonard’s and Amy’s expressions after Pen...,"[{'e': 'text', 't': 'Spoiler'}]",...,False,https://i.imgur.com/exzGe1Q.jpg,dab8aa62-627a-11e5-a14d-12162dee14ed,,,,,,,
2,,bigbangtheory,,t2_al54tgi5,False,,0,False,Awww Sheldon,[],...,False,https://i.redd.it/fgw4r72tp4n61.png,,,,,,,,
3,,bigbangtheory,,t2_av2volx4,False,,0,False,Expected 😂,"[{'e': 'text', 't': 'meme'}]",...,False,https://i.redd.it/8ktfi28gx4n61.jpg,5d4e9c3c-4a35-11eb-9184-0e809bd389b1,,,,,,,
4,,bigbangtheory,,t2_93i3syaj,False,,0,False,bbt memes :),"[{'e': 'text', 't': 'meme'}]",...,False,https://i.redd.it/5i0oyo9wt2n61.jpg,5d4e9c3c-4a35-11eb-9184-0e809bd389b1,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
992,,bigbangtheory,It’s fucking terrible just the normal Big Bang...,t2_5tkm9898,False,,0,False,Young Sheldon Easter egg I noticed,[],...,False,,,,,,,,,
993,,bigbangtheory,,t2_7ejm1wjd,False,,0,False,Always wondered what this was...,[],...,False,https://i.redd.it/qowlfue39k161.jpg,,,,,,,,
994,,bigbangtheory,,t2_6m3x1zgc,False,,0,False,Can anyone help me with this? I dont understan...,[],...,False,,,,,,,,,
995,,bigbangtheory,,t2_8x49icys,False,,0,False,"""Revenge is a dish best served cold"" | In KLINGON",[],...,False,https://www.youtube.com/watch?v=oKTatwGNYoE,,,,,,,,


In [23]:
# Persist the scraped r/bigbangtheory posts so the scrape does not need to be repeated.
bigbang.to_csv(path_or_buf='bigbang_test.csv')