In [6]:
import pandas as pd
import numpy as np
import requests
import json
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score

# // Getting the data

In [7]:
def get_posts( sub = 'all', num_pages = 4, avoid_distinguished = True, attached = None):
    """
    Returns a list of pages from a subreddit. 
    
    ===========================
    ======= Parameters ========
    ===========================

    sub = 'all' (default): type = string
        The subreddit you want to querry. 
        https://reddit.com/r/{sub}/ 
    -------------------------------------------------------------
    num_pages = 4 (default): type = int
        Number of pages to read from.  
        This also is the number of seconds
        this function takes to run
    -------------------------------------------------------------
    avoid_distinguished = True (default): type = bool
        Whether or not to avoid stickied, archived,
        and admin posts
    -------------------------------------------------------------
    attached = None (default): type = List
        The list that you are appending new data onto.
        Default to make a new list.  
        
    ===========================
    ========  Example =========
    ===========================    
    
    the_posts= get_posts(sub = 'jokes',
                            num_pages=1, 
                            avoid_distinguished=True)
                            
    the_posts= get_posts(sub = 'nosleep',
                            num_pages=1, 
                            avoid_distinguished=True, 
                            attached=the_posts )
    
    >>> Returns a list of ~25 posts from reddit.com/r/jokes and
                    ~25 posts from reddit.com/r/nosleep
    
    
    """
    if attached:
        posts = attached
    else:
        posts = []
    counter = 0
    after = None
    while counter < num_pages:
        if after == None:
            params = {}
        else:
            params = {'after': after}
        res = requests.get(f'https://reddit.com/r/{sub}/.json', params ,headers={'User-agent': 'Dodge Bot 0.1'})
        if(res.status_code!=200):
            print('invalid sub')
            return None
        the_json = res.json()
        if avoid_distinguished:
            page = [child for child in the_json['data'].get('children') if not child['data']['stickied'] and not child['data']['archived'] and not child['data']['distinguished']]
        else:
            page = the_json['data'].get('children')
        posts.extend(page)
        after = the_json['data']['after']
        counter += 1
        time.sleep(1)
    return posts

In [11]:
posts = get_posts(sub='dadjokes', num_pages=100)

In [14]:
def posts_as_DataFrame(posts, features = ['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments']):
    feat_dict = [{feat : post['data'][feat] for feat in features}  for post in posts]
    return pd.DataFrame(feat_dict)

In [16]:
df = posts_as_DataFrame(posts=posts, features=['subreddit', 'author', 'title', 'selftext', 'created_utc', 'num_comments'])
df.head()

Unnamed: 0,author,created_utc,num_comments,selftext,subreddit,title
0,DaShMa_,1527558000.0,49,I take something for it.,dadjokes,"I have kleptomania, but when when it gets bad..."
1,zSilverFox,1527618000.0,5,"He threw sodium chloride at his wife, that's a...",dadjokes,Did you hear about the chemist who was arrested?
2,porichoygupto,1527610000.0,4,She thinks it is grounds for divorce.,dadjokes,My wife is getting sick of me not cleaning the...
3,Bignate1213,1527565000.0,10,It makes my day.,dadjokes,I like the way the Earth rotates.
4,Bignate1213,1527608000.0,10,They haven't had a gig yet.,dadjokes,Have you heard of the band 1023MB?


# // Data Processing and NLP

In [20]:
df['num_comments'].mean()

4.590872698158527

In [31]:
df['high_comments'] = df['num_comments'].apply(lambda x: 1 if x >= 4 else 0)

In [39]:
1 - df.high_comments.value_counts()[1]/len(df.high_comments)

0.7634107285828663

In [33]:
df.head(3)

Unnamed: 0,author,created_utc,num_comments,selftext,subreddit,title,high_comments
0,DaShMa_,1527558000.0,49,I take something for it.,dadjokes,"I have kleptomania, but when when it gets bad...",1
1,zSilverFox,1527618000.0,5,"He threw sodium chloride at his wife, that's a...",dadjokes,Did you hear about the chemist who was arrested?,1
2,porichoygupto,1527610000.0,4,She thinks it is grounds for divorce.,dadjokes,My wife is getting sick of me not cleaning the...,1


# // Classification modeling

In [34]:
# Trying just based on the time / date created:

features = ['created_utc']
X = df[features]
y = df['high_comments']

In [36]:
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state = 44)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
logreg.coef_

array([[-7.79079081e-10]])

In [38]:
logreg.score(X_val, y_val)

0.7536

In [40]:
# So... pretty bad using ONLY the time created as a feature, let's go ahead and try some
# basic NLP modeling with CountVectorizer etc.

In [42]:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [59]:
df.head(3)

Unnamed: 0,author,created_utc,num_comments,selftext,subreddit,title,high_comments
0,DaShMa_,1527558000.0,49,I take something for it.,dadjokes,"I have kleptomania, but when when it gets bad...",1
1,zSilverFox,1527618000.0,5,"He threw sodium chloride at his wife, that's a...",dadjokes,Did you hear about the chemist who was arrested?,1
2,porichoygupto,1527610000.0,4,She thinks it is grounds for divorce.,dadjokes,My wife is getting sick of me not cleaning the...,1


In [77]:
# CountVectorizer first
lang_features = ['title']
X = df[lang_features]
y = df['high_comments']

In [78]:
X.shape

(2498, 1)

In [79]:
y.shape

(2498,)

In [80]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [81]:
# Vectorize
cvec = CountVectorizer(stop_words='english')
X_train_counts = cvec.fit_transform(X_train)
X_test_counts = cvec.transform(X_test)

In [82]:
X_train_counts

<1x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [83]:
X_test_counts

<1x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>