# Data Import

In [164]:
import pandas as pd
import numpy as np

# Load the raw posts table from the local data/ directory.
data = pd.read_csv('data/cs_subs.csv')  # NOTE: the file must be unzipped before this cell can read it

In [166]:
# Number of distinct subreddit labels; dropna=False keeps NaN as a label,
# exactly as len(unique()) would.
print('Number of unique subreddits:', data['subreddit'].nunique(dropna=False))

Number of unique subreddits: 136


In [167]:
# data.shape is a (rows, columns) tuple, so the post count is its first entry.
print('Total # of posts:', data.shape)

Total # of posts: (624289, 3)


In [169]:
# Preview only: dropna() returns a new frame here, `data` itself is unchanged.
print('Number of posts after dropping null values:', data.dropna().shape)

Number of posts after dropping null values: (624281, 3)


In [170]:
# Drop every row that has a missing value in any column and rebind `data`.
data = data.dropna()

###### Looking at the number of posts per subreddit

In [34]:
# Posts per subreddit; value_counts() lists the most frequent label first.
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

##### Sampling data

In [35]:
# Peek at 10 randomly chosen rows (no random_state, so the rows differ between runs).
data.sample(10)

Unnamed: 0,title,score,subreddit
277178,Copy selected text to clipboard and paste it i...,2,learnpython
358274,My Best JavaScript Learning Resources,1,javascript
246508,Why am I printing an extra newline?,2,learnpython
84413,IT Managed Services Georgia,1,softwaredevelopment
472977,Google Maps High Memory Usage,1,Android
325749,Is there a Chrome extension that can clean up ...,2,chrome
108572,"I know the basics of Javascript, HTML, CSS. I'...",2,learnprogramming
48704,E-commerce website development,1,webdev
585244,"Samsung 8, LG 6 or Pixel?",1,Android
117937,Hypermac concert cable to USB c,1,mac


##### We are filtering out subreddits that have 150 or fewer posts.

In [36]:
# Posts per subreddit, keeping only subreddits with strictly more than 150 posts.
counts = data['subreddit'].value_counts().loc[lambda sizes: sizes > 150]

In [37]:
# Names of the subreddits that passed the size filter.
top_values = counts.index.tolist()
# Keep only the posts that belong to one of those subreddits.
data = data.loc[data['subreddit'].isin(top_values)]

In [38]:
# Re-check the per-subreddit post counts after the size filter.
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
windowsinsiders          683
jquery                   667
operabrowser             647
browsers      

In [39]:
# (rows, columns) of `data` at this point in the notebook.
data.shape

(622909, 3)

In [40]:
# unique() returns an array, so .shape is a 1-tuple holding the number of distinct subreddits.
data['subreddit'].unique().shape

(117,)

We have a lot of data, especially for my MacBook. Let's compute the average Reddit score (upvotes minus downvotes) for each subreddit and use it as a filtering threshold. I want to use the mean rather than the median, since filtering at the median would just arbitrarily cut each subreddit's data in half. Hopefully filtering by the mean will take relatively larger chunks out of the more popular subreddits than out of the less popular ones.

In [41]:
# Mean post score per subreddit, as a dict keyed by subreddit name.
# A single groupby pass replaces one full-frame boolean scan per subreddit.
means = data.groupby('subreddit')['score'].mean().to_dict()

In [46]:
import numpy as np


# For every subreddit keep the posts scoring at or above that subreddit's mean.
# groupby(sort=False) yields the subreddits in order of first appearance (the
# same order unique() gives) and splits the frame once, instead of building a
# full-length boolean mask per subreddit.
filtered = [
    posts[posts['score'] >= means[subreddit]]
    for subreddit, posts in data.groupby('subreddit', sort=False)
]

In [47]:
# Stack the per-subreddit frames collected in `filtered` into one DataFrame.
filtered_data = pd.concat(filtered)

In [48]:
# Posts kept per subreddit after the mean-score filter.
filtered_data['subreddit'].value_counts()

Android                6807
linuxquestions         3893
cscareerquestions      3772
learnpython            3081
webdev                 2565
hackernews             2563
iOSBeta                2424
Windows10              2338
linux4noobs            2275
ProgrammerHumor        2194
networking             2174
androiddev             2026
linux                  2013
windows                2005
javascript             1880
learnprogramming       1813
softwaregore           1746
ios                    1737
java                   1672
androidthemes          1664
chrome                 1548
Python                 1474
aws                    1471
rust                   1466
web_design             1361
javahelp               1326
arduino                1205
iOSProgramming         1190
mac                    1150
csshelp                1101
                       ... 
operabrowser            221
mongodb                 197
macapps                 189
windowsinsiders         189
LanguageTechnology  

In [49]:
# (rows, columns) of the filtered data.
filtered_data.shape

(99057, 3)

In [50]:
# Preview only: shape after removing rows that are duplicated across all columns;
# filtered_data itself is not modified by this cell.
filtered_data.drop_duplicates().shape

(98941, 3)

In [51]:
# Remove rows that are exact duplicates across all columns and rebind filtered_data.
filtered_data = filtered_data.drop_duplicates()

In [52]:
# Peek at 20 randomly chosen rows (no random_state, so the rows differ between runs).
filtered_data.sample(20)

Unnamed: 0,title,score,subreddit
40902,App Transport Security app approval exemption,11,iOSProgramming
590511,Bad Parameter Sniffing Decision Flow Chart,6,SQLServer
95569,Tesla Passes Ford by Market Value,7,hackernews
443359,GNU Personal Expense Manager,13,linux4noobs
122426,PHP Traits…a wonderful home for shared Laravel...,22,laravel
461628,Node v8.5.0 (Current),42,node
613160,Collectively – an open platform to enhance com...,30,opensource
523967,Gigabit Router/Firewall for Small Business,10,networking
523467,HTTPS: Private Keys on Web Servers,18,crypto
149315,Painting my room with WebVR,49,webdev


In [54]:
# Model inputs are the post titles; targets are the subreddit names.
X, y = filtered_data['title'], filtered_data['subreddit']

Adapted from https://github.com/keras-team/keras/blob/master/examples/imdb_fasttext.py

In [159]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence


def create_ngram_set(input_list, ngram_value=2):
    """
    Return the set of distinct n-grams (as tuples) contained in a sequence.

    Only contiguous windows of exactly `ngram_value` items are produced, so a
    sequence shorter than `ngram_value` yields an empty set.

    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2) == {(4, 9), (4, 1), (1, 4), (9, 4)}
    True
    >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3) == {(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)}
    True
    """
    # Progressively shifted views of the sequence: zipping them walks a sliding
    # window and stops as soon as the most-shifted view is exhausted.
    shifted_views = (input_list[offset:] for offset in range(ngram_value))
    return {window for window in zip(*shifted_views)}

def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment each sequence by appending the ids of the n-grams it contains.

    For every start position, each n-gram of length 2..ngram_range that fits
    inside the sequence is looked up in `token_indice`; the ids of the n-grams
    that are found are appended after the original tokens, ordered by start
    position and then by n-gram length. The input sequences are not modified.

    Args:
        sequences: iterable of token-id sequences.
        token_indice: dict mapping n-gram tuples to integer ids.
        ngram_range: largest n-gram length to look up; values below 2 add nothing.

    Returns:
        A new list with one list per input sequence: the original tokens
        followed by the ids of the matching n-grams.

    Example: adding bi-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 2018, 42]]
    """
    new_sequences = []
    for input_list in sequences:
        tokens = list(input_list)
        new_list = tokens[:]
        # Visit every position that can start at least a bi-gram. Bounding the
        # loop by len - ngram_range + 1 instead would skip the shorter n-grams
        # near the end of the sequence whenever ngram_range > 2.
        for start in range(len(tokens) - 1):
            # Longest n-gram that still fits between `start` and the end.
            longest = min(ngram_range, len(tokens) - start)
            for ngram_value in range(2, longest + 1):
                ngram = tuple(tokens[start:start + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

def create_indices(X, ngram_range, max_features):
    """
    Assign an integer id to every distinct n-gram (n = 2..ngram_range) in X.

    Ids are contiguous and start at max_features + 1, so they never collide
    with ids in the range 0..max_features.

    Args:
        X: iterable of token-id sequences.
        ngram_range: largest n-gram length to index.
        max_features: size of the existing (single-token) id space.

    Returns:
        (token_indice, max_features): the n-gram -> id dict and the enlarged
        vocabulary size, i.e. the highest assigned id plus one.
    """
    ngram_set = set()
    for input_list in X:
        for ngram_value in range(2, ngram_range + 1):
            ngram_set.update(create_ngram_set(input_list, ngram_value=ngram_value))

    start_index = max_features + 1
    token_indice = {
        ngram: start_index + offset for offset, ngram in enumerate(ngram_set)
    }

    # Ids are contiguous, so "highest id + 1" equals start_index + count.
    # Computing it arithmetically also covers the case where X contains no
    # n-gram at all, for which a max() over the (empty) ids would raise.
    max_features = start_index + len(token_indice)

    return token_indice, max_features

def preprocess(X, y, train=False, ngram_range=2, max_len=30, max_features=1000, **kwargs):
    """
    Turn raw texts and labels into padded id sequences and one-hot targets.

    With train=True the tokenizer, label encoder, label binarizer and (when
    ngram_range > 1) the n-gram index are fitted on X / y and returned together
    with the transformed data. With train=False the already fitted objects must
    be supplied as keyword arguments and are only applied.

    Args:
        X: iterable of texts.
        y: iterable of class labels.
        train: fit the processors (True) or reuse the ones in kwargs (False).
        ngram_range: largest n-gram length appended to each sequence; values
            of 1 or less disable the n-gram features.
        max_len: passed to pad_sequences as maxlen.
        max_features: num_words for the Tokenizer (only used when train=True).
        **kwargs: when train=False: 'tokenizer', 'label_encoder',
            'label_binarizer' and, if ngram_range > 1, 'token_indice'.

    Returns:
        train=True: (X, y, tokenizer, label_encoder, label_binarizer,
        token_indice, max_features); token_indice is None when
        ngram_range <= 1, and max_features is enlarged by the n-gram ids
        otherwise.
        train=False: (X, y).

    Raises:
        KeyError: if train=False and a required processor is missing.
    """
    if train:
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(X)

        label_encoder = LabelEncoder()
        label_encoder.fit(y)

        # The binarizer is applied to the *encoded* integer labels below, so it
        # has to be fitted on them too. Fitted on the raw labels, its classes
        # would never match the integers it is asked to transform.
        label_binarizer = LabelBinarizer()
        label_binarizer.fit(label_encoder.transform(y))

        token_indice = None
    else:
        required = ['tokenizer', 'label_encoder', 'label_binarizer']
        if ngram_range > 1:
            required.append('token_indice')
        missing = [name for name in required if name not in kwargs]
        if missing:
            raise KeyError(
                'preprocess(train=False) is missing fitted processors: '
                + ', '.join(missing))

        tokenizer = kwargs['tokenizer']
        label_encoder = kwargs['label_encoder']
        label_binarizer = kwargs['label_binarizer']

    X = tokenizer.texts_to_sequences(X)
    # Labels -> integer codes -> one-hot rows.
    y = label_encoder.transform(y)
    y = label_binarizer.transform(y)

    if ngram_range > 1:
        if train:
            # The n-gram index is built from the training sequences only.
            token_indice, max_features = create_indices(X, ngram_range, max_features)
        else:
            token_indice = kwargs['token_indice']

        X = add_ngram(X, token_indice, ngram_range)

    X = sequence.pad_sequences(X, maxlen=max_len)

    if train:
        return X, y, tokenizer, label_encoder, label_binarizer, token_indice, max_features
    return X, y

##### Splitting data into train (60%), val (20%), and test (20%) and preprocessing

In [163]:
from sklearn.model_selection import train_test_split
import pickle

# Hold out 40% of the data, then halve the hold-out into test and validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=31)

# Report the size of every split, one shape per line.
print(*(part.shape for part in (X_train, X_test, X_val, y_train, y_test, y_val)),
      sep='\n')

ngram_range = 2
max_features = 1000
max_len = 20

# Fit all processors on the training split; max_features comes back enlarged
# by the n-gram ids.
(X_train, y_train,
 tokenizer, label_encoder, label_binarizer,
 token_indice, max_features) = preprocess(
    X_train, y_train,
    train=True,
    ngram_range=ngram_range,
    max_features=max_features,
    max_len=max_len)

# Fitted objects reused unchanged for the validation and test splits.
processors = dict(
    tokenizer=tokenizer,
    label_binarizer=label_binarizer,
    label_encoder=label_encoder,
    token_indice=token_indice,
)

X_val, y_val = preprocess(
    X_val, y_val, ngram_range=ngram_range, max_len=max_len, **processors)
X_test, y_test = preprocess(
    X_test, y_test, ngram_range=ngram_range, max_len=max_len, **processors)

print('max_features:', max_features)

(59364,)
(19788,)
(19789,)
(59364,)
(19788,)
(19789,)
max_features: 93679


In [156]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import GlobalAveragePooling1D
from keras import callbacks
import keras.backend as K

batch_size = 64
embedding_dims = 30
epochs = 1000  # upper bound only; early stopping decides when training ends

# fastText-style classifier: embed every token / n-gram id, average the
# embeddings over the sequence, then classify with a single softmax layer.
model = Sequential()

model.add(Embedding(max_features, embedding_dims, input_length=X_train.shape[1]))
model.add(GlobalAveragePooling1D())
model.add(Dense(y_train.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Callbacks
# Stop once val_loss has not improved by at least 0.001 for 2 epochs in a row.
early_stopping = callbacks.EarlyStopping(monitor='val_loss', 
                                         min_delta=0.001,
                                         patience=2,
                                         mode='min')

# Keep only the weights of the best epoch on disk.
get_best = callbacks.ModelCheckpoint(monitor='val_loss',
                                     filepath='models/keras_fasttext.hdf5',
                                     save_best_only=True)

# Halve the learning rate after one epoch without improvement. patience must be
# smaller than the early-stopping patience, and min_lr must be below Adam's
# default rate of 0.001; otherwise this callback can never change anything.
reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss',
                                        factor=0.5,
                                        patience=1,
                                        min_lr=1e-5)

# validation_data is passed as the documented (inputs, targets) tuple.
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping, get_best, reduce_lr],
          validation_data=(X_val, y_val))

Train on 59364 samples, validate on 19789 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000


<keras.callbacks.History at 0x1a22a86c88>

In [137]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def top_n_accuracy(y_true, probs, n=5):
    """
    Fraction of samples whose true label is among the n best-scored classes.

    Args:
        y_true: array of true labels, one per sample; it must expose `.shape`.
            Each label is compared against column positions of `probs`, so
            presumably these are integer class indices (confirm with caller).
        probs: per-sample score rows (e.g. a 2-D array); a higher score means
            a more likely class.
        n: number of top-scored classes that count as a hit.

    Returns:
        Number of hits divided by y_true.shape[0].
    """
    hits = [
        # argsort of the negated scores lists class positions best-first.
        int(label in np.argsort(-scores)[:n])
        for label, scores in zip(y_true, probs)
    ]
    return np.sum(hits) / y_true.shape[0]