In [234]:
import pandas as pd
data = pd.read_csv('data/cs_subs.csv')  # unzip this file

In [235]:
len(data['subreddit'].unique())

136

In [236]:
data.shape

(624289, 3)

In [237]:
data.dropna().shape

(624281, 3)

In [238]:
data = data.dropna()

In [239]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [240]:
data.sample(10)

Unnamed: 0,title,score,subreddit
232338,Pretty clear who sells upvotes imo,1,dataisbeautiful
310286,Keep in touch with the best web development co...,1,Web_Development
27581,Need help with RTC &amp; SD Code,1,arduino
166547,Help creating a static method - basic java,9,javahelp
67181,NEW EARN BITCOIN SITE!!!! IF THEY PAY WE´LL BE...,1,crypto
481134,importing two components/classes with one impo...,6,angularjs
24426,Spring AMQP 1.7 picks up Spring Boot 1.5 compa...,14,java
514206,"Red says its $1,595 Hydrogen smartphone is 'wo...",0,Android
155341,Here's a collection of companies that don't do...,377,cscareerquestions
489733,Facebook Migrating A Database From InnoDB To M...,16,Database


#### We are filtering out subreddits that have 2,000 or fewer posts.

In [241]:
# Posts per subreddit, keeping only the subreddits with more than 2000 posts.
counts = data['subreddit'].value_counts().loc[lambda per_subreddit: per_subreddit > 2000]

In [242]:
# Names of the subreddits that passed the post-count filter.
top_values = counts.index.tolist()
# Keep only the posts that belong to one of those subreddits.
in_top_subreddit = data['subreddit'].isin(top_values)
data = data[in_top_subreddit]

In [243]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
aws                     5112
javahelp                4829
Web_Development         4732
reactjs                 4533
golang        

In [244]:
data.shape

(574249, 3)

In [245]:
data['subreddit'].unique().shape

(57,)

We have a lot of data, especially for my MacBook. Let's look at the average Reddit score (upvotes minus downvotes) for each subreddit and use it to filter posts. I want to use the mean and not the median, since the median would just arbitrarily cut the data in half. Hopefully filtering by the mean will take relatively larger chunks out of the more popular subreddits than the less popular ones.

In [246]:
# Mean score per subreddit, computed in a single grouped pass instead of
# re-filtering the whole frame once for every subreddit.
# `means` remains a plain dict keyed by subreddit name.
means = data.groupby('subreddit')['score'].mean().to_dict()

In [247]:
means

{'Android': 75.0088722590765,
 'Angular2': 5.473166368515206,
 'AskNetsec': 6.923273657289003,
 'C_Programming': 6.593274988484569,
 'Database': 2.3162813575996064,
 'ProgrammerHumor': 460.36457357075915,
 'Python': 19.62010972209731,
 'Web_Development': 1.165680473372781,
 'Windows10': 14.500180336146578,
 'androiddev': 11.89121421520237,
 'androidthemes': 30.770867637745408,
 'arduino': 12.679683432260752,
 'artificial': 7.961448293826518,
 'aws': 6.961463223787168,
 'chrome': 5.607130672125554,
 'coding': 13.21768140116764,
 'compsci': 21.0,
 'computerscience': 5.429937355753379,
 'cpp': 21.44868469803631,
 'cscareerquestions': 9.30821314710794,
 'csharp': 11.75581084691424,
 'css': 5.632745364088648,
 'csshelp': 1.5042659605766402,
 'dataisbeautiful': 316.7866983762506,
 'datascience': 8.189430591737997,
 'django': 5.676748582230624,
 'dotnet': 10.152072072072071,
 'golang': 13.324040219378428,
 'hackernews': 4.136428956349919,
 'haskell': 24.44258453297623,
 'iOSBeta': 11.14088778

In [248]:
import numpy as np


# For each subreddit, keep only the posts scoring at or above that subreddit's mean.
# groupby(sort=False) splits the frame once and yields the groups in order of
# first appearance (the same order as data['subreddit'].unique()), so the result
# matches the previous per-subreddit boolean-mask loop without rescanning every
# row for each subreddit.
filtered = [
    posts[posts['score'] >= means[subreddit]]
    for subreddit, posts in data.groupby('subreddit', sort=False)
]

In [249]:
filtered_data = pd.concat(filtered)

In [250]:
filtered_data['subreddit'].value_counts()

Android                6807
linuxquestions         3893
cscareerquestions      3772
learnpython            3081
webdev                 2565
hackernews             2563
iOSBeta                2424
Windows10              2338
linux4noobs            2275
ProgrammerHumor        2194
networking             2174
androiddev             2026
linux                  2013
windows                2005
javascript             1880
learnprogramming       1813
softwaregore           1746
ios                    1737
java                   1672
androidthemes          1664
chrome                 1548
Python                 1474
aws                    1471
rust                   1466
web_design             1361
javahelp               1326
arduino                1205
iOSProgramming         1190
mac                    1150
csshelp                1101
datascience            1081
AskNetsec              1079
golang                 1058
raspberry_pi           1037
reactjs                1030
haskell             

In [251]:
filtered_data.shape

(83852, 3)

In [252]:
filtered_data.drop_duplicates().shape

(83744, 3)

In [253]:
filtered_data = filtered_data.drop_duplicates()

In [254]:
filtered_data.sample(20)

Unnamed: 0,title,score,subreddit
164326,CSS FizzBuzz,15,css
468759,Few questions regarding WPF.,13,csharp
510410,d-carousel: A different way of doing carousels,25,webdev
312040,How to deploy your server-side Swift project t...,52,swift
588683,[Feature] New text for clock icon,70,iOSBeta
466587,std::visit is everything wrong with modern C++,185,cpp
303838,I'm 36 am I too young to learn and change career?,712,learnprogramming
620542,[Discussion] What icons are those?,125,androidthemes
223610,How to manage Apple Membership Programs &amp; ...,21,swift
488893,Find the programmer,9985,ProgrammerHumor


In [255]:
X = filtered_data['title']
y = filtered_data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [256]:
from sklearn.model_selection import train_test_split
# First carve off 40% as a hold-out, then halve that hold-out into test and
# validation, giving a 60/20/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=17
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=31
)
# Print the shape of every split (features first, then labels).
for split in (X_train, X_test, X_val, y_train, y_test, y_val):
    print(split.shape)

(50246,)
(16749,)
(16749,)
(50246,)
(16749,)
(16749,)


# Baseline
Simple baseline using tf-idf based approaches

In [257]:
from sklearn.preprocessing import LabelEncoder


# Fit the encoder on every subreddit name in `data`, then turn the string
# labels of each split into integer class ids.
label_encoder = LabelEncoder().fit(data['subreddit'])

y_train, y_val, y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

In [258]:
len(label_encoder.classes_)

57

In [259]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Unigram TF-IDF features with English stop words removed. The vocabulary is
# learned from the training titles only and then reused for the other splits.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors, X_test_vectors = (
    vectorizer.transform(titles) for titles in (X_val, X_test)
)

In [260]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def top_n_accuracy(y_true, probs, n=5, classes=None):
    """Return the fraction of samples whose true label is among the n top-scoring classes.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True label of each sample.
    probs : array-like of shape (n_samples, n_classes)
        Per-class scores for each sample; a higher score ranks a class earlier.
    n : int, default 5
        How many of the highest-scoring classes count as a hit.
    classes : array-like of shape (n_classes,), optional
        Label represented by each column of ``probs``. When omitted, column
        ``i`` is taken to represent label ``i``.

    Returns
    -------
    float
        Top-n accuracy in [0, 1]; ``nan`` when there are no samples.

    Raises
    ------
    ValueError
        If ``y_true`` and ``probs`` hold a different number of samples.
    """
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)
    if y_true.shape[0] != probs.shape[0]:
        raise ValueError(
            'y_true and probs must contain the same number of samples '
            '(got %d and %d)' % (y_true.shape[0], probs.shape[0])
        )
    if y_true.shape[0] == 0:
        # No samples: accuracy is undefined.
        return float('nan')

    # Column indices of the n largest scores in every row, best first.
    top_n = np.argsort(-probs, axis=1)[:, :n]
    if classes is not None:
        # Translate column positions into the labels they stand for.
        top_n = np.asarray(classes)[top_n]

    # A sample is a hit when its true label appears anywhere in its top-n row.
    hits = (top_n == y_true[:, np.newaxis]).any(axis=1)
    return hits.mean()

In [261]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [262]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [263]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))
print(classification_report(y_val, nb_predictions))

Top 1 accuracy:
 0.343901128426
Top 5 accuracy:
 0.687444026509
             precision    recall  f1-score   support

          0       0.20      0.99      0.34      1363
          1       0.94      0.32      0.48       180
          2       0.50      0.00      0.01       225
          3       0.00      0.00      0.00       106
          4       0.00      0.00      0.00        67
          5       0.60      0.09      0.16       449
          6       0.67      0.05      0.10       301
          7       0.00      0.00      0.00        53
          8       0.55      0.37      0.44       452
          9       0.74      0.28      0.40       397
         10       0.97      0.40      0.57       326
         11       0.95      0.35      0.52       218
         12       0.94      0.20      0.33       171
         13       0.86      0.56      0.67       281
         14       0.84      0.26      0.40       330
         15       0.00      0.00      0.00        88
         16       0.00      0.00  

  'precision', 'predicted', average, warn_for)


In [174]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [175]:
svm_predictions = svm.predict(X_val_vectors)

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.501288594674
             precision    recall  f1-score   support

          0       0.74      0.87      0.80      1362
          1       0.64      0.69      0.67       180
          2       0.18      0.10      0.13       105
          3       0.36      0.31      0.33       216
          4       0.47      0.50      0.48       125
          5       0.19      0.15      0.17        91
          6       0.81      0.70      0.75       101
          7       0.25      0.20      0.22        69
          8       0.50      0.06      0.10        18
          9       0.65      0.62      0.64        89
         10       0.88      0.78      0.82        18
         11       0.45      0.37      0.41        38
         12       0.61      0.46      0.52       115
         13       0.60      0.23      0.33        13
         14       0.88      0.74      0.80        19
         15       0.33      0.05      0.09        20
         16       0.65      0.60      0.62        52
         17       0.23      0.

In [38]:
from gensim.models import Doc2Vec
from gensim import utils
from gensim.models.doc2vec import TaggedDocument


# Build one TaggedDocument per training title, tagged with its class label.
# utils.simple_preprocess lower-cases the text while tokenising, so the
# separate .lower() call and the unused `tokens = None` placeholder are gone.
tagged_documents = [
    TaggedDocument(utils.simple_preprocess(text), [label])
    for text, label in zip(X_train, y_train)
]

In [39]:
tagged_documents[:5]

[TaggedDocument(words=['lessons', 'learned', 'from', 'my', 'latest', 'game', 'iteration', 'from', 'stars', 'to', 'over'], tags=[71]),
 TaggedDocument(words=['little', 'asp', 'net', 'core', 'book'], tags=[112]),
 TaggedDocument(words=['responsive', 'background', 'images', 'with', 'javascript'], tags=[111]),
 TaggedDocument(words=['can', 'ssh', 'to', 'virtual', 'machine', 'when', 'ipv', 'is', 'set', 'to', 'automatic', 'dhcp', 'but', 'can', 'ssh', 'when', 'use', 'manual', 'and', 'input', 'my', 'own', 'addresses'], tags=[82]),
 TaggedDocument(words=['conversational', 'dataset'], tags=[51])]

In [40]:
# Untrained Doc2Vec model; vocabulary building and training happen in the next cell.
# NOTE(review): `size` and `iter` look like the older gensim keyword names
# (later releases use `vector_size` and `epochs`) — confirm against the
# installed gensim version before re-running.
model = Doc2Vec(size=64, window=7, min_count=1, workers=4, iter=20)

In [41]:
# Build the vocabulary from the tagged training titles, then train on the same
# documents for the number of passes stored on the model (`model.iter`).
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.iter)

10405811

In [42]:
X_train_vectors = X_train.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values

In [43]:
# Stack the per-title vectors into a single float64 array. The builtin `float`
# is used because the `np.float` alias (which was simply `float`) was
# deprecated in NumPy 1.20 and removed in 1.24.
X_train_vectors = np.array(list(X_train_vectors), dtype=float)

In [44]:
X_train_vectors.shape

(59364, 64)

In [45]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [46]:
# Infer a Doc2Vec vector for every validation title, then stack the vectors
# into a single float64 array. `dtype=float` replaces the `np.float` alias,
# which was removed in NumPy 1.24.
X_val_vectors = X_val.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values
X_val_vectors = np.array(list(X_val_vectors), dtype=float)

In [47]:
svm_predictions = svm.predict(X_val_vectors)

In [48]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.29789276871
             precision    recall  f1-score   support

          0       0.32      0.69      0.44      1362
          1       0.53      0.73      0.61       180
          2       0.00      0.00      0.00       105
          3       0.18      0.07      0.11       216
          4       0.35      0.06      0.10       125
          5       0.00      0.00      0.00        91
          6       0.61      0.50      0.55       101
          7       0.00      0.00      0.00        69
          8       0.00      0.00      0.00        18
          9       0.39      0.27      0.32        89
         10       0.50      0.06      0.10        18
         11       0.00      0.00      0.00        38
         12       0.30      0.10      0.14       115
         13       0.00      0.00      0.00        13
         14       1.00      0.05      0.10        19
         15       0.00      0.00      0.00        20
         16       0.00      0.00      0.00        52
         17       0.08      0.0

  'precision', 'predicted', average, warn_for)


In [49]:
# Immutable set of lower-case English stop words; format_for_fastext drops any
# token found in this set before writing a title to disk.
# NOTE(review): this looks like a copy of scikit-learn's built-in English
# stop-word list — confirm before replacing it with an import.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])


def format_for_fastext(X, y, filename):
    """Write titles and labels to ``data/<filename>``, one ``__label__``-prefixed line each.

    Every output line is ``__label__<label>``, a space, and then the title's
    tokens (tokenised with ``utils.simple_preprocess``, with tokens found in
    ``ENGLISH_STOP_WORDS`` removed) joined by single spaces.

    Parameters
    ----------
    X : iterable of str
        Post titles.
    y : iterable
        Label for each title, paired with ``X`` by position; written via ``str()``.
    filename : str
        Name of the file to create inside the ``data/`` directory.
    """
    prefix = '__label__'
    # The context manager closes the file even if a title fails to process,
    # and the explicit UTF-8 encoding stops non-ASCII titles from depending on
    # the platform's default encoding.
    with open('data/' + filename, 'w', encoding='utf-8') as f:
        for title, label in zip(X, y):
            # simple_preprocess lower-cases the text itself, so no separate
            # .lower() call is needed.
            tokens = [
                token
                for token in utils.simple_preprocess(title)
                if token not in ENGLISH_STOP_WORDS
            ]
            f.write(prefix + str(label) + ' ' + ' '.join(tokens) + '\n')
    
format_for_fastext(X_train, y_train, 'reddit_fasttext_train.txt')
format_for_fastext(X_val, y_val, 'reddit_fasttext_val.txt')
format_for_fastext(X_test, y_test, 'reddit_fasttext_test.txt')

In [57]:
def test_fasttext(y, X, classifier, n=1):
    match = []
    for true, string in zip(y, X):
        predictions = list(classifier.predict(string, n)[0])
        for i in range(n):
            predictions[i] = int(predictions[i].split('__label__')[1])
        match.append(int(true in predictions))
    return np.array(match)

In [184]:
import fastText as fasttext

classifier = fasttext.train_supervised(input='data/reddit_fasttext_train.txt',
                                 lr=0.1,
                                 epoch=30,
                                 dim=64,
                                 minn=2,
                                 maxn=5
                                )

# Top-1 accuracy of the fastText classifier on the validation split.
# NOTE(review): the raw titles in X_val are passed to predict(), whereas the
# training file written by format_for_fastext was lower-cased, tokenised and
# stop-word filtered — confirm this train/eval mismatch is intended.
correct = test_fasttext(y_val, X_val, classifier)
correct.sum() / y_val.size

0.32002627722472082

In [53]:
# classifier.save_model('models/fasttext.bin')

In [180]:
from fastText import load_model

classifier = load_model('models/fasttext.bin')

In [183]:
correct = test_fasttext(y_val, X_val, classifier, 10)
correct.sum() / y_val.size

0.72732326039719031