In [1]:
import pandas as pd
# Load the raw posts. data/cs_subs.csv must be unzipped first
# (presumably it is distributed as an archive; confirm where it comes from).
data = pd.read_csv('data/cs_subs.csv')

In [2]:
# Number of distinct values in the subreddit column.
data['subreddit'].unique().size

136

In [3]:
data.shape

(624289, 3)

In [4]:
data.dropna().shape

(624281, 3)

In [5]:
data = data.dropna()

In [6]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [7]:
data.sample(10)

Unnamed: 0,title,score,subreddit
356711,Does anyone with experience in advertising kno...,1,web_design
337044,Windows 10 help,1,Windows10
330488,Bash does not automatically start when opening...,1,raspberry_pi
549498,Clustering your Elixir application on AWS insi...,1,elixir
496264,Google Assistant comes to the Nvidia Shield TV...,2,Android
445789,"Microservices with Docker, Flask, and React - ...",51,webdev
463082,Red Hat on the recent bluetooth security problem,25,linux
163424,Happy about my new vanity plates.,1,linux
503040,Software to copy manually added songs from iPh...,2,ios
22671,What is your experience with bcachefs so far? ...,15,linux


#### We are filtering out subreddits that have 2000 posts or fewer.

In [8]:
# Posts per subreddit; only subreddits with more than 2000 posts are kept.
posts_per_subreddit = data['subreddit'].value_counts()
counts = posts_per_subreddit[posts_per_subreddit > 2000]

In [9]:
# Restrict the frame to the subreddits that survived the post-count threshold.
top_values = counts.index.tolist()
in_top_subreddits = data['subreddit'].isin(top_values)
data = data[in_top_subreddits]

In [10]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
aws                     5112
javahelp                4829
Web_Development         4732
reactjs                 4533
golang        

In [11]:
data.shape

(574249, 3)

In [12]:
data['subreddit'].unique().shape

(57,)

We have a lot of data, especially for my MacBook. Let's compute the average Reddit score (upvotes minus downvotes) for each subreddit and keep only the posts that score at least their subreddit's average. I want to use the mean rather than the median, since the median would just arbitrarily cut each subreddit's data in half. Hopefully filtering by the mean will take relatively larger chunks out of the more popular subreddits than out of the less popular ones.

In [13]:
# Mean score per subreddit as a {subreddit: mean} dict.
# A single grouped pass replaces the original loop, which re-scanned the whole
# frame once for every subreddit.
means = data.groupby('subreddit')['score'].mean().to_dict()

In [14]:
means

{'Android': 75.0088722590765,
 'Angular2': 5.473166368515206,
 'AskNetsec': 6.923273657289003,
 'C_Programming': 6.593274988484569,
 'Database': 2.3162813575996064,
 'ProgrammerHumor': 460.36457357075915,
 'Python': 19.62010972209731,
 'Web_Development': 1.165680473372781,
 'Windows10': 14.500180336146578,
 'androiddev': 11.89121421520237,
 'androidthemes': 30.770867637745408,
 'arduino': 12.679683432260752,
 'artificial': 7.961448293826518,
 'aws': 6.961463223787168,
 'chrome': 5.607130672125554,
 'coding': 13.21768140116764,
 'compsci': 21.0,
 'computerscience': 5.429937355753379,
 'cpp': 21.44868469803631,
 'cscareerquestions': 9.30821314710794,
 'csharp': 11.75581084691424,
 'css': 5.632745364088648,
 'csshelp': 1.5042659605766402,
 'dataisbeautiful': 316.7866983762506,
 'datascience': 8.189430591737997,
 'django': 5.676748582230624,
 'dotnet': 10.152072072072071,
 'golang': 13.324040219378428,
 'hackernews': 4.136428956349919,
 'haskell': 24.44258453297623,
 'iOSBeta': 11.14088778

In [15]:
import numpy as np


# One frame per subreddit, holding only the posts whose score is at least that
# subreddit's mean. Frames are kept in order of first appearance of the subreddit.
filtered = [
    data.loc[(data['subreddit'] == name) & (data['score'] >= means[name])]
    for name in data['subreddit'].unique()
]

In [16]:
filtered_data = pd.concat(filtered)

In [17]:
filtered_data['subreddit'].value_counts()

Android                6807
linuxquestions         3893
cscareerquestions      3772
learnpython            3081
webdev                 2565
hackernews             2563
iOSBeta                2424
Windows10              2338
linux4noobs            2275
ProgrammerHumor        2194
networking             2174
androiddev             2026
linux                  2013
windows                2005
javascript             1880
learnprogramming       1813
softwaregore           1746
ios                    1737
java                   1672
androidthemes          1664
chrome                 1548
Python                 1474
aws                    1471
rust                   1466
web_design             1361
javahelp               1326
arduino                1205
iOSProgramming         1190
mac                    1150
csshelp                1101
datascience            1081
AskNetsec              1079
golang                 1058
raspberry_pi           1037
reactjs                1030
haskell             

In [18]:
filtered_data.shape

(83852, 3)

In [19]:
filtered_data.drop_duplicates().shape

(83744, 3)

In [20]:
filtered_data = filtered_data.drop_duplicates()

In [21]:
filtered_data.sample(20)

Unnamed: 0,title,score,subreddit
503735,300ms delay on iOS full screen web apps,6,Angular2
614696,Irl software gore,162,softwaregore
116324,"Samsung Galaxy S8+ (Plus, SM-G955F) Smartphone...",215,Android
421977,"npm monthly: Chai 4, ESLint 4, webpack 2.6 and...",14,javascript
63499,I bough the the cheapest One Hung Lo soldering...,38,arduino
37783,"Xiaomi Redmi Pro 2 leaked: Snapdragon 660, 6GB...",185,Android
449939,Boeing 787 In Flight Entertainment System Secu...,9,hackernews
508909,Our blind people targeted app is doing well......,308,androiddev
100041,A list of resources that helped me learn Lists...,64,learnpython
420917,Anyone have experience with Onenote in Linux?,6,linux4noobs


In [22]:
X = filtered_data['title']
y = filtered_data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [23]:
from sklearn.model_selection import train_test_split

# First carve off 40% of the data, then halve that held-out part into test and
# validation sets (60% / 20% / 20% overall).
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=31)

# Report the size of every split.
for split in (X_train, X_test, X_val, y_train, y_test, y_val):
    print(split.shape)

(50246,)
(16749,)
(16749,)
(50246,)
(16749,)
(16749,)


# Baseline
Simple baseline using tf-idf based approaches

In [24]:
from sklearn.preprocessing import LabelEncoder


# Fit the encoder on every subreddit present in `data` so each gets an integer id.
label_encoder = LabelEncoder().fit(data['subreddit'])

# Replace the string labels of each split with their integer ids.
y_train, y_val, y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

In [25]:
len(label_encoder.classes_)

57

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Unigram-only TF-IDF features, with stop_words='english' passed to the vectorizer.
vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
# The vectorizer is fitted on the training titles only; the validation and test
# titles are transformed with that already-fitted vectorizer.
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [27]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def top_n_accuracy(y_true, probs, n=5):
    """Return the fraction of samples whose true label is among the ``n`` best-scored classes.

    Parameters
    ----------
    y_true : array-like of int, shape (n_samples,)
        True labels, expressed as column indices into ``probs``.
    probs : array-like of float, shape (n_samples, n_classes)
        Per-class scores for every sample; a higher score ranks earlier.
    n : int, default 5
        How many of the highest-scored classes count as a hit.

    Returns
    -------
    float
        Hits divided by the number of samples; ``nan`` when there are no samples.

    Raises
    ------
    ValueError
        If ``y_true`` and ``probs`` do not contain the same number of samples.
    """
    y_true = np.asarray(y_true)
    probs = np.asarray(probs)

    # Pairing the two with zip() would silently drop the surplus rows while the
    # denominator still counted every label, so a mismatch is rejected up front.
    if probs.shape[0] != y_true.shape[0]:
        raise ValueError(
            'y_true has {} samples but probs has {}'.format(y_true.shape[0], probs.shape[0])
        )
    if y_true.shape[0] == 0:
        return float('nan')

    # Negating the scores makes argsort rank the highest score first.
    top_n = np.argsort(-probs, axis=1)[:, :n]
    hits = (top_n == y_true[:, np.newaxis]).any(axis=1)
    return hits.sum() / y_true.shape[0]

In [28]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [30]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))
print(classification_report(y_val, nb_predictions))

Top 1 accuracy:
 0.343901128426
Top 5 accuracy:
 0.687444026509
             precision    recall  f1-score   support

          0       0.20      0.99      0.34      1363
          1       0.94      0.32      0.48       180
          2       0.50      0.00      0.01       225
          3       0.00      0.00      0.00       106
          4       0.00      0.00      0.00        67
          5       0.60      0.09      0.16       449
          6       0.67      0.05      0.10       301
          7       0.00      0.00      0.00        53
          8       0.55      0.37      0.44       452
          9       0.74      0.28      0.40       397
         10       0.97      0.40      0.57       326
         11       0.95      0.35      0.52       218
         12       0.94      0.20      0.33       171
         13       0.86      0.56      0.67       281
         14       0.84      0.26      0.40       330
         15       0.00      0.00      0.00        88
         16       0.00      0.00  

  'precision', 'predicted', average, warn_for)


In [31]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [32]:
svm_predictions = svm.predict(X_val_vectors)

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.53704698788
             precision    recall  f1-score   support

          0       0.76      0.88      0.81      1363
          1       0.85      0.82      0.84       180
          2       0.47      0.41      0.44       225
          3       0.18      0.11      0.14       106
          4       0.32      0.36      0.34        67
          5       0.30      0.35      0.32       449
          6       0.46      0.34      0.39       301
          7       0.12      0.04      0.06        53
          8       0.49      0.50      0.50       452
          9       0.59      0.62      0.61       397
         10       0.66      0.65      0.66       326
         11       0.74      0.71      0.72       218
         12       0.69      0.75      0.72       171
         13       0.75      0.75      0.75       281
         14       0.68      0.70      0.69       330
         15       0.12      0.07      0.09        88
         16       0.20      0.09      0.13       106
         17       0.31      0.2

In [33]:
from gensim.models import Doc2Vec
from gensim import utils
from gensim.models.doc2vec import TaggedDocument


# One TaggedDocument per training title, tagged with that title's encoded label.
# simple_preprocess lowercases while tokenizing, so no separate .lower() call is needed.
tagged_documents = [
    TaggedDocument(utils.simple_preprocess(text), [label])
    for text, label in zip(X_train, y_train)
]

In [34]:
tagged_documents[:5]

[TaggedDocument(words=['forward', 'incoming', 'sms', 'messages', 'to', 'email', 'with', 'node', 'js', 'sendgrid', 'and', 'twilio', 'functions'], tags=[44]),
 TaggedDocument(words=['how', 'are', 'the', 'players', 'in', 'the', 'augmented', 'reality', 'mlb', 'at', 'bat', 'app', 'tracked'], tags=[31]),
 TaggedDocument(words=['google', 'photos', 'give', 'and', 'get', 'the', 'photos', 'you', 'care', 'about'], tags=[0]),
 TaggedDocument(words=['public', 'chat', 'room', 'using', 'firebase'], tags=[4]),
 TaggedDocument(words=['didn', 'know', 'where', 'to', 'post', 'this', 'but', 'it', 'made', 'me', 'smile'], tags=[5])]

In [35]:
# Doc2Vec configured with size=64, window=7, min_count=1, 4 workers and iter=20.
# No seed is passed, so repeated runs are not guaranteed to give identical vectors.
# NOTE(review): newer gensim releases appear to spell `size`/`iter` as
# `vector_size`/`epochs` — confirm against the installed gensim version.
model = Doc2Vec(size=64, window=7, min_count=1, workers=4, iter=20)

In [36]:
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.iter)

8857891

In [37]:
X_train_vectors = X_train.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values

In [38]:
# Stack the per-title vectors into one 2-D float array. The builtin `float` is used
# because the `np.float` alias (which was just the builtin float) was removed from NumPy.
X_train_vectors = np.array(list(X_train_vectors), dtype=float)

In [39]:
X_train_vectors.shape

(50246, 64)

In [40]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [41]:
# Infer a Doc2Vec vector for every validation title from its simple_preprocess tokens.
X_val_vectors = X_val.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values
# Builtin `float` replaces the `np.float` alias, which was removed from NumPy.
X_val_vectors = np.array(list(X_val_vectors), dtype=float)

In [42]:
svm_predictions = svm.predict(X_val_vectors)

In [43]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.323422293868
             precision    recall  f1-score   support

          0       0.35      0.67      0.46      1363
          1       0.61      0.77      0.68       180
          2       0.30      0.03      0.06       225
          3       0.00      0.00      0.00       106
          4       0.00      0.00      0.00        67
          5       0.18      0.10      0.13       449
          6       0.29      0.24      0.26       301
          7       0.00      0.00      0.00        53
          8       0.23      0.24      0.24       452
          9       0.25      0.32      0.28       397
         10       0.32      0.58      0.42       326
         11       0.40      0.59      0.48       218
         12       0.47      0.55      0.51       171
         13       0.34      0.57      0.43       281
         14       0.35      0.35      0.35       330
         15       0.00      0.00      0.00        88
         16       0.00      0.00      0.00       106
         17       0.00      0.

  'precision', 'predicted', average, warn_for)


In [44]:
# Immutable set of English stop words; format_for_fastext drops these tokens
# before writing the fastText input files.
# NOTE(review): this looks like a verbatim copy of scikit-learn's English
# stop-word list — confirm, and consider importing it instead of duplicating it.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])


def format_for_fastext(X, y, filename):
    """Write titles and labels to ``data/<filename>`` in fastText's supervised format.

    Every output line has the form ``__label__<label> <token> <token> ...``. The
    tokens come from ``utils.simple_preprocess(title)`` with every word found in
    ``ENGLISH_STOP_WORDS`` removed.

    Parameters
    ----------
    X : iterable of str
        Titles, one per sample.
    y : iterable
        Labels paired positionally with ``X``; each is written via ``str()`` formatting.
    filename : str
        Name of the file to create (or overwrite) inside the ``data`` directory.
    """
    import os

    prefix = '__label__'
    path = os.path.join('data', filename)
    # The context manager closes the file even if a title fails to tokenize, and
    # the explicit UTF-8 encoding keeps non-ASCII titles independent of the locale.
    with open(path, 'w', encoding='utf-8') as f:
        for title, label in zip(X, y):
            # simple_preprocess already lowercases, so no separate .lower() is needed.
            tokens = [token for token in utils.simple_preprocess(title)
                      if token not in ENGLISH_STOP_WORDS]
            f.write('{}{} {}\n'.format(prefix, label, ' '.join(tokens)))
    
format_for_fastext(X_train, y_train, 'reddit_fasttext_train.txt')
format_for_fastext(X_val, y_val, 'reddit_fasttext_val.txt')
format_for_fastext(X_test, y_test, 'reddit_fasttext_test.txt')

In [45]:
def test_fasttext(y, X, classifier, n=1):
    match = []
    for true, string in zip(y, X):
        predictions = list(classifier.predict(string, n)[0])
        for i in range(n):
            predictions[i] = int(predictions[i].split('__label__')[1])
        match.append(int(true in predictions))
    return np.array(match)

In [46]:
import fastText as fasttext

classifier = fasttext.train_supervised(input='data/reddit_fasttext_train.txt',
                                 lr=0.1,
                                 epoch=30,
                                 dim=64,
                                 minn=2,
                                 maxn=5
                                )

correct = test_fasttext(y_val, X_val, classifier)
correct.sum() / y_val.size

0.35417039823273033

In [49]:
classifier.save_model('models/fasttext.bin')

In [50]:
from fastText import load_model

classifier = load_model('models/fasttext.bin')

In [51]:
correct = test_fasttext(y_val, X_val, classifier, 10)
correct.sum() / y_val.size

0.78822616275598545

In [52]:
correct = test_fasttext(y_val, X_val, classifier, 5)
correct.sum() / y_val.size

0.66738312735088667