In [3]:
import pandas as pd
data = pd.read_csv('data/cs_subs.csv')  # unzip this file

In [4]:
len(data['subreddit'].unique())

136

In [5]:
data.shape

(624289, 3)

In [6]:
data.dropna().shape

(624281, 3)

In [7]:
data = data.dropna()

In [8]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [9]:
data.sample(10)

Unnamed: 0,title,score,subreddit
423610,I found this article in an old magazine and ha...,1,ProgrammerHumor
176128,My Rust server,1,rust
350321,Free Club Music Player App,0,macapps
11928,"Anti-virus software is just a terrible idea, w...",1,windows
584377,Introducing the Windows Console Colortool,64,Windows10
445763,How to create custom number pad keyboard in an...,3,androiddev
156759,Discord: Stoked to announce our super sick app...,142,linux
237445,Not a shill but cracked.com is offering discou...,1,learnprogramming
294108,[Review] Samsung Galaxy Jet and its 800 MHz pr...,1,Android
252857,Getting a CS degree at.. late age (long and so...,0,cscareerquestions


#### We are filtering non-latin words, and also subreddits that have less than 150 posts. 

In [10]:
counts = data['subreddit'].value_counts()
counts = counts[counts > 150]

In [11]:
top_values = list(counts.index)
data = data[data['subreddit'].isin(top_values)]

In [12]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24388
softwaregore           23741
web_design             22159
ProgrammerHumor        19206
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
windowsinsiders          683
jquery                   667
operabrowser             647
browsers      

In [13]:
data.shape

(622909, 3)

In [14]:
data['subreddit'].unique().shape

(117,)

We have a lot of data. Especially for my Macbook. Let's see the average reddit score (upvotes + downvotes) for each subreddit to filter out. I want to do mean and not median since median would just arbitrarily cut the data in half. Hopefully filtering by mean will take relatively larger chunks out of the more popular subreddits than the less popular ones.

In [15]:
means = {}
for subreddit in data['subreddit'].unique():
    means[subreddit] = data[data['subreddit'] == subreddit]['score'].mean()

In [16]:
means

{'Android': 75.0088722590765,
 'Angular2': 5.473166368515206,
 'AskComputerScience': 4.370666666666667,
 'AskNetsec': 6.923273657289003,
 'AutoHotkey': 1.8294736842105264,
 'C_Programming': 6.593274988484569,
 'Clojure': 17.862068965517242,
 'Database': 2.3162813575996064,
 'DatabaseHelp': 1.2147435897435896,
 'IOT': 1.9538461538461538,
 'Julia': 11.55299539170507,
 'LanguageTechnology': 5.160984848484849,
 'MLQuestions': 2.3911630929174787,
 'MaterialDesign': 10.139664804469273,
 'Meteor': 3.731012658227848,
 'OSXTweaks': 4.51010101010101,
 'PostgreSQL': 6.54421768707483,
 'ProgrammerHumor': 460.36457357075915,
 'Python': 19.62010972209731,
 'SQLServer': 3.208778173190985,
 'UI_Design': 4.922300706357215,
 'Web_Development': 1.165680473372781,
 'Windows10': 14.500180336146578,
 'androiddev': 11.89121421520237,
 'androidthemes': 30.770867637745408,
 'angularjs': 5.399414776883687,
 'arduino': 12.679683432260752,
 'artificial': 7.961448293826518,
 'aws': 6.961463223787168,
 'bash': 4.61

In [17]:
import numpy as np


filtered = []

for subreddit in data['subreddit'].unique():
    filtered.append(data.loc[(data['subreddit'] == subreddit) & (data['score'] >= means[subreddit])])

In [18]:
filtered_data = pd.concat(filtered)

In [19]:
filtered_data['subreddit'].value_counts()

Android                6807
linuxquestions         3893
cscareerquestions      3772
learnpython            3081
webdev                 2565
hackernews             2563
iOSBeta                2424
Windows10              2338
linux4noobs            2275
ProgrammerHumor        2194
networking             2174
androiddev             2026
linux                  2013
windows                2005
javascript             1880
learnprogramming       1813
softwaregore           1746
ios                    1737
java                   1672
androidthemes          1664
chrome                 1548
Python                 1474
aws                    1471
rust                   1466
web_design             1361
javahelp               1326
arduino                1205
iOSProgramming         1190
mac                    1150
csshelp                1101
                       ... 
operabrowser            221
mongodb                 197
windowsinsiders         189
macapps                 189
LanguageTechnology  

In [20]:
filtered_data.shape

(99057, 3)

In [21]:
filtered_data.drop_duplicates().shape

(98941, 3)

In [22]:
filtered_data = filtered_data.drop_duplicates()

In [23]:
filtered_data.sample(20)

Unnamed: 0,title,score,subreddit
192268,SQL Server Tuning advices,9,SQLServer
243232,Final solution to telephone number input problem,1195,ProgrammerHumor
440672,Network config network generator,14,networking
346799,"Nuances of Null – Using IsNull, Coalesce, Conc...",9,SQLServer
202502,A Visual Lexicon of LINQ - Simple Talk,13,dotnet
470515,1 KB JavaScript library for building frontend ...,20,coolgithubprojects
258719,ScalaJSON first milestone release on scala-pla...,15,scala
546569,Thoughts on alternatives ORMs,16,csharp
392089,The TODOs app is the new “Hello World”,64,reactjs
544313,US Condominium Average Monthly Rental Prices i...,6574,dataisbeautiful


In [24]:
X = filtered_data['title']
y = filtered_data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=31)
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(59364,)
(19788,)
(19789,)
(59364,)
(19788,)
(19789,)


#Baseline
Simple baseline using tf-idf based approaches

In [26]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
label_encoder.fit(data['subreddit'])
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [112]:
len(label_encoder.classes_)

117

In [127]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [128]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


def top_n_accuracy(y_true, probs, n=5):
    top_n_list = []
    for prob in probs:
        top_n_list.append(np.argsort(-prob)[:n])
    predictions = []
    for prediction, top_n in zip(y_true, top_n_list):
        predictions.append(int(prediction in top_n))
    return np.sum(predictions) / y_true.shape[0]

In [129]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [130]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [131]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))
print(classification_report(y_val, nb_predictions))

Top 1 accuracy:
 0.288392541311
Top 5 accuracy:
 0.592298751832
             precision    recall  f1-score   support

          0       0.17      0.99      0.28      1362
          1       0.73      0.32      0.44       180
          2       0.00      0.00      0.00       105
          3       0.56      0.02      0.04       216
          4       1.00      0.02      0.03       125
          5       0.00      0.00      0.00        91
          6       1.00      0.10      0.18       101
          7       0.00      0.00      0.00        69
          8       0.00      0.00      0.00        18
          9       1.00      0.01      0.02        89
         10       0.00      0.00      0.00        18
         11       0.00      0.00      0.00        38
         12       0.00      0.00      0.00       115
         13       0.00      0.00      0.00        13
         14       0.00      0.00      0.00        19
         15       0.00      0.00      0.00        20
         16       0.00      0.00  

  'precision', 'predicted', average, warn_for)


In [132]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [133]:
svm_predictions = svm.predict(X_val_vectors)

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.501288594674
             precision    recall  f1-score   support

          0       0.74      0.87      0.80      1362
          1       0.64      0.69      0.67       180
          2       0.18      0.10      0.13       105
          3       0.36      0.31      0.33       216
          4       0.47      0.50      0.48       125
          5       0.19      0.15      0.17        91
          6       0.81      0.70      0.75       101
          7       0.25      0.20      0.22        69
          8       0.50      0.06      0.10        18
          9       0.65      0.62      0.64        89
         10       0.88      0.78      0.82        18
         11       0.45      0.37      0.41        38
         12       0.61      0.46      0.52       115
         13       0.60      0.23      0.33        13
         14       0.88      0.74      0.80        19
         15       0.33      0.05      0.09        20
         16       0.65      0.60      0.62        52
         17       0.23      0.

In [134]:
from gensim.models import Doc2Vec
from gensim import utils
from gensim.models.doc2vec import TaggedDocument


tagged_documents = []
tokens = None
for text, label in zip(X_train, y_train):
    text = text.lower()
    tokens = utils.simple_preprocess(text)
    tagged_documents.append(TaggedDocument(tokens, [label]))

In [135]:
tagged_documents[:5]

[TaggedDocument(words=['lessons', 'learned', 'from', 'my', 'latest', 'game', 'iteration', 'from', 'stars', 'to', 'over'], tags=[71]),
 TaggedDocument(words=['little', 'asp', 'net', 'core', 'book'], tags=[112]),
 TaggedDocument(words=['responsive', 'background', 'images', 'with', 'javascript'], tags=[111]),
 TaggedDocument(words=['can', 'ssh', 'to', 'virtual', 'machine', 'when', 'ipv', 'is', 'set', 'to', 'automatic', 'dhcp', 'but', 'can', 'ssh', 'when', 'use', 'manual', 'and', 'input', 'my', 'own', 'addresses'], tags=[82]),
 TaggedDocument(words=['conversational', 'dataset'], tags=[51])]

In [136]:
model = Doc2Vec(size=64, window=7, min_count=1, workers=4, iter=20)

In [137]:
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.iter)

10406315

In [138]:
X_train_vectors = X_train.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values

In [139]:
X_train_vectors = np.array(list(X_train_vectors), dtype=np.float)

In [140]:
X_train_vectors.shape

(59364, 64)

In [141]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [142]:
X_val_vectors = X_val.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values
X_val_vectors = np.array(list(X_val_vectors), dtype=np.float)

In [143]:
svm_predictions = svm.predict(X_val_vectors)

In [144]:
from sklearn.metrics import accuracy_score

print(accuracy_score(y_val, svm_predictions))
print(classification_report(y_val, svm_predictions))

0.294557582495
             precision    recall  f1-score   support

          0       0.32      0.67      0.44      1362
          1       0.50      0.73      0.59       180
          2       0.00      0.00      0.00       105
          3       0.29      0.07      0.12       216
          4       0.67      0.03      0.06       125
          5       0.00      0.00      0.00        91
          6       0.54      0.50      0.52       101
          7       0.00      0.00      0.00        69
          8       0.00      0.00      0.00        18
          9       0.35      0.30      0.33        89
         10       0.67      0.11      0.19        18
         11       0.00      0.00      0.00        38
         12       0.45      0.09      0.15       115
         13       0.00      0.00      0.00        13
         14       0.00      0.00      0.00        19
         15       0.00      0.00      0.00        20
         16       0.50      0.02      0.04        52
         17       0.10      0.

  'precision', 'predicted', average, warn_for)


In [145]:
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])


def format_for_fastext(X, y, filename):
    prefix = '__label__'
    f = open(''.join(['data/', filename]), 'w')
    for title, label in zip(X, y):
        title = title.lower()
        tokens = utils.simple_preprocess(title)
        tokens = [token for token in tokens if token not in ENGLISH_STOP_WORDS]
        f.write(''.join([prefix, str(label), ' ', ' '.join(tokens), '\n']))
    f.close()
    
format_for_fastext(X_train, y_train, 'reddit_fasttext_train.txt')
format_for_fastext(X_val, y_val, 'reddit_fasttext_val.txt')
format_for_fastext(X_test, y_test, 'reddit_fasttext_test.txt')

In [116]:
classifier.predict(X_val.iloc[2], 5)

(('__label__24', '__label__17', '__label__107', '__label__68', '__label__63'),
 array([ 0.12639114,  0.06488035,  0.05542552,  0.03495135,  0.03151938]))

In [128]:
def test_fasttext(y, X, classifier, n = 1):
    match = []
    for true, string in zip(y, X):
        predictions = list(classifier.predict(string, n)[0])
        for i in range(n):
            predictions[i] = int(predictions[i].split('__label__')[1])
        match.append(int(true in predictions))
    return np.array(match)

In [92]:
import fastText as fasttext

classifier = fasttext.train_supervised(input='data/reddit_fasttext_train.txt',
                                 lr=0.1,
                                 epoch=30,
                                 dim=84,
                                 minn=2,
                                 maxn=5
                                )

correct = test_fasttext(y_val, X_val, classifier)
correct.sum() / y_val.size

TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'

In [93]:
# classifier.save_model('models/fasttext.bin')

In [88]:
from fastText import load_model

classifier = load_model('models/fasttext.bin')

In [132]:
correct = test_fasttext(y_val, X_val, classifier, 10)
correct.sum() / y_val.size

0.72782859164182123