In [29]:
import pandas as pd
data = pd.read_csv('data/cs_subs.csv')  # unzip this file

In [30]:
len(data['subreddit'].unique())

136

In [31]:
data.shape

(624289, 3)

In [32]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24389
softwaregore           23746
web_design             22159
ProgrammerHumor        19208
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [33]:
data.sample(10)

Unnamed: 0,title,score,subreddit
119376,What phone should I get?,1,Android
378136,NATIVE sounds,0,raspberry_pi
542046,African Americans as percentage of local popul...,8,dataisbeautiful
43311,How does a succesful data science team look like?,0,datascience
332620,Wall Street Furious Over BitCoin Millionaires,1,dataisbeautiful
7243,Must it or mustn't it?!,6,softwaregore
466066,2016 American Community Survey (ACS) Data Map,1,dataisbeautiful
213322,[Co-ops &amp; Internships] Help? I chose a Uni...,0,cscareerquestions
324399,Google is now offering a Mobile Sites certific...,1,Web_Development
368673,DevOps: How to Give Your Business Velocity,1,node


#### We are filtering non-latin words, and also subreddits that have fewer than 800 posts.

In [34]:
# Posts per subreddit, restricted to the subreddits with more than 800 posts.
counts = data['subreddit'].value_counts().loc[lambda n_posts: n_posts > 800]
top_values = counts.index.tolist()
# The `~` keeps only the rows whose subreddit is NOT in `top_values`, i.e. the
# subreddits with at most 800 posts.
# NOTE(review): the name `top_values` and the accompanying prose could be read
# the other way round (keep the large subreddits) — confirm the negation is
# intended before relying on the downstream numbers.
data = data.loc[~data['subreddit'].isin(top_values)]

In [35]:
import unicodedata as ud

# Memo of per-character results so each distinct character is classified once.
latin_letters = {}


def is_latin(uchr):
    """Return True if the Unicode name of the character ``uchr`` contains 'LATIN'.

    Results are cached in the module-level ``latin_letters`` dict.

    Parameters
    ----------
    uchr : str
        A single Unicode character.

    Returns
    -------
    bool
        True when the character's Unicode name contains 'LATIN'; False
        otherwise, including for characters that have no Unicode name.
    """
    try:
        return latin_letters[uchr]
    except KeyError:
        # Unnamed characters (e.g. control characters such as '\n') make
        # ud.name() raise ValueError unless a default is supplied; treat them
        # as non-Latin instead of crashing.
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr, ''))


def only_roman_chars(unistr):
    """Return True when every alphabetic character of ``unistr`` is Latin.

    Non-alphabetic characters (digits, punctuation, whitespace, ...) are
    ignored, so a string without any letters yields True.
    """
    for uchr in unistr:
        # Only letters are checked (isalpha filter suggested by John Machin).
        if uchr.isalpha() and not is_latin(uchr):
            return False
    return True


In [36]:
# Flag each row by whether its subreddit name consists of Latin letters only.
# NOTE(review): the check runs on the 'subreddit' column, not on 'title' —
# confirm this is the column that was meant to be screened.
data['is_latin'] = data['subreddit'].map(only_roman_chars)

In [37]:
# Keep only the rows flagged as Latin.
data = data.loc[data['is_latin'].eq(True)]

In [38]:
data.dropna().shape

(14019, 4)

In [39]:
data.drop_duplicates().shape

(13949, 4)

In [40]:
data = data.dropna().drop_duplicates()

In [41]:
data.shape

(13949, 4)

In [42]:
data.sample(20)

Unnamed: 0,title,score,subreddit,is_latin
99131,Look who’s here! Long time no see!,0,nginx,True
243348,How to search good places to travel (mongoimpo...,1,mongodb,True
157617,"Looking for contributors for ruby gem project,...",6,rubyonrails,True
588946,How advanced are plagiarism detection algorith...,5,LanguageTechnology,True
46747,Free аnd well trustеd Intеrnеt dаting websitе ...,0,learnruby,True
75880,A Rubyist's Guide to Postgresql's Explain,1,PostgreSQL,True
5407,Surface Book and build 15019 problems,5,windowsinsiders,True
522027,Building a Database from Scratch - Benoit Ches...,11,erlang,True
149451,How to Remove Topgamesnetwork.com Totally?,1,browsers,True
351150,"Code for ""Labeling the Semantic Roles of Comma...",4,LanguageTechnology,True


In [43]:
X = data['title']
y = data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [44]:
from sklearn.model_selection import train_test_split
# First carve off 40% of the rows, then halve that hold-out into test and
# validation parts (fixed seeds keep both splits repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=17
)
X_test, X_val, y_test, y_val = train_test_split(
    X_test, y_test, test_size=0.5, random_state=31
)
# Report the size of every split, features first and labels second.
for split in (X_train, X_test, X_val, y_train, y_test, y_val):
    print(split.shape)

(8369,)
(2790,)
(2790,)
(8369,)
(2790,)
(2790,)


# Baseline
Simple baseline using tf-idf based approaches

In [45]:
from sklearn.preprocessing import LabelEncoder


# Fit on every subreddit present in `data` so that all three splits share the
# same subreddit -> integer mapping, then encode each split's labels.
label_encoder = LabelEncoder().fit(data['subreddit'])
y_train, y_val, y_test = (
    label_encoder.transform(labels) for labels in (y_train, y_val, y_test)
)

In [46]:
len(label_encoder.classes_)

51

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer


# Unigram TF-IDF features with English stop words removed. The vectorizer is
# fitted on the training titles only; validation and test titles are merely
# transformed with the fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors, X_test_vectors = (
    vectorizer.transform(titles) for titles in (X_val, X_test)
)

In [48]:
import numpy as np


def top_n_accuracy(y_true, probs, n=5):
    """Fraction of samples whose true label is among the ``n`` top-scored classes.

    Parameters
    ----------
    y_true : array-like of int, length n_samples
        True class indices; each value is compared against column indices
        of ``probs``.
    probs : array-like of float, shape (n_samples, n_classes)
        Per-class scores for every sample (higher means more likely).
    n : int, default 5
        How many of the highest-scoring classes count as a hit.

    Returns
    -------
    float
        Share of samples whose true label is in the top ``n``; NaN when
        there are no samples.

    Raises
    ------
    ValueError
        If ``y_true`` and ``probs`` describe a different number of samples.
    """
    # Accept plain lists / Series as well as NumPy arrays.
    y_true = np.asarray(y_true).reshape(-1)
    probs = np.asarray(probs)
    n_samples = y_true.shape[0]
    if probs.shape[0] != n_samples:
        raise ValueError(
            'y_true has %d samples but probs has %d rows'
            % (n_samples, probs.shape[0])
        )
    if n_samples == 0:
        # Accuracy over zero samples is undefined.
        return float('nan')
    # Sorting the negated scores ascending puts the largest scores first;
    # the first n columns are therefore the top-n class indices per row.
    top_n = np.argsort(-probs, axis=1)[:, :n]
    hits = (top_n == y_true[:, np.newaxis]).any(axis=1)
    return np.sum(hits) / n_samples

In [49]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [50]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [51]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))

Top 1 accuracy:
 0.505734767025
Top 5 accuracy:
 0.688888888889


In [52]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [53]:
svm_predictions = svm.predict(X_val_vectors)

from sklearn.metrics import accuracy_score

accuracy_score(y_val, svm_predictions)

0.65161290322580645

In [181]:
from gensim.models import Doc2Vec
from gensim import utils
from gensim.models.doc2vec import TaggedDocument


# Build one TaggedDocument per training title; the document's only tag is the
# title's encoded subreddit label.
tagged_documents = []
tokens = None
for text, label in zip(X_train, y_train):
    text = text.lower()
    tokens = utils.simple_preprocess(text)
    # The append must be its own statement inside the loop body (it was fused
    # onto the previous line, which is a syntax error).
    tagged_documents.append(TaggedDocument(tokens, [label]))

In [182]:
tagged_documents[:5]

[TaggedDocument(words=['going', 'spam', 'links', 'zsh'], tags=[50]),
 TaggedDocument(words=['citus', 'works'], tags=[9]),
 TaggedDocument(words=['framework', 'building', 'websites', 'express', 'handlebars', 'mysql'], tags=[35]),
 TaggedDocument(words=['just', 'mempty', 'gt'], tags=[23]),
 TaggedDocument(words=['paul', 'vixie', 'dns', 'fame', 'keynote', 'postgresopen', 'sv'], tags=[9])]

In [183]:
# Doc2Vec model created without a corpus; vocabulary building and training
# happen in a later cell. `size=64` is the dimensionality of the vectors.
# NOTE(review): `size` and `iter` look like the older gensim keyword names
# (newer releases appear to use `vector_size` / `epochs`) — confirm against
# the installed gensim version. No seed is set and workers=4, so results are
# presumably not reproducible run to run — verify.
model = Doc2Vec(size=64, window=7, min_count=1, workers=4, iter=20)

In [184]:
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.iter)

1046423

In [185]:
X_train_vectors = X_train.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values

In [186]:
# Stack the per-title vectors into one 2-D float array.
# `np.float` was only an alias of the builtin `float` and has been removed
# from recent NumPy releases, so the builtin is used directly.
X_train_vectors = np.array(list(X_train_vectors), dtype=float)

In [187]:
X_train_vectors.shape

(8369, 64)

In [188]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [189]:
# Infer a Doc2Vec vector for every validation title, then stack them into one
# 2-D float array. `np.float` was only an alias of the builtin `float` and has
# been removed from recent NumPy releases, so the builtin is used directly.
X_val_vectors = X_val.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values
X_val_vectors = np.array(list(X_val_vectors), dtype=float)

#### There are still some rows that contain abnormal vectors. 

In [190]:
svm_predictions = svm.predict(X_val_vectors)

In [191]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, svm_predictions)

0.49713261648745521

In [98]:
# Immutable set of lower-case English stop words. It is used below to drop
# stop-word tokens from titles before they are written to the fastText files.
# NOTE(review): this appears to mirror scikit-learn's built-in English
# stop-word list (the one selected by stop_words='english') — confirm, and
# consider importing it instead of keeping a copy.
ENGLISH_STOP_WORDS = frozenset([
    "a", "about", "above", "across", "after", "afterwards", "again", "against",
    "all", "almost", "alone", "along", "already", "also", "although", "always",
    "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
    "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are",
    "around", "as", "at", "back", "be", "became", "because", "become",
    "becomes", "becoming", "been", "before", "beforehand", "behind", "being",
    "below", "beside", "besides", "between", "beyond", "bill", "both",
    "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con",
    "could", "couldnt", "cry", "de", "describe", "detail", "do", "done",
    "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else",
    "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
    "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
    "find", "fire", "first", "five", "for", "former", "formerly", "forty",
    "found", "four", "from", "front", "full", "further", "get", "give", "go",
    "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
    "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
    "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
    "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
    "latterly", "least", "less", "ltd", "made", "many", "may", "me",
    "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
    "move", "much", "must", "my", "myself", "name", "namely", "neither",
    "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
    "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
    "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
    "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
    "please", "put", "rather", "re", "same", "see", "seem", "seemed",
    "seeming", "seems", "serious", "several", "she", "should", "show", "side",
    "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
    "something", "sometime", "sometimes", "somewhere", "still", "such",
    "system", "take", "ten", "than", "that", "the", "their", "them",
    "themselves", "then", "thence", "there", "thereafter", "thereby",
    "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
    "third", "this", "those", "though", "three", "through", "throughout",
    "thru", "thus", "to", "together", "too", "top", "toward", "towards",
    "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
    "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
    "whence", "whenever", "where", "whereafter", "whereas", "whereby",
    "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
    "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
    "within", "without", "would", "yet", "you", "your", "yours", "yourself",
"yourselves"])


def format_for_fastext(X, y, filename):
    """Write titles and labels to ``data/<filename>``, one labelled line per title.

    Each output line has the form ``__label__<label> <token> <token> ...``,
    where the tokens are the lower-cased title run through
    ``utils.simple_preprocess`` with ``ENGLISH_STOP_WORDS`` removed.

    Parameters
    ----------
    X : iterable of str
        Titles.
    y : iterable
        Labels paired positionally with ``X``; each is written via ``str()``.
    filename : str
        File name created (or overwritten) inside the ``data/`` directory.
    """
    prefix = '__label__'
    # `with` closes the file even if tokenising or writing raises. UTF-8 is
    # set explicitly so titles with non-ASCII characters do not depend on
    # the platform's default encoding.
    with open(''.join(['data/', filename]), 'w', encoding='utf-8') as f:
        for title, label in zip(X, y):
            tokens = [
                token
                for token in utils.simple_preprocess(title.lower())
                if token not in ENGLISH_STOP_WORDS
            ]
            f.write(''.join([prefix, str(label), ' ', ' '.join(tokens), '\n']))
    
format_for_fastext(X_train, y_train, 'reddit_fasttext_train.txt')
format_for_fastext(X_val, y_val, 'reddit_fasttext_val.txt')
format_for_fastext(X_test, y_test, 'reddit_fasttext_test.txt')

In [168]:
import fasttext

classifier = fasttext.supervised('data/reddit_fasttext_train.txt', 'model', 
                                 label_prefix='__label__', 
                                 lr=0.1,
                                 epoch=24,
                                 dim=64,
                                 minn=2
                                )
results = classifier.test('data/reddit_fasttext_val.txt')
print(results.precision)
print(results.recall)

0.6232974910394266
0.6232974910394266


In [169]:
results = classifier.test('data/reddit_fasttext_test.txt')
print(results.precision)
print(results.recall)

0.6229390681003584
0.6229390681003584


In [79]:
classifier.predict_proba('Function Currying in C (lame), Because Boredom')

[[('22', 0.521484)],
 [('29', 0.712891)],
 [('15', 0.259766)],
 [('12', 0.988281)],
 [('23', 0.900391)],
 [('50', 0.412109)],
 [('22', 0.521484)],
 [('15', 0.259766)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('29', 0.712891)],
 [('22', 0.431641)],
 [('22', 0.431641)],
 [('30', 0.955078)],
 [('50', 0.412109)],
 [('15', 0.259766)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('50', 0.412109)],
 [('15', 0.259766)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.376953)],
 [('22', 0.521484)],
 [('25', 0.574219)],
 [('30', 0.623047)],
 [('22', 0.521484)],
 [('22', 0.429688)],
 [('34', 0.720703)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('12', 0.988281)],
 [('25', 0.574219)],
 [('29', 0.712891)],
 [('49', 0.3125)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.431641)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('22', 0.521484)],
 [('30', 0.623047)]]