In [283]:
import pandas as pd
data = pd.read_csv('data/cs_subs.csv')  # unzip this file

In [284]:
len(data['subreddit'].unique())

136

In [285]:
data.shape

(624289, 3)

In [286]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24389
softwaregore           23746
web_design             22159
ProgrammerHumor        19208
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [287]:
data.sample(10)

Unnamed: 0,title,score,subreddit
25254,Is my java textbook purposefully trying to con...,5,learnjava
138229,Laravel own variable in scope?,1,laravel
54398,Upcycling Old Pocket PC,3,raspberry_pi
50538,Public Cloud Provider IPv6 Test Sites,8,networking
581533,[help] What's wrong with my chrome screen?,14,chrome
458433,Is there ever any advantage to using static fu...,0,learnprogramming
247345,Can't get past Web Developer coding challenges,2,webdev
623766,"Online store of phones, smartphones and tablets",0,coolgithubprojects
314451,Which standard to use?,28,cpp
114680,Timelapse advice,3,raspberry_pi


#### We are filtering non-latin workds, and also subreddits that have less than 800 posts. 

In [288]:
counts = data['subreddit'].value_counts()
counts = counts[counts > 800]
top_values = list(counts.index)
data = data[~data['subreddit'].isin(top_values)]

In [289]:
import unicodedata as ud

latin_letters = {}


def is_latin(uchr):
    try:
        return latin_letters[uchr]
    except KeyError:
        return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))


def only_roman_chars(unistr):
    return all(is_latin(uchr)
               for uchr in unistr
               if uchr.isalpha())  # isalpha suggested by John Machin


In [290]:
data['is_latin'] = data['subreddit'].apply(only_roman_chars)

In [291]:
data = data[data['is_latin'] == True]

In [292]:
data.dropna().shape

(14019, 4)

In [293]:
data.drop_duplicates().shape

(13949, 4)

In [294]:
data = data.dropna().drop_duplicates()

In [295]:
data.shape

(13949, 4)

In [296]:
data.sample(20)

Unnamed: 0,title,score,subreddit,is_latin
304716,Mejores navegadores 2017,1,browsers,True
244053,Best way to go about counting the date/time-stamp,2,PostgreSQL,True
317994,Authenticate with SQL and data extract with Ca...,1,DatabaseHelp,True
41444,I made an Ubuntu Inspired MD Wallpaper,3,MaterialDesign,True
331505,Java Redis Mock - Mock Library For Testing Aga...,5,redis,True
29942,inconstant file format,0,Julia,True
182800,int4 vs int8 vs uuid vs numeric performance on...,7,PostgreSQL,True
157090,A Dart REPL (read-eval-print loop) proof-of-co...,11,dartlang,True
122751,Backup all photos and videos from single group...,1,chrome_extensions,True
564627,"[Payday 2] Yeah, pretty sure that's how hackin...",526,itsaunixsystem,True


In [297]:
X = data['title']
y = data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [298]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=31)
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(8369,)
(2790,)
(2790,)
(8369,)
(2790,)
(2790,)


#Baseline
Simple baseline using tf-idf based approaches

In [299]:
from sklearn.preprocessing import LabelEncoder


label_encoder = LabelEncoder()
label_encoder.fit(data['subreddit'])
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [300]:
len(label_encoder.classes_)

51

In [301]:
from sklearn.feature_extraction.text import TfidfVectorizer


vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [302]:
import numpy as np


def top_n_accuracy(y_true, probs, n=5):
    top_n_list = []
    for prob in probs:
        top_n_list.append(np.argsort(-prob)[:n])
    predictions = []
    for prediction, top_n in zip(y_true, top_n_list):
        predictions.append(int(prediction in top_n))
    return np.sum(predictions) / y_true.shape[0]

In [303]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [304]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [305]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))

Top 1 accuracy:
 0.505734767025
Top 5 accuracy:
 0.688888888889


In [306]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [307]:
svm_predictions = svm.predict(X_val_vectors)

from sklearn.metrics import accuracy_score

accuracy_score(y_val, svm_predictions)

0.65161290322580645

In [381]:
from gensim.models import Doc2Vec
from gensim import utils
from gensim.models.doc2vec import TaggedDocument


tagged_documents = []
tokens = None
for text, label in zip(X_train, y_train):
    tokens = utils.simple_preprocess(text)
    tagged_documents.append(TaggedDocument(tokens, [label]))

In [382]:
tagged_documents[:5]

[TaggedDocument(words=['what', 'is', 'going', 'on', 'with', 'spam', 'links', 'on', 'zsh'], tags=[50]),
 TaggedDocument(words=['how', 'citus', 'works'], tags=[9]),
 TaggedDocument(words=['framework', 'for', 'building', 'websites', 'express', 'handlebars', 'mysql'], tags=[35]),
 TaggedDocument(words=['just', 'mempty', 'gt', 'nothing'], tags=[23]),
 TaggedDocument(words=['paul', 'vixie', 'of', 'dns', 'fame', 'to', 'keynote', 'postgresopen', 'sv'], tags=[9])]

In [383]:
model = Doc2Vec(size=64, window=7, min_count=1, workers=4, iter=20)

In [384]:
model.build_vocab(tagged_documents)
model.train(tagged_documents, total_examples=model.corpus_count, epochs=model.iter)

1316877

In [385]:
X_train_vectors = X_train.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values

In [386]:
X_train_vectors = np.array(list(X_train_vectors), dtype=np.float)

In [387]:
X_train_vectors.shape

(8369, 64)

In [388]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [389]:
X_val_vectors = X_val.map(lambda title: model.infer_vector(utils.simple_preprocess(title))).values
X_val_vectors = np.array(list(X_val_vectors), dtype=np.float)

#### There are still some rows that contain abnormal vectors. 

In [390]:
svm_predictions = svm.predict(X_val_vectors)

In [391]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, svm_predictions)

0.52222222222222225

In [392]:
import fasttext

In [None]:
def 