In [3]:
import pandas as pd

data = pd.read_csv('data/cs_subs.csv')  # unzip this file

In [4]:
len(data['subreddit'].unique())

136

In [5]:
data.shape

(624289, 3)

In [6]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24389
softwaregore           23746
web_design             22159
ProgrammerHumor        19208
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

In [7]:
data.dropna().shape

(624281, 3)

In [8]:
data.drop_duplicates().shape

(615832, 3)

In [9]:
data = data.dropna().drop_duplicates()

In [10]:
data.shape

(615825, 3)

In [11]:
data.sample(20)

Unnamed: 0,title,score,subreddit
423502,Help with Rust,1,rust
72233,Beauty of the Perl 6 grammar (part 1),9,ProgrammerHumor
283158,How do I get interviews for junior positions w...,3,cscareerquestions
588513,SoftBank’s $100B fund is in a league of its own,2,hackernews
510035,Top Brochure Design India,1,web_design
306309,Google PLAY Support,1,androiddev
63554,Best Web Design &amp; Development Company,1,Web_Development
195653,Information Security and Tech Interview - What...,1,cscareerquestions
370878,Is this code affected by the Global Interprete...,1,Python
527098,Stock Options,0,cscareerquestions


In [12]:
X = data['title']
y = data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=31)

print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(369495,)
(123165,)
(123165,)
(369495,)
(123165,)
(123165,)


#Baseline
Simple baseline using tf-idf based approaches

In [14]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(data['subreddit'])
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1, 4), stop_words='english')
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [16]:
import numpy as np


def top_n_accuracy(y_true, probs, n=5):
    top_n_list = []
    for prob in probs:
        top_n_list.append(np.argsort(-prob)[:n])
    predictions = []
    for prediction, top_n in zip(y_true, top_n_list):
        predictions.append(int(prediction in top_n))
    return np.sum(predictions) / y_true.shape[0]

In [17]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [24]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))

Top 1 accuracy:
 0.323086915926


Top 5 accuracy:
 0.606389802298


In [20]:
from sklearn.svm import LinearSVC

svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [21]:
svm_predictions = svm.predict(X_val_vectors)

In [22]:
from sklearn.metrics import accuracy_score

accuracy_score(y_val, svm_predictions)

0.53853773393415338

[ 89 131   0  83  90]
[  0 124 131  77  89]
[  0  56  89  49 124]
[  0  89  49 131  25]
[131 130   0  89  49]
