In [111]:
import pandas as pd
data = pd.read_csv('data/reddit_posts.csv')  # unzip this file

In [112]:
len(data['subreddit'].unique())

136

In [113]:
data.shape

(624289, 3)

In [114]:
data['subreddit'].value_counts()

Android                61202
learnprogramming       35288
cscareerquestions      32935
Windows10              27726
webdev                 26849
dataisbeautiful        24389
softwaregore           23746
web_design             22159
ProgrammerHumor        19208
learnpython            17634
raspberry_pi           15659
iOSBeta                14508
linux                  14058
javascript             12971
linuxquestions         11464
hackernews             11134
Python                 11119
windows                10132
androiddev             10130
mac                     9841
ios                     9754
arduino                 9603
java                    9401
networking              9378
linux4noobs             8004
androidthemes           7895
chrome                  5862
iOSProgramming          5647
rust                    5489
datascience             5374
                       ...  
redis                    241
dartlang                 240
programmerreactions      237
Julia         

#### We are filtering non-latin workds, and also subreddits that have less than 800 posts. 

In [115]:
counts = data['subreddit'].value_counts()
counts = counts[counts > 800]
top_values = list(counts.index)
data = data[~data['subreddit'].isin(top_values)]

In [205]:
import unicodedata as ud
latin_letters= {}
def is_latin(uchr):
    try: return latin_letters[uchr]
    except KeyError:
         return latin_letters.setdefault(uchr, 'LATIN' in ud.name(uchr))

def only_roman_chars(unistr):
    return all(is_latin(uchr)
           for uchr in unistr
           if uchr.isalpha()) # isalpha suggested by John Machin

In [214]:
data['is_latin'] = data['subreddit'].apply(only_roman_chars)

In [218]:
data = data[data['is_latin'] == True]

In [219]:
data.dropna().shape

(13949, 4)

In [220]:
data.drop_duplicates().shape

(13949, 4)

In [221]:
data = data.dropna().drop_duplicates()

In [222]:
data.shape

(13949, 4)

In [223]:
data.sample(20)

Unnamed: 0,subreddit,title,score,is_latin
186512,zsh,The way I see it,1,True
360873,symfony,Symfony 3.4.0-BETA1 released,1,True
345529,PostgreSQL,amcheck: Functions for verifying PostgreSQL re...,7,True
120727,windowsinsiders,Chrome/Chromium based web-browsers issues with...,1,True
33036,browsers,Best incognito mode hint ever,7,True
68522,Meteor,Newbie questions,4,True
418401,PostgreSQL,PGConf 2018: Call for papers now open!,7,True
368014,windowsinsiders,"An update is being prepared your device, but i...",5,True
269696,rubyonrails,Learning Ruby: From Zero to Hero. A Complete G...,1,True
223997,nginx,NGINX Plus R12 Now Available,8,True


In [224]:
X = data['title']
y = data['subreddit']

#### Splitting data into train (60%), val (20%), and test (20%).

In [225]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=17)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=31)
print(X_train.shape)
print(X_test.shape)
print(X_val.shape)
print(y_train.shape)
print(y_test.shape)
print(y_val.shape)

(8369,)
(2790,)
(2790,)
(8369,)
(2790,)
(2790,)


#Baseline
Simple baseline using tf-idf based approaches

In [226]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(data['subreddit'])
y_train = label_encoder.transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

In [227]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 4), stop_words='english')
X_train_vectors = vectorizer.fit_transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [228]:
import numpy as np
def top_n_accuracy(y_true, probs, n=5):
    top_n_list = []
    for prob in probs:
        top_n_list.append(np.argsort(-prob)[:n])
    predictions = []
    for prediction, top_n in zip(y_true, top_n_list):
        predictions.append(int(prediction in top_n))
    return np.sum(predictions) / y_true.shape[0]

In [229]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(X_train_vectors, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [230]:
nb_predictions = nb.predict(X_val_vectors)
nb_probs = nb.predict_proba(X_val_vectors)

In [231]:
print('Top 1 accuracy:\n', top_n_accuracy(y_val, nb_probs, 1))
print('Top 5 accuracy:\n', top_n_accuracy(y_val, nb_probs, 5))

Top 1 accuracy:
 0.467025089606
Top 5 accuracy:
 0.654838709677


In [232]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [233]:
svm_predictions = svm.predict(X_val_vectors)
from sklearn.metrics import accuracy_score
accuracy_score(y_val, svm_predictions)

0.66129032258064513

#### We load a pre-trained google model using gensim. 

In [28]:
import gensim

In [79]:
model = gensim.models.KeyedVectors.load_word2vec_format('Downloads/GoogleNews-vectors-negative300.bin', binary=True) 

In [80]:
w2v = dict(zip(model.wv.index2word, model.wv.syn0))

In [234]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.values())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [235]:
vectorizer = MeanEmbeddingVectorizer(w2v)

In [258]:
X_train_vectors = vectorizer.transform(X_train)
X_val_vectors = vectorizer.transform(X_val)
X_test_vectors = vectorizer.transform(X_test)

In [295]:
from sklearn.svm import LinearSVC
svm = LinearSVC(penalty='l2', loss='squared_hinge', multi_class='ovr', max_iter=1000)
svm.fit(X_train_vectors, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

#### There are still some rows that contain abnormal vectors. 

In [296]:
df = pd.DataFrame()
df['x_Val'] = X_val_vectors
df['y_Val'] = y_val

In [297]:
def _is_300(x):
    if x.shape[0] == 300:
        return True
    else: 
        return False

In [298]:
df['is_300'] = df['x_Val'].apply(_is_300)
df = df[df['is_300'] == True]
Y_Val = np.array(list(df.y_Val))
X_Val = np.array(list(df.x_Val))
svm_predictions = svm.predict(X_Val)
from sklearn.metrics import accuracy_score
accuracy_score(Y_Val, svm_predictions)

0.33464849354375897