In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# seaborn plot styles

sns.set_style("dark")
sns.set_palette("deep")
plt.rcParams["axes.labelsize"] = 15
plt.rcParams["axes.titlesize"] = 20
myblue = '#0b5394'

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline


#display all output form a cell not just the last (the options are 'all', 'none', 'last' and 'last_expr'.)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
data = pd.read_csv('../Data/reddit_data_clean.csv')

In [7]:
data

Unnamed: 0,subreddit,title,selftext,created_utc,author,num_comments,post_word_count,post_char_count,title_char_count,title_word_count
0,languagelearning,I would like to dedicate the rest of my 20s to...,(Age 24 from the USA) I've realized that the o...,1630453380,TheSweetOnion,25,298,1688,77,15
1,languagelearning,"Is it difficult because it's Korean, or is it ...",Thank you! Many of you answer my question. \n\...,1630450791,Altruistic-Ad-8788,2,36,249,95,16
2,languagelearning,Trouble focusing on learning languages.,I’m currently trying to focus on learning Kore...,1630449384,blackholesthrowaway,4,100,567,39,5
3,languagelearning,Any beginner Mandarin speakers who want to cre...,Im searching for a few beginners that learn Ch...,1630447140,zodiacbearexplorer,7,37,188,66,11
4,languagelearning,help me choose a language to learn!!,"Hi everyone, im an Asian studying in Israel fo...",1630446066,ashleyduong,5,103,540,36,7
...,...,...,...,...,...,...,...,...,...,...
4031,linguistics,Discuss the ways in which power is maintained ...,"Not sure why my previous post was erased, very...",1618946058,Myaccountgotbanned12,1,11,64,113,21
4032,linguistics,"What's going on with ""indie folk voice"", lingu...",I'm curious about an accent affected by some i...,1618942568,texastential_sm,52,70,448,65,9
4033,linguistics,Narrow Transcription of Polish Vowels,What is the narrow transcription of the Polish...,1618940848,CES0803,9,76,501,37,5
4034,linguistics,Discuss the ways in which power is maintained ...,Yes it’s a broad question but I’m interested t...,1618935470,Myaccountgotbanned12,2,12,68,128,22


In [78]:
#using selftext to classify
#train-test-split
X = data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)


cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=5000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt',random_state = 42)


X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)
rf.fit(X_train, y_train)
rf.score(X_train,y_train)
rf.score(X_test, y_test)
pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False)


RandomForestClassifier(max_depth=5, max_features='sqrt')

0.8325074331020813

0.7849355797819624

learn              0.033665
time               0.030708
language           0.027105
learning           0.023817
linguistics        0.021934
                     ...   
headphones         0.000000
heads              0.000000
health             0.000000
hear difference    0.000000
ות                 0.000000
Length: 5000, dtype: float64

In [82]:
#random forest hyperparameter optimization
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())    
])

pipe_params = {
    'cvec__max_features': [2000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [0.9],
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [4, 5],
    'rf__max_features': ['sqrt', .5]
}

gs = GridSearchCV(
    estimator = pipe,
    param_grid = pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1
)

X = data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, )

gs.fit(X_train, y_train)
gs.score(X_train, y_train)
gs.score(X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cvec__max_df': [0.9], 'cvec__max_features': [2000],
                         'cvec__min_df': [2, 3], 'cvec__ngram_range': [(1, 2)],
                         'rf__max_depth': [4, 5],
                         'rf__max_features': ['sqrt', 0.5],
                         'rf__n_estimators': [100, 200]},
             verbose=1)

0.8377931945820944

0.7849355797819624

In [83]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 2000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'rf__max_depth': 5,
 'rf__max_features': 'sqrt',
 'rf__n_estimators': 200}

In [86]:
#using title to classify
#train-test-split
X = data['title']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=2000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt', random_state = 42)
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)
rf.fit(X_train, y_train)
rf.score(X_train,y_train)
rf.score(X_test, y_test)
pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False)


RandomForestClassifier(max_depth=5, max_features='sqrt')

0.7641228939544104

0.7185332011892963

learn                0.082261
learning             0.046202
spanish              0.042341
linguistics          0.039559
language learning    0.039186
                       ...   
game                 0.000000
gaelic               0.000000
funny                0.000000
friends              0.000000
yt                   0.000000
Length: 2000, dtype: float64

In [127]:
#using title + selftext to classify
#train-test-split
X = data['title']+ ' ' + data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=2000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt', random_state = 42)
X_train_featurized = cv.fit_transform(X_train)
X_test_featurized = cv.transform(X_test)
rf.fit(X_train_featurized, y_train)
rf.score(X_train_featurized,y_train)
rf.score(X_test_featurized, y_test)
important_features = pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False).head(50)
important_features

RandomForestClassifier(max_depth=5, max_features='sqrt', random_state=42)

0.8490254377271226

0.8077304261645193

learn                0.079852
learning             0.071937
language             0.051846
app                  0.030625
linguistics          0.026370
spanish              0.024545
listening            0.022876
want learn           0.017427
practice             0.016913
language learning    0.016742
learn language       0.015807
good                 0.015308
vocabulary           0.014392
vowel                0.014167
want                 0.012612
japanese             0.012447
duolingo             0.011758
day                  0.011149
target               0.010575
anki                 0.010500
time                 0.010342
level                0.009990
fluent               0.009746
learning language    0.008574
example              0.008055
start                0.008023
studying             0.007878
b1                   0.007728
examples             0.007192
german               0.007016
feel                 0.006946
target language      0.006915
tips                 0.006832
fluency   

In [134]:
#baseline model:


cv = CountVectorizer(stop_words='english', ngram_range=(1,2))

#vectorizing text column and making a DataFrame with the vectors
vect = cv.fit_transform(data['title']+data['selftext'])
vect_df = pd.DataFrame(vect.todense(), columns=cv.get_feature_names())
  

In [171]:

imp_words_dict = {'languagelearning' : [] , 'linguistics' : []}
for subreddit in imp_words_dict.keys():    
    imp_words_dict[subreddit] = vect_df[data['subreddit'] == subreddit].sum().sort_values(ascending=False).head(50).index.to_list()
    
imp_words_dict  

{'languagelearning': ['language',
  'learning',
  'learn',
  'english',
  'like',
  'spanish',
  'know',
  'just',
  'languages',
  've',
  'time',
  'don',
  'words',
  'french',
  'want',
  'native',
  'german',
  'really',
  'speak',
  'think',
  'people',
  'japanese',
  'good',
  'https',
  'help',
  'use',
  'new',
  'level',
  'speaking',
  'way',
  'amp',
  'reading',
  'read',
  'feel',
  'lot',
  'understand',
  'years',
  'grammar',
  'language learning',
  'vocabulary',
  'com',
  'italian',
  'word',
  'need',
  'start',
  'day',
  'study',
  'trying',
  'listening',
  'chinese'],
 'linguistics': ['language',
  'english',
  'languages',
  'like',
  'words',
  'know',
  'word',
  'just',
  'linguistics',
  'people',
  'https',
  've',
  'amp',
  'does',
  'different',
  'don',
  'question',
  'example',
  'use',
  'think',
  'sound',
  'say',
  'used',
  'post',
  'questions',
  'way',
  'spanish',
  'really',
  'native',
  'help',
  'did',
  'ask',
  'want',
  'german',
  

In [172]:
languagelearning_features = set(imp_words_dict['languagelearning'])-set(imp_words_dict['linguistics'])
linguistics_features = set(imp_words_dict['linguistics'])-set(imp_words_dict['languagelearning'])

#naive model


data['set_of_words'] = (data['title']+data['selftext']).str.split(' ')
data['model_1'] = data['set_of_words'].map(lambda x: 'languagelearning' if languagelearning_features.intersection(x) != set() else 'linguistics')
data['model_2'] = data['set_of_words'].map(lambda x: 'linguistics' if linguistics_features.intersection(x) != set() else 'languagelearning')

In [173]:
data[['subreddit', 'model_1', 'model_2']]
#model1 accuracy

for x in ['model_1', 'model_2']:
    print(x)
    (data[x] == data['subreddit']).value_counts(normalize=True)

Unnamed: 0,subreddit,model_1,model_2
0,languagelearning,languagelearning,linguistics
1,languagelearning,linguistics,languagelearning
2,languagelearning,languagelearning,languagelearning
3,languagelearning,languagelearning,linguistics
4,languagelearning,languagelearning,languagelearning
...,...,...,...
4031,linguistics,linguistics,linguistics
4032,linguistics,linguistics,linguistics
4033,linguistics,linguistics,languagelearning
4034,linguistics,linguistics,linguistics


model_1


True     0.711348
False    0.288652
dtype: float64

model_2


True     0.571853
False    0.428147
dtype: float64

In [174]:
#curve of accuracy with n-grams: