In [1]:
#imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

# seaborn plot styles

sns.set_style("dark")
sns.set_palette("deep")
plt.rcParams["axes.labelsize"] = 15
plt.rcParams["axes.titlesize"] = 20
myblue = '#0b5394'

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score

#display all output form a cell not just the last (the options are 'all', 'none', 'last' and 'last_expr'.)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# reading in already cleaned data

data = pd.read_csv('../Data/reddit_data_clean.csv')
data.head(2)
data.shape
data.groupby('subreddit').count()

Unnamed: 0,subreddit,title,selftext,created_utc,author,num_comments,post_word_count,post_char_count,title_char_count,title_word_count
0,languagelearning,I would like to dedicate the rest of my 20s to...,(Age 24 from the USA) I've realized that the o...,1630453380,TheSweetOnion,25,298,1688,77,15
1,languagelearning,"Is it difficult because it's Korean, or is it ...",Thank you! Many of you answer my question. \n\...,1630450791,Altruistic-Ad-8788,2,36,249,95,16


(4036, 10)

Unnamed: 0_level_0,title,selftext,created_utc,author,num_comments,post_word_count,post_char_count,title_char_count,title_word_count
subreddit,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
languagelearning,2015,2015,2015,2015,2015,2015,2015,2015,2015
linguistics,2021,2021,2021,2021,2021,2021,2021,2021,2021


## Baseline model

This section involves building two simple baseline models by considering the most common words that occur in one of the subreddits but not the other.

In [4]:
#create feature and target dfs

X = data['title'] + ' ' + data['selftext']
# encoding languagelearning subreddit as 0 and linguistics as 1
y = data['subreddit'].replace(['languagelearning', 'linguistics'], [0,1]) 


#train-test-split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2, stratify=y)

#featurize text columns using count vectorizer
cv = CountVectorizer(stop_words='english', ngram_range=(1,2))
X_featurized_train = cv.fit_transform(X_train)
X_featurized_test = cv.transform(X_test)

X_featurized_train_df = pd.DataFrame(X_featurized_train.todense(), columns=cv.get_feature_names())
X_featurized_test_df = pd.DataFrame(X_featurized_test.todense(), columns=cv.get_feature_names())



In [8]:
# creating a dictionary of most frequent words in each subreddit

imp_words_dict = {0 : [] , 1 : []}
for subreddit in imp_words_dict.keys():    
    imp_words_dict[subreddit] = X_featurized_train_df[y_train.reset_index(drop = True) == subreddit].sum().sort_values(ascending=False).head(50).index.to_list()
    
#creating sets of distinctive words that occur in one subreddit but not the other

languagelearning_distinctive_words = set(imp_words_dict[0])-set(imp_words_dict[1])
linguistics_distinctive_words = set(imp_words_dict[1])-set(imp_words_dict[0])

In [9]:
imp_words_dict
languagelearning_distinctive_words
linguistics_distinctive_words

{0: ['language',
  'learning',
  'learn',
  'english',
  'like',
  'spanish',
  'know',
  'languages',
  'just',
  'time',
  've',
  'don',
  'words',
  'french',
  'want',
  'native',
  'really',
  'german',
  'speak',
  'think',
  'japanese',
  'https',
  'good',
  'help',
  'people',
  'use',
  'new',
  'amp',
  'level',
  'way',
  'speaking',
  'reading',
  'feel',
  'read',
  'lot',
  'understand',
  'years',
  'com',
  'start',
  'language learning',
  'grammar',
  'italian',
  'vocabulary',
  'word',
  'day',
  'need',
  'studying',
  'chinese',
  'study',
  'listening'],
 1: ['language',
  'english',
  'like',
  'languages',
  'words',
  'know',
  'word',
  'https',
  'just',
  'people',
  'linguistics',
  've',
  'amp',
  'does',
  'don',
  'question',
  'different',
  'use',
  'example',
  'think',
  'sound',
  'say',
  'used',
  'post',
  'native',
  'way',
  'spanish',
  'questions',
  'help',
  'speakers',
  'german',
  'really',
  'com',
  'did',
  'gt',
  'ask',
  'greek

{'chinese',
 'day',
 'feel',
 'good',
 'grammar',
 'italian',
 'japanese',
 'language learning',
 'learn',
 'learning',
 'level',
 'listening',
 'need',
 'reading',
 'speaking',
 'start',
 'study',
 'studying',
 'vocabulary',
 'years'}

{'accent',
 'ask',
 'did',
 'different',
 'does',
 'example',
 'greek',
 'gt',
 'linguistics',
 'looking',
 'make',
 'post',
 'question',
 'questions',
 'say',
 'sound',
 'sounds',
 'speakers',
 'used',
 'wikipedia'}

In [100]:
#model_1 accuracy
    
train_preds = X_featurized_train_df[imp_words_dict[0]].sum(axis = 1).map(
    lambda x : 0 if x>0 else 1)
test_preds = X_featurized_test_df[imp_words_dict[0]].sum(axis = 1).map(
    lambda x : 0 if x>0 else 1)

print(f'model1 training accuracy: {accuracy_score(y_train, train_preds)}')
print(f'model1 testing accuracy: {accuracy_score(y_test, test_preds)}')

#model_2 accuracy
train_preds = X_featurized_train_df[imp_words_dict[1]].sum(axis = 1).map(
    lambda x : 1 if x>0 else 0)
test_preds = X_featurized_test_df[imp_words_dict[1]].sum(axis = 1).map(
    lambda x : 1 if x>0 else 0)

print(f'model2 training accuracy: {accuracy_score(y_train, train_preds)}')
print(f'model2 testing accuracy: {accuracy_score(y_test, test_preds)}')

model1 training accuracy: 0.5156921043937892
model1 testing accuracy: 0.5222993062438057
model2 training accuracy: 0.4968615791212422
model2 testing accuracy: 0.4975222993062438


In [78]:
#using selftext to classify
#train-test-split
X = data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)


cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=5000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt',random_state = 42)


X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)
rf.fit(X_train, y_train)
rf.score(X_train,y_train)
rf.score(X_test, y_test)
pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False)


RandomForestClassifier(max_depth=5, max_features='sqrt')

0.8325074331020813

0.7849355797819624

learn              0.033665
time               0.030708
language           0.027105
learning           0.023817
linguistics        0.021934
                     ...   
headphones         0.000000
heads              0.000000
health             0.000000
hear difference    0.000000
ות                 0.000000
Length: 5000, dtype: float64

In [82]:
#random forest hyperparameter optimization
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('rf', RandomForestClassifier())    
])

pipe_params = {
    'cvec__max_features': [2000],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [0.9],
    'cvec__ngram_range': [(1,2)],
    'rf__n_estimators': [100, 200],
    'rf__max_depth': [4, 5],
    'rf__max_features': ['sqrt', .5]
}

gs = GridSearchCV(
    estimator = pipe,
    param_grid = pipe_params,
    cv = 5,
    n_jobs = -1,
    verbose = 1
)

X = data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y, )

gs.fit(X_train, y_train)
gs.score(X_train, y_train)
gs.score(X_test, y_test)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('rf', RandomForestClassifier())]),
             n_jobs=-1,
             param_grid={'cvec__max_df': [0.9], 'cvec__max_features': [2000],
                         'cvec__min_df': [2, 3], 'cvec__ngram_range': [(1, 2)],
                         'rf__max_depth': [4, 5],
                         'rf__max_features': ['sqrt', 0.5],
                         'rf__n_estimators': [100, 200]},
             verbose=1)

0.8377931945820944

0.7849355797819624

In [83]:
gs.best_params_

{'cvec__max_df': 0.9,
 'cvec__max_features': 2000,
 'cvec__min_df': 2,
 'cvec__ngram_range': (1, 2),
 'rf__max_depth': 5,
 'rf__max_features': 'sqrt',
 'rf__n_estimators': 200}

In [86]:
#using title to classify
#train-test-split
X = data['title']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=2000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt', random_state = 42)
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)
rf.fit(X_train, y_train)
rf.score(X_train,y_train)
rf.score(X_test, y_test)
pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False)


RandomForestClassifier(max_depth=5, max_features='sqrt')

0.7641228939544104

0.7185332011892963

learn                0.082261
learning             0.046202
spanish              0.042341
linguistics          0.039559
language learning    0.039186
                       ...   
game                 0.000000
gaelic               0.000000
funny                0.000000
friends              0.000000
yt                   0.000000
Length: 2000, dtype: float64

In [10]:
#using title + selftext to classify
#train-test-split
X = data['title']+ ' ' + data['selftext']
y = data['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

cv = CountVectorizer(stop_words='english', ngram_range=(1,2), max_features=5000, min_df=2, max_df=0.95)
rf = RandomForestClassifier(max_depth=5, max_features='sqrt', random_state = 42)
X_train_featurized = cv.fit_transform(X_train)
X_test_featurized = cv.transform(X_test)
rf.fit(X_train_featurized, y_train)
rf.score(X_train_featurized,y_train)
rf.score(X_test_featurized, y_test)
important_features = pd.Series(rf.feature_importances_, cv.get_feature_names()).sort_values(ascending=False).head(50)
important_features

RandomForestClassifier(max_depth=5, max_features='sqrt', random_state=42)

0.85596299966964

0.817641228939544

learning              0.059523
language              0.040630
learn                 0.035188
want                  0.030868
app                   0.023439
linguistics           0.021311
level                 0.020917
spanish               0.020638
examples              0.018316
tips                  0.016747
language learning     0.016394
vocabulary            0.014840
b1                    0.014791
time                  0.013894
day                   0.013357
months                0.013090
linguistic            0.011835
japanese              0.010314
week                  0.010287
want learn            0.010034
speak                 0.009602
learn language        0.009309
target language       0.009149
french                0.008641
anki                  0.008517
fluent                0.008473
trying                0.008378
good                  0.007937
pronounced            0.007476
watch                 0.007206
vowel                 0.007105
duolingo              0.006890
research

In [102]:
#baseline model:


cv = CountVectorizer(stop_words='english', ngram_range=(1,2))

#vectorizing text column and making a DataFrame with the vectors
vect = cv.fit_transform(data['title'] + ' ' + data['selftext'])
vect_df = pd.DataFrame(vect.todense(), columns=cv.get_feature_names())
  

In [103]:

imp_words_dict = {'languagelearning' : [] , 'linguistics' : []}
for subreddit in imp_words_dict.keys():    
    imp_words_dict[subreddit] = vect_df[data['subreddit'] == subreddit].sum().sort_values(ascending=False).head(50).index.to_list()
    
imp_words_dict  

{'languagelearning': ['language',
  'learning',
  'learn',
  'english',
  'like',
  'spanish',
  'languages',
  'know',
  'just',
  've',
  'time',
  'don',
  'words',
  'french',
  'want',
  'native',
  'german',
  'really',
  'speak',
  'think',
  'japanese',
  'people',
  'good',
  'help',
  'https',
  'use',
  'level',
  'new',
  'speaking',
  'way',
  'amp',
  'reading',
  'read',
  'feel',
  'lot',
  'understand',
  'years',
  'grammar',
  'language learning',
  'vocabulary',
  'com',
  'italian',
  'word',
  'need',
  'start',
  'day',
  'study',
  'trying',
  'listening',
  'chinese'],
 'linguistics': ['language',
  'english',
  'languages',
  'like',
  'words',
  'know',
  'word',
  'just',
  'linguistics',
  'people',
  'https',
  've',
  'amp',
  'does',
  'different',
  'question',
  'don',
  'example',
  'use',
  'think',
  'sound',
  'say',
  'used',
  'post',
  'questions',
  'way',
  'spanish',
  'help',
  'really',
  'native',
  'ask',
  'did',
  'want',
  'german',
  

In [104]:
languagelearning_features = set(imp_words_dict['languagelearning'])-set(imp_words_dict['linguistics'])
linguistics_features = set(imp_words_dict['linguistics'])-set(imp_words_dict['languagelearning'])

#naive model


data['set_of_words'] = (data['title']+ ' ' + data['selftext']).str.split(' ')
data['model_1'] = data['set_of_words'].map(lambda x: 'languagelearning' if languagelearning_features.intersection(x) != set() else 'linguistics')
data['model_2'] = data['set_of_words'].map(lambda x: 'linguistics' if linguistics_features.intersection(x) != set() else 'languagelearning')

In [105]:
data[['subreddit', 'model_1', 'model_2']]
#model1 accuracy

for x in ['model_1', 'model_2']:
    print(x)
    (data[x] == data['subreddit']).value_counts(normalize=True)

Unnamed: 0,subreddit,model_1,model_2
0,languagelearning,languagelearning,linguistics
1,languagelearning,linguistics,languagelearning
2,languagelearning,languagelearning,languagelearning
3,languagelearning,languagelearning,linguistics
4,languagelearning,languagelearning,languagelearning
...,...,...,...
4031,linguistics,linguistics,linguistics
4032,linguistics,linguistics,linguistics
4033,linguistics,linguistics,languagelearning
4034,linguistics,linguistics,linguistics


model_1


True     0.705897
False    0.294103
dtype: float64

model_2


True     0.579286
False    0.420714
dtype: float64

In [174]:
#curve of accuracy with n-grams: