In [1]:
import numpy as np
import pandas as pd
import nltk
for dependency in ("wordnet",  "stopwords", "omw-1.4"):
    nltk.download(dependency)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
 # https://github.com/EFord36/normalise
import contractions
import re
import string
import random
from sklearn import linear_model, feature_extraction, model_selection, naive_bayes, metrics, ensemble

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
args = {
    "lowercase": True,
    "stopwords": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": False,
    "stem": False,
    "remove_numbers": False
}
translate_table = dict((ord(char), None) for char in string.punctuation + '-‘’')   
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess(text, out=""):
    clean = text
    
    # Lowercase
    if args["lowercase"]:
        clean = clean.lower()

    # Remove /r/depression from the test
    clean = re.sub(r"\/?r\/?depression", "", clean)


    # Remove URLs
    if args["remove_urls"]:
        clean = re.sub(r'https?:\/\/\S+', '', clean)

    clean = clean.split()

    # Contraction Expansion
    if args["expand_contractions"]:
        clean = [contractions.fix(w) for w in clean]

    # Remove reddit specific characters
    if args["remove_redditchars"]:
        clean = [re.sub(r'\/r\/|\/u\/',"",w) for w in clean]

    # Remove punctuation
    if args["remove_punctuation"]:
        clean = [w.translate(translate_table) for w in clean]

    if args["remove_numbers"]:
        clean = [w for w in clean if not w.isdigit()]

    # Stopwords
    if args["stopwords"]:
        clean = [w for w in clean if w not in stopwords.words("english")]


    # Lemmatizer
    if args["lemmatize"]:
        clean = [lemmatizer.lemmatize(w) for w in clean]

    if args["stem"]:
        clean = [stemmer.stem(w) for w in clean]

    # Final clean to remove any empty strings
    clean = " ".join(clean).split()
    clean = [w for w in clean if w != ""]

    # Output
    if out=="tokens":
        return clean
    else:
        clean = " ".join(clean)
        return clean




In [3]:
data = pd.read_csv("data/final_all_data.csv")
data["full_text"] = data.apply(lambda x: x["title"] + " " + x["selftext"],axis=1)

In [4]:
print(data.shape)

(127579, 11)


In [6]:
min_docfreq = 1
ngram_range = (1,1)
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)

args = {
    "lowercase": True,
    "stopwords": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": False,
    "stem": False,
    "remove_numbers": True
}
data["clean_text"] = list(map(preprocess, data["full_text"]))
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

265018

In [7]:
ngram_range = (1,1)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

4973

In [8]:
ngram_range = (2,2)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

2493655

In [9]:
ngram_range = (2,2)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

10575

In [63]:
args = {
    "lowercase": True,
    "stopwords": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": True,
    "stem": False,
    "remove_numbers": True
}
data["clean_text"] = list(map(preprocess, data["full_text"]))

# ngram_range = (1,1)
# min_docfreq=1
# cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
# data_cv = cv.fit_transform(data["clean_text"])
# len(cv.vocabulary_)

In [11]:
ngram_range = (1,1)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

4596

In [12]:
ngram_range = (2,2)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

2344156

In [13]:
ngram_range = (2,2)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

10807

In [14]:
args = {
    "lowercase": True,
    "stopwords": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": False,
    "stem": True,
    "remove_numbers": True
}
data["clean_text"] = list(map(preprocess, data["full_text"]))

ngram_range = (1,1)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

232326

In [15]:
ngram_range = (1,1)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

3922

In [16]:
ngram_range = (2,2)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

2115434

In [17]:
ngram_range = (2,2)
min_docfreq=0.001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

11421

In [77]:
args = {
    "lowercase": True,
    "stopwords": False,
    "remove_break_lines": True,
    "expand_contractions": True,
    "remove_punctuation": True,
    "remove_redditchars": True,
    "remove_urls": True,
    "lemmatize": True,
    "stem": True,
    "remove_numbers": True
}
data["clean_text"] = list(map(preprocess, data["full_text"]))

ngram_range = (1,1)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

231985

In [35]:
ngram_range = (1,1)
min_docfreq=0.0001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

14730

In [20]:
ngram_range = (2,2)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

2113093

In [23]:
ngram_range = (2,2)
min_docfreq=0.0001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

99557

In [32]:
cv.vocabulary_

{'ha': 1582,
 'anyon': 200,
 'made': 2083,
 'detail': 966,
 'of': 2383,
 'her': 1647,
 'at': 268,
 'each': 1108,
 'level': 1997,
 'am': 159,
 'rel': 2820,
 'new': 2311,
 'player': 2575,
 'and': 176,
 'right': 2893,
 'now': 2354,
 'do': 1036,
 'not': 2347,
 'realli': 2782,
 'know': 1932,
 'my': 2264,
 'so': 3157,
 'never': 2310,
 'if': 1735,
 'can': 560,
 'the': 3461,
 'enemi': 1172,
 'ani': 183,
 'given': 1504,
 'time': 3508,
 'game': 1468,
 'could': 812,
 'learn': 1977,
 'that': 3459,
 'with': 3840,
 'experi': 1259,
 'but': 536,
 'wa': 3743,
 'wonder': 3848,
 'somebodi': 3174,
 'alreadi': 151,
 'it': 1863,
 'thank': 3457,
 'you': 3894,
 'self': 3008,
 'center': 599,
 'arm': 233,
 'hurt': 1722,
 'thi': 3475,
 'morn': 2236,
 'pray': 2626,
 'does': 1042,
 'god': 1512,
 'is': 1859,
 'good': 1518,
 'lost': 2060,
 'key': 1913,
 'then': 3467,
 'found': 1420,
 'them': 3464,
 'piss': 2556,
 'me': 2137,
 'off': 2384,
 'fuck': 1449,
 'much': 2253,
 'what': 3793,
 'about': 48,
 'peopl': 2512,
 'w

In [33]:
ngram_range = (1,1)
min_docfreq=1
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)
data_cv = cv.fit_transform(data["clean_text"])
len(cv.vocabulary_)

231987

In [34]:
cv.vocabulary_

{'ha': 117552,
 'anyon': 51974,
 'made': 140433,
 'detail': 88236,
 'spreadsheet': 194370,
 'of': 155342,
 'her': 120392,
 'matchup': 142406,
 'at': 54287,
 'each': 95306,
 'level': 136243,
 'am': 49927,
 'rel': 177793,
 'new': 151513,
 'evelynn': 100396,
 'player': 163784,
 'and': 50704,
 'right': 180089,
 'now': 153943,
 'do': 90678,
 'not': 153616,
 'realli': 176419,
 'know': 133050,
 'my': 149447,
 'damagematchup': 84483,
 'so': 191872,
 'never': 151482,
 'if': 124596,
 'can': 70057,
 '1v1': 8362,
 'the': 202527,
 'enemi': 98615,
 'jungler': 130790,
 'ani': 51009,
 'given': 113560,
 'time': 204236,
 'game': 111637,
 'could': 79609,
 'learn': 135489,
 'that': 202433,
 'with': 225230,
 'experi': 101331,
 'but': 67084,
 'wa': 221709,
 'wonder': 225595,
 'somebodi': 192486,
 'alreadi': 49687,
 'it': 128454,
 'thank': 202387,
 'you': 228522,
 'self': 186087,
 'center': 72556,
 'arm': 53199,
 'hurt': 123494,
 'thi': 203013,
 'morn': 147796,
 'pray': 166023,
 'does': 90850,
 'god': 114196

In [78]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(data["clean_text"], data["depression"], test_size=0.10, random_state=42)
X_train.shape, X_test.shape

((114821,), (12758,))

In [79]:
ngram_range = (1,2)
min_docfreq=0.0001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=True)

In [80]:
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


In [81]:
X_train_cv

<114821x112335 sparse matrix of type '<class 'numpy.int64'>'
	with 16496820 stored elements in Compressed Sparse Row format>

In [82]:
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train_cv,y_train)
y_pred = bnb.predict(X_test_cv)
feature_probs = bnb.feature_log_prob_[1,:]
feature_names = cv.get_feature_names_out()

In [83]:
print((metrics.classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

         0.0       0.81      0.48      0.60      7677
         1.0       0.51      0.83      0.63      5081

    accuracy                           0.62     12758
   macro avg       0.66      0.65      0.62     12758
weighted avg       0.69      0.62      0.61     12758



In [84]:
feat_imp = pd.DataFrame(data=[feature_names,bnb.feature_log_prob_[0,:],bnb.feature_log_prob_[1,:]]).T
feat_imp.columns = ["term", "neg_proba_log", "pos_proba_log"]
feat_imp["log_diff"] = feat_imp["pos_proba_log"] - feat_imp["neg_proba_log"]
feat_imp.sort_values("log_diff", ascending=False)[:20]

Unnamed: 0,term,neg_proba_log,pos_proba_log,log_diff
93795,the represent,-11.152214,-7.003221,4.148993
45339,ident bodi,-11.152214,-7.027913,4.124301
21437,consid enter,-11.152214,-7.053231,4.098983
6194,and essay,-11.152214,-7.053231,4.098983
59385,more entri,-11.152214,-7.053231,4.098983
110578,writer pleas,-11.152214,-7.053231,4.098983
110574,writer at,-11.152214,-7.053231,4.098983
75406,publish onlin,-11.152214,-7.053231,4.098983
75160,promot voic,-11.152214,-7.079207,4.073007
99445,to racial,-11.152214,-7.079207,4.073007


In [85]:
ngram_range = (1,2)
min_docfreq=0.0001
cv = feature_extraction.text.CountVectorizer(min_df=min_docfreq, ngram_range=ngram_range, binary=False)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [86]:
mnb = naive_bayes.MultinomialNB()
mnb.fit(X_train_cv,y_train)
y_pred = mnb.predict(X_test_cv)
print((metrics.classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

         0.0       0.93      0.38      0.54      7677
         1.0       0.51      0.96      0.66      5081

    accuracy                           0.61     12758
   macro avg       0.72      0.67      0.60     12758
weighted avg       0.76      0.61      0.59     12758



In [87]:
feat_imp = pd.DataFrame(data=[feature_names,mnb.feature_log_prob_[0,:],mnb.feature_log_prob_[1,:]]).T
feat_imp.columns = ["term", "neg_proba_log", "pos_proba_log"]
feat_imp["log_diff"] = feat_imp["pos_proba_log"] - feat_imp["neg_proba_log"]
feat_imp.sort_values("log_diff", ascending=False)[:20]

Unnamed: 0,term,neg_proba_log,pos_proba_log,log_diff
110578,writer pleas,-16.323268,-11.731541,4.591727
75406,publish onlin,-16.323268,-11.731541,4.591727
90466,ten honor,-16.323268,-11.744786,4.578482
101703,twelv entri,-16.323268,-11.744786,4.578482
21626,contest with,-16.323268,-11.744786,4.578482
100632,top twelv,-16.323268,-11.744786,4.578482
79136,robot right,-16.323268,-12.233633,4.089635
6194,and essay,-16.323268,-12.324605,3.998663
93795,the represent,-16.323268,-12.348702,3.974566
45339,ident bodi,-16.323268,-12.373395,3.949873


In [88]:
ngram_range = (1,2)
min_docfreq=0.0001
tfidf = feature_extraction.text.TfidfVectorizer(min_df=min_docfreq, ngram_range=ngram_range, norm='l2')

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [89]:
lr = linear_model.LogisticRegression(C=1, penalty="l1", solver="saga")
lr.fit(X_train_tfidf,y_train)
y_pred = lr.predict(X_test_tfidf)
print((metrics.classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

         0.0       0.71      0.75      0.73      7677
         1.0       0.59      0.54      0.56      5081

    accuracy                           0.67     12758
   macro avg       0.65      0.65      0.65     12758
weighted avg       0.66      0.67      0.67     12758





In [90]:
feat_imp = pd.DataFrame(data=[tfidf.get_feature_names(),lr.coef_[0]]).T
feat_imp.columns = ["term", "coef"]
feat_imp.sort_values("coef", ascending=False)[:20]



Unnamed: 0,term,coef
24476,depress,7.59011
110696,ww,5.456857
106442,wellbutrin,4.411308
13276,be yourself,4.341972
70041,other would,4.016499
24040,deathcor,3.781481
57739,memantin,3.705728
55434,lt3,3.565513
62907,new thing,3.545163
59873,motion,3.39585


In [93]:
feat_imp.sort_values("coef", ascending=True)[:30]

Unnamed: 0,term,coef
9141,appli or,-21.731757
35799,full report,-14.859914
56276,map full,-14.859914
82877,shot assist,-12.753912
101371,trip report,-12.345552
57460,meaning,-9.675627
59422,more here,-8.88653
74634,previou discuss,-8.653623
74270,power zscore,-7.284777
112280,zscore,-7.284777


In [107]:
rf = ensemble.RandomForestClassifier(n_estimators=300, max_depth=int(np.log2(X_train.shape[0])*2), max_features="sqrt", n_jobs=-1)
rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)
print((metrics.classification_report(y_test, y_pred)))

              precision    recall  f1-score   support

         0.0       0.67      0.86      0.75      7677
         1.0       0.62      0.36      0.45      5081

    accuracy                           0.66     12758
   macro avg       0.64      0.61      0.60     12758
weighted avg       0.65      0.66      0.63     12758



In [111]:
feat_imp = pd.DataFrame(data={"feature": tfidf.get_feature_names_out(), "importance": rf.feature_importances_})
feat_imp.sort_values("importance", ascending=False)[:20]

Unnamed: 0,feature,importance
104930,wa wow,0.013205
56269,map,0.012738
88051,strang,0.01229
10348,artifact collect,0.012257
102293,unit state,0.011955
38095,googl earth,0.010405
101343,trip,0.010193
76458,rate,0.00999
78237,report,0.00974
38100,googl map,0.009342


In [164]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [11]:
data

Unnamed: 0,author,selftext,created_utc,subreddit,subreddit_subscribers,title,date,year-month,post_length,depression,full_text
0,--PunPun--,I'm a relatively new Evelynn player and right ...,1587305577,EvelynnMains,14681,Has anyone made a detailed spreadsheet of her ...,2020-04-19 10:12:57,2020-04,51,1.0,Has anyone made a detailed spreadsheet of her ...
1,--jyushimatsudesu,"""my arm hurt this morning and i prayed and now...",1552057727,exchristian,48153,so self centered,2019-03-08 10:08:47,2019-03,88,1.0,"so self centered ""my arm hurt this morning and..."
2,--jyushimatsudesu,"(warning : very sappy, i'm sorry, she's just g...",1552258247,teenagers,1226850,letter to my crush ♥,2019-03-10 18:50:47,2019-03,186,1.0,"letter to my crush ♥ (warning : very sappy, i'..."
3,--jyushimatsudesu,"being watched, 24/7?? like, imagine if someone...",1553425235,exchristian,48201,do Christians actually think about this?,2019-03-24 07:00:35,2019-03,117,1.0,do Christians actually think about this? being...
4,--jyushimatsudesu,i don't mean what happens when you feel you're...,1554040237,TooAfraidToAsk,578671,who else suddenly jolts awake from a light sle...,2019-03-31 09:50:37,2019-03,51,1.0,who else suddenly jolts awake from a light sle...
...,...,...,...,...,...,...,...,...,...,...,...
127574,zzspecialty,"I'm a 5' 11 23 year old, average weight and se...",1599439665,dirtyr4r,542148,23 [M4F]- Florida- Casual flirting/open relati...,2020-09-06 20:47:45,2020-09,89,0.0,23 [M4F]- Florida- Casual flirting/open relati...
127575,zzspecialty,So let's skip the shaming of cheating. We all ...,1600748511,sex,1882945,The urge to have sexual relations with another...,2020-09-22 00:21:51,2020-09,168,0.0,The urge to have sexual relations with another...
127576,zzzgfr,Early this morning I woke up and hallucinated/...,1564619771,Sleepparalysis,16800,Is it possible to be only partially paralyzed?,2019-07-31 20:36:11,2019-08,79,0.0,Is it possible to be only partially paralyzed?...
127577,zzzgfr,I attempted to force myself into sp the other ...,1565838037,Sleepparalysis,16688,Should my body tingle before entering paralysis,2019-08-14 23:00:37,2019-08,79,0.0,Should my body tingle before entering paralysi...
