In [1]:
import numpy as np
import pandas as pd
import nltk
# Fetch every NLTK resource this notebook relies on; packages that are already
# present are simply reported as up-to-date.
for dependency in (
    "brown",
    "names",
    "wordnet",
    "averaged_perceptron_tagger",
    "universal_tagset",
    "stopwords",
):
    nltk.download(dependency)
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
 # https://github.com/EFord36/normalise
import normalise
import contractions
import re
import string
import random

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yagne\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to

In [2]:
# Switches that decide which cleaning steps preprocess() applies.
args = {
    "lowercase": True,            # fold the text to lower case first
    "stopwords": False,           # drop English stopwords
    "normaliser": False,          # run the `normalise` package (AmE variety)
    "remove_break_lines": True,   # NOTE(review): not read by preprocess() in the visible code
    "expand_contractions": True,  # apply contractions.fix to each token
    "remove_punctuation": True,   # hyphens become spaces, other punctuation is deleted
    "remove_redditchars": True,   # strip "/r/" and "/u/" markers
    "remove_urls": True,          # delete http(s) links
    "lemmatize": False,           # WordNet lemmatisation
    "stem": False,                # Porter stemming
}
# Deletion table for str.translate: every ASCII punctuation character plus the
# two curly single quotes maps to None.
translate_table = str.maketrans("", "", string.punctuation + '‘’')
# Shared NLTK instances, created once and reused by preprocess() for every token.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


def preprocess(text, args=args, out=""):
    """Clean a raw post and return it as a string or as a list of tokens.

    The steps run in a fixed order. Removal of the subreddit name always
    happens; every other step is gated by a flag in ``args``: lower-casing,
    URL removal, contraction expansion, removal of reddit "/r/" and "/u/"
    markers, punctuation removal, stopword filtering, the ``normalise``
    package, lemmatisation and stemming.

    Parameters
    ----------
    text : str
        Raw text to clean.
    args : dict
        Option flags. Every key read below must be present; the default is
        the module-level ``args`` dict captured when the function is defined.
    out : str
        ``"tokens"`` returns the list of tokens; any other value returns the
        tokens joined by single spaces.

    Returns
    -------
    list of str or str
        The cleaned tokens, or the cleaned text as one string.
    """
    clean = text

    # Lowercase
    if args["lowercase"]:
        clean = clean.lower()

    # Remove the subreddit's own name. IGNORECASE keeps this working when
    # lower-casing is switched off (e.g. "r/Depression").
    clean = re.sub(r'\/?r\/depression', '', clean, flags=re.IGNORECASE)
    clean = re.sub(r'rdepression', '', clean, flags=re.IGNORECASE)

    # Remove URLs
    if args["remove_urls"]:
        clean = re.sub(r'https?:\/\/\S+', '', clean)

    clean = clean.split()

    # Contraction expansion (one whitespace-delimited token at a time)
    if args["expand_contractions"]:
        clean = [contractions.fix(w) for w in clean]

    # Remove reddit specific characters
    if args["remove_redditchars"]:
        clean = [re.sub(r'\/r\/|\/u\/', "", w) for w in clean]

    # Remove punctuation: hyphens split compound words, everything else in
    # translate_table is deleted.
    if args["remove_punctuation"]:
        clean = [w.replace("-", " ") for w in clean]
        clean = [w.translate(translate_table) for w in clean]

    # Re-tokenise before the word-level steps: contraction expansion and
    # hyphen replacement can leave several words inside one list entry
    # ("do not", "self harm"), which would otherwise never match a stopword
    # and would be lemmatised/stemmed as a phrase.
    clean = " ".join(clean).split()

    # Stopwords: build the lookup once per call rather than once per token.
    if args["stopwords"]:
        stop_words = set(stopwords.words("english"))
        clean = [w for w in clean if w not in stop_words]

    # Use normaliser package
    if args["normaliser"]:
        clean = normalise.normalise(clean, variety="AmE", verbose=False)

    # Lemmatizer
    if args["lemmatize"]:
        clean = [lemmatizer.lemmatize(w) for w in clean]

    # Stemmer
    if args["stem"]:
        clean = [stemmer.stem(w) for w in clean]

    # Final re-split: a later step may again have produced entries containing
    # spaces or empty strings; str.split() drops both.
    clean = " ".join(clean).split()

    # Output
    if out == "tokens":
        return clean
    return " ".join(clean)




In [3]:
# Load the posts and build a single text field per post from title + body.
data = pd.read_csv("data/all_dep_data.csv")
# Column-wise concatenation instead of a row-wise apply. Empty CSV fields are
# read as NaN, so missing titles/bodies are replaced with "" first; otherwise
# the string concatenation (and preprocess) would fail on a float NaN.
data["full_text"] = data["title"].fillna("") + " " + data["selftext"].fillna("")
data["clean_text"] = data["full_text"].map(preprocess)

In [4]:
from sklearn import linear_model, feature_extraction, model_selection, naive_bayes, metrics

In [5]:
# Hold out 10% of the posts for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    data["clean_text"],
    data["depressed"],
    test_size=0.10,
    random_state=42,
)
X_train.shape, X_test.shape

((31315,), (3480,))

In [6]:
# Binary (presence/absence) bag of 1- to 3-grams, keeping only terms that
# occur in at least `min_docfreq` documents.
min_docfreq = 10
ngram_range = (1, 3)
cv = feature_extraction.text.CountVectorizer(
    binary=True,
    min_df=min_docfreq,
    ngram_range=ngram_range,
)

In [7]:
# Echo the vectorizer so its configured settings appear in the cell output.
cv

CountVectorizer(binary=True, min_df=10, ngram_range=(1, 3))

In [8]:
# Learn the vocabulary from the training split only, then encode the test
# split with that same vocabulary (order matters: fit before transform).
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)


In [9]:
# Show the shape and number of stored entries of the sparse training matrix.
X_train_cv

<31315x108175 sparse matrix of type '<class 'numpy.int64'>'
	with 7202037 stored elements in Compressed Sparse Row format>

In [10]:
# Fit Bernoulli naive Bayes on the binary n-gram indicators and predict the
# held-out posts.
bnb = naive_bayes.BernoulliNB()
bnb.fit(X_train_cv, y_train)
y_pred = bnb.predict(X_test_cv)
# Per-term log probabilities for row 1 of feature_log_prob_ (the class at
# bnb.classes_[1]).
feature_probs = bnb.feature_log_prob_[1, :]
# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# and fall back only on versions that predate it; list() keeps the result a
# plain Python list either way.
if hasattr(cv, "get_feature_names_out"):
    feature_names = list(cv.get_feature_names_out())
else:
    feature_names = cv.get_feature_names()

In [11]:
# Per-class precision / recall / F1 for the predictions currently in y_pred.
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.58      0.50      0.54      1760
           1       0.55      0.62      0.59      1720

    accuracy                           0.56      3480
   macro avg       0.57      0.56      0.56      3480
weighted avg       0.57      0.56      0.56      3480



In [130]:
# Rank terms by how much more likely they are under class 1 than under class 0.
# The frame is built from a dict of columns so the log probabilities keep their
# float dtype (a list of rows followed by .T turns every column into `object`)
# and so a length mismatch between terms and probabilities raises instead of
# being silently padded with NaN.
feat_imp = pd.DataFrame(
    {
        "term": feature_names,
        "neg_proba_log": bnb.feature_log_prob_[0, :],
        "pos_proba_log": bnb.feature_log_prob_[1, :],
    }
)
feat_imp["log_diff"] = feat_imp["pos_proba_log"] - feat_imp["neg_proba_log"]
feat_imp.sort_values("log_diff", ascending=False).head(20)

Unnamed: 0,term,neg_proba_log,pos_proba_log,log_diff
22869,depression she,-9.6531,-6.95648,2.69662
32075,from the pain,-9.6531,-7.02547,2.62763
62901,nothing feels,-9.6531,-7.09958,2.55352
75339,school and do,-9.6531,-7.09958,2.55352
17214,but still did,-9.6531,-7.09958,2.55352
86072,the gut,-9.6531,-7.09958,2.55352
79468,some classes,-9.6531,-7.17963,2.47348
25453,dwelling,-9.6531,-7.17963,2.47348
88640,therapist told me,-9.6531,-7.17963,2.47348
82229,sum up,-9.6531,-7.17963,2.47348


In [114]:
# Second representation: raw term counts over unigrams and bigrams with
# min_df=1. NOTE: this rebinds cv, X_train_cv and X_test_cv.
min_docfreq = 1
ngram_range = (1, 2)
cv = feature_extraction.text.CountVectorizer(
    binary=False,
    min_df=min_docfreq,
    ngram_range=ngram_range,
)
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)

In [115]:
# Multinomial naive Bayes on the count features, scored on the held-out split.
mnb = naive_bayes.MultinomialNB()
mnb.fit(X_train_cv, y_train)
y_pred = mnb.predict(X_test_cv)
report = metrics.classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.58      0.47      0.52      1760
           1       0.54      0.64      0.59      1720

    accuracy                           0.56      3480
   macro avg       0.56      0.56      0.56      3480
weighted avg       0.56      0.56      0.55      3480



In [12]:
# Disabled: feature-importance table for the multinomial model.
# NOTE(review): before re-enabling, refresh `feature_names` from the vectorizer
# that `mnb` was fitted on. The value assigned alongside `bnb` appears to come
# from a differently configured CountVectorizer, so the term list and the
# probability rows would not line up — confirm against the execution order.
# feat_imp = pd.DataFrame(data=[feature_names,mnb.feature_log_prob_[0,:],mnb.feature_log_prob_[1,:]]).T
# feat_imp.columns = ["term", "neg_proba_log", "pos_proba_log"]
# feat_imp["log_diff"] = feat_imp["pos_proba_log"] - feat_imp["neg_proba_log"]
# # feat_imp.sort_values("log_diff", ascending=False)[:20]

In [139]:
# TF-IDF weighted 1- to 3-grams (L2-normalised rows), restricted to terms that
# occur in at least `min_docfreq` documents.
min_docfreq = 10
ngram_range = (1, 3)
tfidf = feature_extraction.text.TfidfVectorizer(
    norm='l2',
    min_df=min_docfreq,
    ngram_range=ngram_range,
)

# Fit on the training split only; the test split reuses the learned vocabulary and IDF weights.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [161]:
# L1-penalised logistic regression on the TF-IDF features.
# The saga solver shuffles the data, so a fixed random_state is needed for the
# coefficients and scores to be repeatable; 42 matches the train/test split seed.
lr = linear_model.LogisticRegression(
    C=1,
    penalty="l1",
    solver="saga",
    random_state=42,
)
lr.fit(X_train_tfidf, y_train)
y_pred = lr.predict(X_test_tfidf)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.57      0.53      0.55      1760
           1       0.55      0.59      0.57      1720

    accuracy                           0.56      3480
   macro avg       0.56      0.56      0.56      3480
weighted avg       0.56      0.56      0.56      3480



In [162]:
# Terms with the largest positive logistic-regression coefficients.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on versions that predate it.
if hasattr(tfidf, "get_feature_names_out"):
    tfidf_terms = list(tfidf.get_feature_names_out())
else:
    tfidf_terms = tfidf.get_feature_names()
# A dict of columns keeps `coef` as a float column (a list of rows followed by
# .T would make it `object`) and raises if the two lengths disagree.
feat_imp = pd.DataFrame({"term": tfidf_terms, "coef": lr.coef_[0]})
feat_imp.sort_values("coef", ascending=False).head(20)

Unnamed: 0,term,coef
22824,depression,12.2733
22783,depressed,4.58812
19390,christmas,3.26465
100790,way too,2.9667
77101,shit,2.89598
27377,everything,2.80194
27267,everyday,2.79177
46544,it does not,2.71952
56249,miss,2.65793
26535,escape,2.50336


In [164]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/