In [13]:
import re
import string
import unicodedata
import pickle
from collections import Counter
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from scipy.sparse import hstack

import contractions

import nltk
from nltk.corpus import stopwords

import spacy

# Load the "en_core_web_sm" spaCy pipeline with its "parser" and "ner"
# components disabled.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS

from sklearn import (
    linear_model,
    feature_extraction,
    model_selection,
    naive_bayes,
    metrics,
    ensemble,
)

import pyLDAvis
import pyLDAvis.gensim_models

import hypopt


In [14]:
data = pd.read_csv("data/final_all_data.csv")

# Model input is the post title followed by its body. Vectorised string
# concatenation is used instead of a row-wise ``apply``; a missing title or
# body is treated as an empty string so a NaN cannot raise a TypeError here.
data["full_text"] = data["title"].fillna("") + " " + data["selftext"].fillna("")

# Subreddits removed from the data:
# - "randonaut_reports": strange subreddit that is very represented in the
#   data but has only 4.2k members.
# - the depression-related subreddits: keeping them would leak the label.
EXCLUDED_SUBREDDITS = [
    "randonaut_reports",
    "depression",
    "SuicideWatch",
    "depression_help",
    "depressed",
]
data = data.loc[~data["subreddit"].isin(EXCLUDED_SUBREDDITS)].reset_index(drop=True)

data.shape



(104061, 11)

In [16]:
# Number of distinct authors for each value of the "depression" label.
authors_per_label = data.groupby("depression")["author"].nunique()
authors_per_label

depression
0.0    14263
1.0    15353
Name: author, dtype: int64

In [17]:
# Share of distinct authors that belong to the depression == 1.0 group.
# Derived from ``data`` instead of from numbers copied out of an earlier
# cell's output, so the value cannot go stale when the dataset changes.
author_counts = data.groupby("depression")["author"].nunique()
float(author_counts.loc[1.0] / author_counts.sum())

0.5184022150189087

In [4]:
# Two-stage split with a fixed seed: 10% of all posts are held out as the test
# set, then 10% of the remaining posts are held out as the validation set.
X_rest, X_test, y_rest, y_test = model_selection.train_test_split(
    data["full_text"],
    data["depression"],
    test_size=0.10,
    random_state=42,
)
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X_rest,
    y_rest,
    test_size=0.10,
    random_state=42,
)

print("Train, Val, Test Sizes")
print(*(labels.shape for labels in (y_train, y_val, y_test)))


Train, Val, Test Sizes
(84288,) (9366,) (10407,)


In [5]:
# The pre-processing below is a modification of Gensim's simple_tokenize that
# keeps numbers and strips punctuation:
# https://github.com/RaRe-Technologies/gensim/blob/351456b4f7d597e5a4522e71acedf785b2128ca1/gensim/utils.py#L288
#
# Deletion table for ``str.translate``: every ASCII punctuation character plus
# the two curly single quotes maps to None (i.e. is removed).
translate_table = {ord(char): None for char in string.punctuation + "‘’"}


def to_unicode(text):
    """Return ``text`` as a ``str`` restricted to what survives a Latin-1/UTF-8 round trip.

    ``str`` input is used as-is; any other input is treated as bytes-like and
    decoded as UTF-8 first. The string is then encoded to Latin-1 (characters
    outside Latin-1 are dropped) and decoded back as UTF-8 (byte sequences that
    are not valid UTF-8 are dropped). In practice this removes emoji and most
    accented characters, e.g. ``"café"`` becomes ``"caf"``.

    Parameters
    ----------
    text : str or bytes-like
        Raw text to normalise.

    Returns
    -------
    str
        The filtered text.
    """
    if not isinstance(text, str):
        # "utf8" is the valid codec alias; the misspelling "ut8" raises
        # LookupError for every bytes input.
        text = str(text, "utf8")
    return text.encode("latin", "ignore").decode("utf-8", "ignore")

def preprocess(text):
    """Clean one raw post and return it as a list of tokens.

    Each cleaning step is enabled by a module-level flag (``lowercase``,
    ``deaccent``, ``remove_leaks``, ``remove_urls``, ``remove_reddit_chars``,
    ``expand_contractions``, ``remove_punctuation``, ``remove_numbers``) and
    the steps run in that order. The text is then split on whitespace and only
    tokens whose length lies in ``[min_len, max_len]`` are returned.
    """
    cleaned = to_unicode(text)

    if lowercase:
        cleaned = cleaned.lower()

    if deaccent:
        # Decompose, drop combining marks (category "Mn"), then recompose.
        decomposed = unicodedata.normalize("NFD", cleaned)
        without_marks = "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )
        cleaned = unicodedata.normalize("NFC", without_marks)

    if remove_leaks:
        # Mentions of the depression subreddit ("r/depression", "/r/depression", ...).
        cleaned = re.sub(r"\/?r\/?depression", "", cleaned)

    if remove_urls:
        cleaned = re.sub(r"https?:\/\/\S+", "", cleaned)

    if remove_reddit_chars:
        # Strip the "/r/" and "/u/" prefixes but keep the name that follows.
        cleaned = re.sub(r"\/r\/|\/u\/", "", cleaned)

    if expand_contractions:
        cleaned = contractions.fix(cleaned)

    if remove_punctuation:
        # Hyphens become spaces so hyphenated words split into separate tokens;
        # all remaining punctuation is deleted outright.
        cleaned = cleaned.replace("-", " ").translate(translate_table)

    tokens = cleaned.split()

    if remove_numbers:
        tokens = [tok for tok in tokens if not tok.isdigit()]

    return [tok for tok in tokens if min_len <= len(tok) <= max_len]


# Inclusive bounds on the length of a token kept by preprocess().
min_len, max_len = 1, 15

# Switches for the individual cleaning steps of preprocess().
lowercase = deaccent = True
remove_leaks = remove_urls = remove_reddit_chars = True
expand_contractions = remove_punctuation = True
remove_numbers = False


In [6]:
# Tokenise every post of each split; each result is a list of token lists.
X_train_clean = [preprocess(post) for post in X_train]
X_val_clean = [preprocess(post) for post in X_val]
X_test_clean = [preprocess(post) for post in X_test]


In [7]:
def _load_word_list(path):
    """Return the lines of the text file at ``path`` with newline characters removed."""
    with open(path, "r") as handle:
        return [line.replace("\n", "") for line in handle]


positives = _load_word_list("positive-words.txt")
negatives = _load_word_list("negative-words.txt")


In [8]:
# One row per training post: label, position in ``data`` and token list.
train_clean = pd.DataFrame(
    {
        "y": y_train,
        "real_index": X_train.index,
        "cleaned_text": X_train_clean,
    }
)
# Number of tokens in each cleaned post.
train_clean["post_length"] = train_clean["cleaned_text"].map(len)

def dummy(doc):
    """Return ``doc`` unchanged.

    Identity function handed to ``CountVectorizer`` as both ``tokenizer`` and
    ``preprocessor`` below, so the vectorizer receives each document as-is.
    """
    return doc

# Raw term counts (binary=False). ``dummy`` is passed for both the
# preprocessor and the tokenizer, so each document reaches the vectorizer
# exactly as stored in ``cleaned_text``.
wc_cv = feature_extraction.text.CountVectorizer(
    preprocessor=dummy,
    tokenizer=dummy,
    binary=False,
)

train_tokens = train_clean["cleaned_text"]
wc_cv.fit(train_tokens)  # fit() returns the vectorizer itself
train_wc = wc_cv.transform(train_tokens)




In [9]:
def cv_lookup(mat, cv, references):
    """Sum, for every document, the counts of the reference words known to ``cv``.

    Parameters
    ----------
    mat : sparse matrix or ndarray of shape (n_docs, n_terms)
        Document-term counts whose columns are indexed by ``cv.vocabulary_``.
    cv : object with a ``vocabulary_`` mapping of token -> column index
        The fitted vectorizer that produced ``mat``.
    references : iterable of str
        Words whose counts are added together. Words missing from the
        vocabulary are skipped; a word listed more than once is counted once
        per listing.

    Returns
    -------
    numpy.ndarray
        Float array of shape (n_docs, 1) holding the summed counts.
    """
    vocabulary = cv.vocabulary_

    # One weight per matrix column: how many times that column's word is listed
    # in ``references``. Only unknown words are skipped; any other error
    # (e.g. a matrix that does not match the vocabulary) now surfaces instead
    # of being silently swallowed.
    weights = np.zeros(mat.shape[1])
    for word in references:
        column = vocabulary.get(word)
        if column is not None:
            weights[column] += 1

    # A single matrix-vector product replaces one dense column extraction per word.
    return np.asarray(mat @ weights, dtype=float).reshape(-1, 1)
# Word-group counts per training post, keyed by the column they are stored in.
train_word_groups = {
    "first_person_singular": ["i","me","mine", "myself"],
    "first_person_plural": ["we","us","ours", "our", "ourselves"],
    "positive_words": positives,
    "negative_words": negatives,
}
for count_column, words in train_word_groups.items():
    train_clean[count_column] = cv_lookup(train_wc, wc_cv, words)

# Each count expressed as a fraction of the post's token count.
train_ratio_columns = {
    "fs_ratio": "first_person_singular",
    "fp_ratio": "first_person_plural",
    "pos_ratio": "positive_words",
    "neg_ratio": "negative_words",
}
for ratio_column, count_column in train_ratio_columns.items():
    train_clean[ratio_column] = train_clean[count_column] / train_clean["post_length"]

In [10]:
# Per-label mean of every numeric feature. The list-valued "cleaned_text"
# column is dropped first: pandas >= 2.0 no longer discards non-numeric
# columns silently and would raise a TypeError on it.
train_clean.drop(columns=["cleaned_text"]).groupby("y").mean()

Unnamed: 0_level_0,real_index,post_length,first_person_singular,first_person_plural,positive_words,negative_words,fs_ratio,fp_ratio,pos_ratio,neg_ratio
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,76074.578217,128.7122,7.016492,0.65595,4.17805,4.049828,0.052863,0.004766,0.033055,0.030092
1.0,23996.356488,133.078223,9.255127,0.576729,4.329331,5.024168,0.067874,0.003783,0.033023,0.036816


In [11]:
# Per-label standard deviation of every numeric feature. The list-valued
# "cleaned_text" column is dropped first: pandas >= 2.0 no longer discards
# non-numeric columns silently and would raise a TypeError on it.
train_clean.drop(columns=["cleaned_text"]).groupby("y").std()

Unnamed: 0_level_0,real_index,post_length,first_person_singular,first_person_plural,positive_words,negative_words,fs_ratio,fp_ratio,pos_ratio,neg_ratio
y,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0.0,16162.94302,87.032999,7.545316,1.688515,4.008631,4.541178,0.037651,0.011822,0.024961,0.02724
1.0,13842.784219,90.277316,8.509657,1.584321,4.064876,5.282905,0.037193,0.009978,0.023824,0.029765


In [12]:
# Relative difference, in percent, between the mean first-person-plural ratio
# of the y == 1.0 group and that of the y == 0.0 group. Computed from
# ``train_clean`` rather than from rounded numbers copied out of the table
# above, so it stays correct when the data or the features change.
fp_ratio_means = train_clean.groupby("y")["fp_ratio"].mean()
round(
    float(
        (fp_ratio_means.loc[1.0] - fp_ratio_means.loc[0.0])
        / fp_ratio_means.loc[0.0]
        * 100
    ),
    2,
)


-20.63

In [None]:
# Pronoun groups counted per post. The plural list includes "our" so that the
# validation and test features are computed with the same word list as the
# training features.
FIRST_PERSON_SINGULAR = ["i", "me", "mine", "myself"]
FIRST_PERSON_PLURAL = ["we", "us", "ours", "our", "ourselves"]


def _word_count_features(texts, tokens):
    """Build the word-count feature frame for one data split.

    Parameters
    ----------
    texts : pandas.Series
        Raw posts of the split; only its index is used (stored as ``real_index``).
    tokens : list of list of str
        Cleaned token lists, in the same order as ``texts``.

    Returns
    -------
    (pandas.DataFrame, matrix)
        The feature frame and the document-term counts produced by the
        already-fitted ``wc_cv`` vectorizer.
    """
    frame = pd.DataFrame(data={"real_index": texts.index, "cleaned_text": tokens})
    frame["post_length"] = frame["cleaned_text"].apply(len)

    # Transform only: the vocabulary was fitted on the training split.
    counts = wc_cv.transform(frame["cleaned_text"])

    frame["first_person_singular"] = cv_lookup(counts, wc_cv, FIRST_PERSON_SINGULAR)
    frame["first_person_plural"] = cv_lookup(counts, wc_cv, FIRST_PERSON_PLURAL)
    frame["positive_words"] = cv_lookup(counts, wc_cv, positives)
    frame["negative_words"] = cv_lookup(counts, wc_cv, negatives)

    # Each count expressed as a fraction of the post's token count.
    for ratio_column, count_column in (
        ("fs_ratio", "first_person_singular"),
        ("fp_ratio", "first_person_plural"),
        ("pos_ratio", "positive_words"),
        ("neg_ratio", "negative_words"),
    ):
        frame[ratio_column] = frame[count_column] / frame["post_length"]

    return frame, counts


val_clean, val_wc = _word_count_features(X_val, X_val_clean)
test_clean, test_wc = _word_count_features(X_test, X_test_clean)

In [None]:
# Stack the three splits (without the training label) and order the rows by
# their position in ``data``.
cleaned_df = (
    pd.concat([train_clean.drop(columns="y"), val_clean, test_clean])
    .sort_values("real_index")
    .reset_index(drop=True)
)
# Store each token list as a single space-separated string.
cleaned_df["cleaned_text"] = cleaned_df["cleaned_text"].map(" ".join)

In [None]:
# Columns of ``cleaned_df`` appended to the original data before export.
export_columns = [
    "cleaned_text",
    "first_person_singular",
    "first_person_plural",
    "positive_words",
    "negative_words",
    "fs_ratio",
    "fp_ratio",
    "pos_ratio",
    "neg_ratio",
]
# Column-wise concat aligns the two frames on their row index.
word_count_df = pd.concat([data, cleaned_df[export_columns]], axis=1)
word_count_df.to_csv("data/word_count.csv", index=False)