# Analysis of outwardly depressive mood on social media

Use of Natural Language Processing on posts made on Twitter and Reddit to predict depressive thoughts.

---

This notebook is written to be run either locally or on Google Colab.

## Setup for local run

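- Download the repository's root folder as is (the code expects an `input/` folder of archives next to this notebook).
- Install the required packages (uncomment the lines in the cell below).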

In [None]:
# ! pip install pandas
# ! pip install numpy
# ! pip install nltk
# (pickle is part of the Python standard library; it needs no pip install)
# ! pip install keras
# ! pip install tqdm
# ! pip install dask
# ! pip install seaborn
# ! pip install wordcloud

## Setup for Google Colab

- Download this notebook and upload it to Google Colab.
- Download the zip files (within /input) and upload them into the root directory of your Google Drive.

*You may download the [Sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140) dataset directly from the source and use it in place of the provided copy (within /input). No edits were made to the data.*

In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import random

from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

from re import sub
from time import time

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from dask.diagnostics import ProgressBar
import dask.dataframe as dd
ProgressBar().register()

In [None]:
def runLocally():
    """Prepare a local run and return the directory that holds the data.

    Every entry of ``./input`` is unpacked into ``./build`` unless ``./build``
    already exists. Afterwards the number of GPUs visible to TensorFlow and
    its CUDA build status are printed.

    Returns:
        str: the extraction directory, ``'./build'``.
    """
    import os
    import shutil
    import tensorflow as tf

    print("Running locally...")

    path = './build'

    if os.path.isdir(path):
        print(f"{path} directory already exists. Skipping extracting of zip files.")
    else:
        for archiveName in os.listdir('./input'):
            shutil.unpack_archive(f'./input/{archiveName}', path)
            print(f"Extracted {archiveName} into '{path}' directory")

    gpuCount = len(tf.config.list_physical_devices('GPU'))

    if gpuCount <= 0:
        print("No GPUs detected on local device.")
    else:
        print(f"{gpuCount} GPUs detected.")

        # Exactly one of the two CUDA messages is printed.
        if tf.test.is_built_with_cuda():
            print("Tensorflow has CUDA support.")
        else:
            print("Tensorflow doesn't have CUDA support.")

    return path

def runOnColab():
    """Mount Google Drive, unzip the two dataset archives and return '/content'.

    Both archives are read from the root of the mounted drive ("MyDrive").
    """

    from google.colab import drive
    
    print("Running on Google Colab")
    
    drive.mount('/content/drive')

    # IPython shell escapes: these two lines only run inside a notebook kernel.
    !unzip "/content/drive/MyDrive/training.1600000.processed.noemoticon.csv.zip"
    !unzip "/content/drive/MyDrive/scrapped_posts.zip"

    return '/content'
    
# Local run by default; call runOnColab() here instead when running on Google Colab.
directory = runLocally()

# Loading the pre-trained model

In [None]:
def loadModel(pathToModel, pathToPKL):
    """Load the saved model and its pickled tokeniser.

    Args:
        pathToModel: path handed to ``load_model``.
        pathToPKL: path of the pickle file containing the tokeniser.

    Returns:
        tuple: ``(model, tokeniser)``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(pathToPKL, 'rb') as pickleFile:
        loadedTokeniser = pickle.load(pickleFile)

    loadedModel = load_model(pathToModel)
    return loadedModel, loadedTokeniser

# Module-level model and tokeniser; predict() reads both as globals.
model, tokeniser = loadModel("./model.h5", "./tokenizer.pkl")

# General-purpose functions

In [None]:
def removeInvalidRedditPost(df):
    """Drop Reddit posts that have no usable body or author.

    A row is kept only when ``Body`` is not null and neither ``Author`` nor
    ``Body`` equals one of the placeholders ``"[removed]"`` / ``"[deleted]"``.

    Args:
        df: frame with at least the columns ``Author`` and ``Body``.

    Returns:
        The filtered frame; the original index labels are preserved.
    """
    placeholders = ["[removed]", "[deleted]"]

    hasBody = df["Body"].notna()
    authorIsReal = ~df["Author"].isin(placeholders)
    bodyIsReal = ~df["Body"].isin(placeholders)

    return df[hasBody & authorIsReal & bodyIsReal]

def standardiseRedditDF(dff, sentimentValue=None):
    """Bring a scraped Reddit frame into the notebook's common column layout.

    Invalid posts are removed, ``Author``/``Post_iD``/``Publish_date``/``Body``
    are renamed to ``user_id``/``id``/``date``/``text``, the title is prepended
    to the text (separated by a space) and the columns ``Score``,
    ``Total_no_of_comments``, ``Link``, ``Subreddit`` and ``Title`` are dropped.

    Args:
        dff: raw Reddit frame containing the columns named above.
        sentimentValue: when not ``None``, stored in a new ``sentiment``
            column for every row.

    Returns:
        A new standardised frame; the frame passed in is not modified.
    """
    # rename() without inplace returns a fresh frame, so the assignments below
    # do not write into a filtered slice (avoids SettingWithCopyWarning).
    dff = removeInvalidRedditPost(dff).rename(
        columns={'Author': 'user_id', 'Post_iD': 'id', 'Publish_date': 'date', 'Body': 'text'}
    )
    dff['text'] = dff['Title'].str.cat(dff['text'], sep=" ")
    dff = dff.drop(columns=['Score', 'Total_no_of_comments', 'Link', 'Subreddit', 'Title'])

    # Identity test: "!= None" would compare element-wise for array-like values.
    if sentimentValue is not None:
        dff['sentiment'] = sentimentValue

    return dff

# Stop-word sets keyed by language, filled on first use.
_STOP_WORD_CACHE = {}


def _stopWordSet(language="english"):
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def preprocess(text):
    """Normalise one piece of text before tokenisation.

    Steps, in order: convert to ``str`` and lower-case; replace URLs with
    ``URL`` and ``@mentions`` with ``USER``; replace every non-alphanumeric
    character with a space; drop English stop words.

    Args:
        text: any value; it is converted with ``str()`` first.

    Returns:
        str: the remaining tokens joined by single spaces.
    """
    text = str(text).lower()

    # Raw strings throughout: "\s" in a normal string literal is an invalid
    # escape sequence and raises a SyntaxWarning on recent Python versions.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = r"@[^\s]+"
    alphaPattern      = r"[^a-zA-Z0-9]"

    text = sub(urlPattern, ' URL', text).strip()
    text = sub(userPattern, ' USER', text).strip()
    text = sub(alphaPattern, ' ', text).strip()

    # Cached set lookup instead of reloading and scanning the list per call.
    stopWords = _stopWordSet()
    return " ".join(token for token in text.split() if token not in stopWords)

def parellelPreProcess(df):
    """Apply ``preprocess`` to the ``text`` column through a 4-partition dask frame.

    Args:
        df: pandas frame with a ``text`` column.

    Returns:
        The computed result of the dask frame with ``text`` replaced by its
        preprocessed form.
    """
    partitioned = dd.from_pandas(df, npartitions=4)
    cleanedText = partitioned["text"].map(preprocess, meta=('result', str))
    partitioned["text"] = cleanedText
    return partitioned.compute()

def sentimentFromScore(score):
    """Translate a numeric score into a sentiment label.

    Args:
        score: anything ``float()`` accepts.

    Returns:
        str: ``'Negative'`` for scores <= 0.35, ``'Positive'`` for scores
        >= 0.65 and ``'Neutral'`` otherwise.
    """
    value = float(score)

    if value <= 0.35:
        return 'Negative'
    if value >= 0.65:
        return 'Positive'
    return 'Neutral'

def predict(text, wantsTime=False):
    """Score a single text with the global ``model`` and ``tokeniser``.

    Args:
        text: the text to score; converted with ``str()`` first.
        wantsTime: when true, the elapsed wall-clock time in seconds is added
            to the result under ``"elapsedTime"``.

    Returns:
        dict: ``"label"`` (see ``sentimentFromScore``) and ``"score"`` (the
        raw value returned by ``model.predict``), plus ``"elapsedTime"`` when
        requested.
    """
    if wantsTime:
        start_at = time()

    text = str(text)
    # maxlen=300 is presumably the sequence length the model was trained
    # with - TODO confirm against the training notebook.
    sequences = pad_sequences(tokeniser.texts_to_sequences([text]), maxlen=300)

    # verbose=0 stops Keras printing a progress bar for every single call,
    # which floods the notebook when thousands of rows are scored.
    score = model.predict(sequences, verbose=0)

    # The prediction is an array; float() on an array with ndim > 0 is
    # deprecated in NumPy >= 1.25, so unwrap the single element explicitly.
    result = {"label": sentimentFromScore(score.item()),
              "score": score}

    if wantsTime:
        result["elapsedTime"] = time() - start_at

    return result

# Analysis on uncategorised subreddits

## Preparing data 

In [None]:
def processSubreddit(file, count = 0):
    """Load a scraped subreddit CSV, clean it and score every post.

    Args:
        file: CSV file name inside the global ``directory``.
        count: when greater than 0, only the first ``count`` valid posts are
            processed; otherwise all of them.

    Returns:
        The standardised frame with preprocessed ``text`` and two extra
        columns: ``sentimentScore`` (float) and ``sentiment`` (label).
    """
    posts = pd.read_csv(f"{directory}/{file}")
    posts = standardiseRedditDF(posts)

    # Truncate before the expensive text preprocessing: preprocessing works
    # row by row, so the selected rows are the same but no work is wasted.
    # head() already copes with count exceeding the number of rows.
    if count > 0:
        posts = posts.head(count)

    posts = parellelPreProcess(posts)

    def scoreOf(text):
        """Return the prediction score of ``text`` as a plain float."""
        rawScore = predict(text)['score']
        # predict() presumably returns a single-element array here; unwrap
        # with .item() because float() on such an array is deprecated.
        if hasattr(rawScore, 'item'):
            rawScore = rawScore.item()
        return float(rawScore)

    ddf = dd.from_pandas(posts, npartitions=4)
    ddf["sentimentScore"] = ddf["text"].map(scoreOf, meta=('result', float))
    ddf["sentiment"] = ddf["sentimentScore"].map(sentimentFromScore, meta=('result', str))

    return ddf.compute()

# Score at most 4000 cleaned posts from teenagers.csv and preview the result.
teenagersDF = processSubreddit('teenagers.csv', 4000)
teenagersDF.head()

## Analysis

In [None]:
def frequentPosterDF(df, minPosts=3):
    """Keep only the rows of users who posted at least ``minPosts`` times.

    Args:
        df: frame with a ``user_id`` column.
        minPosts: minimum number of rows a user needs in ``df`` to be kept
            (default 3).

    Returns:
        The rows of ``df`` belonging to those users, in their original order.
    """
    def postsOften(userPosts):
        return len(userPosts) >= minPosts

    return df.groupby('user_id').filter(postsOften)

def changeOfSentimentOverTime(df, subset = 0):
    """Plot score-over-time lines for users whose sentiment shifted markedly.

    Only users with enough posts (see ``frequentPosterDF``) are considered.
    For each of them the posts are ordered by ``date`` and the last score is
    compared with the first one. A rise of at least 0.4 is drawn in the upper
    panel ("Positive change"), a drop of at least 0.4 in the lower panel
    ("Negative change"); smaller changes are not drawn.

    Args:
        df: frame with ``user_id``, ``date`` and ``sentimentScore`` columns.
        subset: when between 1 and the number of frequent posters, only that
            many randomly chosen users are examined; otherwise all of them.
    """
    significantDiff = 0.4

    fig, axs = plt.subplots(2, figsize=(20, 8))
    axs[0].set_title("Positive change")
    axs[1].set_title("Negative change")

    df = frequentPosterDF(df)
    # Copy into a list so sampling never mutates the array owned by pandas.
    names = list(df.user_id.unique())

    if 0 < subset <= len(names):
        names = random.sample(names, subset)

    # Group once instead of re-filtering the whole frame for every user.
    postsByUser = df.groupby('user_id')

    for name in names:
        singleUserDF = postsByUser.get_group(name).sort_values("date")

        diff = singleUserDF['sentimentScore'].iat[-1] - singleUserDF['sentimentScore'].iat[0]

        if diff >= significantDiff:
            targetAx = axs[0]
        elif diff <= -significantDiff:
            targetAx = axs[1]
        else:
            continue

        # legend=False at plot time: hiding a legend before plotting has no
        # effect because pandas creates a fresh legend for every plot call.
        singleUserDF.plot(x='date', y='sentimentScore', kind='line', legend=False, ax=targetAx)

    for ax in axs:
        ax.set_ylabel("sentimentScore")

    # Lay out after drawing so tick labels and titles are taken into account.
    fig.tight_layout()
    plt.show()

# Examine every frequent poster in teenagersDF (subset left at its default of 0).
changeOfSentimentOverTime(teenagersDF)

In [None]:
def avgSentimentScoreOverPostingFreq(df, subset=0):
    """Plot each user's average sentiment score against their number of posts.

    Args:
        df: frame with ``user_id`` and ``sentimentScore`` columns.
        subset: when between 1 and the number of users, only that many
            randomly chosen users are plotted; otherwise all of them.
    """
    fig, ax = plt.subplots(1, figsize=(20, 8))
    ax.set_title("Avg sentimentScore against No. of Posts of user")

    # One grouped pass replaces a full-frame scan per user.
    scoresByUser = df.groupby('user_id')['sentimentScore']
    perUser = pd.DataFrame({
        'avgSentimentScore': scoresByUser.mean(),
        'postFreq': scoresByUser.size(),
    }).reset_index()

    if 0 < subset <= len(perUser):
        chosenRows = random.sample(range(len(perUser)), subset)
        perUser = perUser.iloc[chosenRows]

    # A scatter plot is what actually relates the two quantities; with
    # kind='hist' pandas ignores x and only bins the y column, which
    # contradicts the title. Scatter plots also carry no legend to hide.
    perUser.plot(x='postFreq', y='avgSentimentScore', kind='scatter', ax=ax)
    ax.set_xlabel("postFreq")
    ax.set_ylabel("avgSentimentScore")

    # Lay out after drawing so labels are taken into account.
    fig.tight_layout()
    plt.show()

# Run the average-score / posting-frequency plot over all users (subset left at 0).
avgSentimentScoreOverPostingFreq(teenagersDF)