In [4]:
HOME_DIR = "/home_remote"
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
#import logisitic_regression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import textstat
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')



warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package punkt to
[nltk_data]     /home/thi.tra.my.nguyen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/thi.tra.my.nguyen/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)
positives['Date'] = pd.to_datetime(positives['Date'])
negatives['Date'] = pd.to_datetime(negatives['Date'])

### Feature selection

In [6]:
def get_ngrams(text, n):
  n_grams = ngrams(word_tokenize(text), n)
  return [ ' '.join(grams) for grams in n_grams]

def frequency_distribution(grams, word):
    ls = []
    for i in grams:
        count = 0
        for j in i:
            if j == word:
                count += 1
        ls.append(count)
    return ls

In [7]:
#count number of words in pos_tagging
def count_word_pos_tagging(pos_tagging, word):
    count = 0
    for i in pos_tagging:
        if i[1] == word:
            count += 1
    return count

In [8]:
#count number of words in pos_tagging but word in the list
def count_word_list(tokens, list_word):
    count = 0
    for i in tokens:
        if i in list_word:
            count += 1
    return count

In [9]:
def construct_liwc_input_crafted(df, label):

  df['text'] = df['Text']+ df['Title']
  df['Token'] = df['text'].apply(lambda x: word_tokenize(x))
  df['AVG_SEN'] = df['text'].apply(lambda x:  textstat.avg_sentence_length(x))
  df['AVG_PER_WORD'] = df['text'].apply(lambda x:  textstat.avg_letter_per_word(x))
  df['LWF'] = df['text'].apply(lambda x: textstat.linsear_write_formula(x))
  df['FRE'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))
  df['DCR'] = df['text'].apply(lambda x: textstat.dale_chall_readability_score(x))
  df['FOG'] = df['text'].apply(lambda x: textstat.gunning_fog(x))
  

  #count number of I, my depression, my anxiety, in the text
  bigrams = df['Text'].apply(lambda x: get_ngrams(x, 2))
  unigrams = df['Text'].apply(lambda x: get_ngrams(x, 1))
  unigrams_title = df['Title'].apply(lambda x: get_ngrams(x, 1))
  depression = frequency_distribution(bigrams, 'my depression')
  anxiety = frequency_distribution(bigrams, 'my anxiety')
  therapist = frequency_distribution(bigrams, 'my therapist')
  count_I = frequency_distribution(unigrams, 'I')
  count_I_title = frequency_distribution(unigrams_title, 'I')
  df['My_Therapist'] = therapist
  df['My_Depression'] = depression
  df['My_Anxiety'] = anxiety
  df['word_I'] = count_I
  df['word_I_title'] = count_I_title
  #count if unigrams contain "Zoloft", "Celexa", "Lexapro", "Paxil", "Pexeva", "Brisdelle", "Luvox"
  antidepression = ["Zoloft", "Celexa", "Lexapro", "Paxil", "Pexeva", "Brisdelle", "Luvox"]
  df['Antidepressants'] = df['Token'].apply(lambda x: count_word_list(x, antidepression))



  #return boolean if the text contains "I was diagnosed with depression" or "I was diagnosed with anxiety" or "I've been diagnosed with depression"
  df['Diagnosed_Depression'] = df['Text'].apply(lambda x: 1 if 'I was diagnosed with depression' in x or 'I was diagnosed with anxiety' in x or 'I\'ve been diagnosed with depression' in x else 0)

  #POS tagging to count number of possessive pronouns, personal pronouns, past tense verbs.
  temp = df['Token'].apply(lambda x: nltk.pos_tag(x))
  df['POS'] = [count_word_pos_tagging(i, 'PRP$') for i in temp]
  df['PRP'] = [count_word_pos_tagging(i, 'PRP') for i in temp]
  df['VBD'] = [count_word_pos_tagging(i, 'VBD') for i in temp]

  # calculate avergae length of word of title per user
  df['Length_Title'] = df['Title'].apply(lambda x: len(word_tokenize(x)))

  #get month of each writing
  df['Month'] = df['Date'].apply(lambda x: x.month)
  #get hour of each writing
  df['Hour'] = df['Date'].apply(lambda x: x.hour)


  result_df = df.groupby('TrainSubjectId').agg({'POS':'mean', 'PRP':'mean', 'VBD':'mean','Length_Title': 'mean', 'Month':'mean','Hour':'mean','LWF': 'mean', 'FRE': 'mean', 'DCR': 'mean', 'FOG': 'mean','AVG_SEN':'mean', 'AVG_PER_WORD': 'mean','My_Depression':'sum','My_Anxiety':'sum','My_Therapist':'sum','word_I':'mean','word_I_title':'mean','Diagnosed_Depression':'sum' ,'Antidepressants':'sum','Text':'count'}).reset_index()
  result_df["Label"] = label
 
  #join text per user
  joined_text_df = df.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
  result_df = result_df.merge(joined_text_df, on="TrainSubjectId")

  # number_of_writings_df = df.groupby('TrainSubjectId')['Text'].apply(lambda x: len(x)).reset_index()
  result_df.rename(columns={'Text': 'NumOfWritings'}, inplace=True)

  # #merge number of writings and result_df on trainSubjectId
  # result_df_final = result_df.merge(number_of_writings_df, on="TrainSubjectId")
  
  return result_df

In [11]:
positives_liwc = construct_liwc_input_crafted(positives, 1)

In [10]:
negatives_liwc = construct_liwc_input_crafted(negatives, 0)

In [12]:
#concatenate two dataframes
liwc_df = pd.concat([positives_liwc, negatives_liwc], ignore_index=True)

In [79]:
liwc_df.to_pickle(os.path.join(HOME_DIR, "liwc_df_full_crafted.pkl"))

In [80]:
liwc_df.columns

Index(['TrainSubjectId', 'POS', 'PRP', 'VBD', 'Length_Title', 'Month', 'Hour',
       'LWF', 'FRE', 'DCR', 'FOG', 'AVG_SEN', 'AVG_PER_WORD', 'My_Depression',
       'My_Anxiety', 'My_Therapist', 'word_I', 'word_I_title',
       'Diagnosed_Depression', 'Antidepressants', 'NumOfWritings', 'Label',
       'text'],
      dtype='object')

In [123]:
#extract features and handcrafted features
def get_features_crafted(df,output, type):
    hand_crafted = [
        'POS', 'PRP', 'VBD', 'Length_Title', 'Month', 'Hour',
       'LWF', 'FRE', 'DCR', 'FOG', 'AVG_SEN', 'AVG_PER_WORD', 'My_Depression',
       'My_Anxiety', 'My_Therapist', 'word_I', 'word_I_title',
       'Diagnosed_Depression', 'Antidepressants', 'NumOfWritings']
    
    relevant_features_name ={'liwc': ['i', 'friend', 'sad', 'family', 'feel', 'health',
       'sexual', 'anx', 'body', 'bio', 'ppron', 'filler', 'shehe', 'adverb',
       'swear', 'humans', 'excl', 'assent', 'discrep', 'you', 'pronoun',
       'negemo', 'past'],
                        'liwc_alike': ['Anxiety', 'I', 'Sadness', 'Affective Processes',
       'Sexuality', 'Family', 'Friends', 'Fillers', 'Health', 'Feeling',
       'Humans', 'Biological Processes', 'Time', 'Body', 'Negative Emotions',
       'Social Processes', 'Perceptual Processes', 'Insight',
       'Cognitive Processes', 'Motion', 'Positive Emotions', 'Tentative',
       'Ppronouns']}
    vector_df = pd.DataFrame(output, index=df.index)
    #vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)
    #vector_df_norm['Label'] = df['Label']
    #vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df= vector_df.fillna(0)
    #corr = vector_df_norm.corr()
    #corr_label = corr['Label'].sort_values(ascending=False)
    #relevant_features = corr_label[1:15]
    #relevant_features_name = relevant_features.index.values
    re = vector_df[relevant_features_name[type]]
    for i in hand_crafted:
        re[i] = df[i]
    X = re
    y = df['Label']
    return X, y

### Run through LIWC

In [15]:
# Load LIWC dictionary
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(word_tokenize(text)) for text in liwc_df['text']]

In [16]:
# Load LIWC-alike dictionary
%run /home_remote/master_thesis/model_evaluation/liwc_alike.py
liwc_alike_output = [main(text, result) for text in liwc_df['text']]

### Get features

In [124]:
X_alike, y_alike = get_features_crafted(liwc_df, liwc_alike_output, 'liwc_alike')

In [126]:
#standardize data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_alike_norm = scaler.fit_transform(X_alike)


In [131]:
joblib.dump(scaler, os.path.join(HOME_DIR, "scaler_alike.pkl"))

['/home_remote/scaler_alike.pkl']

In [135]:
model = LogisticRegression(C=64, penalty='l1', solver='liblinear', class_weight={0: 0.2, 1: 0.8})
model.fit(X_alike_norm, y_alike)
y_alike_pred = model.predict(X_alike_norm)
print(classification_report(y_alike, y_alike_pred))

              precision    recall  f1-score   support

           0       0.96      0.87      0.91       403
           1       0.57      0.84      0.68        83

    accuracy                           0.86       486
   macro avg       0.77      0.86      0.80       486
weighted avg       0.90      0.86      0.87       486



In [136]:
print(f1_score(y_alike, y_alike_pred))

0.6796116504854369


In [130]:
joblib.dump(model, os.path.join(HOME_DIR, "liwc_alike_full_crafted.pkl"))

['/home_remote/liwc_alike_full_crafted.pkl']

In [132]:
#for LIWC
X_liwc, y_liwc = get_features_crafted(liwc_df, input, 'liwc')

In [142]:
#standardize data
from sklearn.preprocessing import StandardScaler
scaler2 = StandardScaler()
X_liwc_norm = scaler2.fit_transform(X_liwc)


In [145]:
joblib.dump(scaler2, os.path.join(HOME_DIR, "scaler_liwc.pkl"))

['/home_remote/scaler_liwc.pkl']

In [143]:
mod= LogisticRegression(C=64, penalty='l1', solver='liblinear', class_weight={0: 0.2, 1: 0.8})
mod.fit(X_liwc_norm, y_liwc)
y_liwc_pred = mod.predict(X_liwc_norm)
print(classification_report(y_liwc, y_liwc_pred))
print(f1_score(y_liwc, y_liwc_pred))

              precision    recall  f1-score   support

           0       0.97      0.86      0.91       403
           1       0.56      0.86      0.68        83

    accuracy                           0.86       486
   macro avg       0.76      0.86      0.79       486
weighted avg       0.90      0.86      0.87       486

0.6761904761904762


In [144]:
joblib.dump(mod, os.path.join(HOME_DIR, "liwc_full_crafted.pkl"))

['/home_remote/liwc_full_crafted.pkl']

### Get 10 features of LIWC


In [13]:
def get_features_crafted_10(df,output, type):
    hand_crafted = [
        'POS', 'PRP', 'VBD', 'Length_Title', 'Month', 'Hour',
       'LWF', 'FRE', 'DCR', 'FOG', 'AVG_SEN', 'AVG_PER_WORD', 'My_Depression',
       'My_Anxiety', 'My_Therapist', 'word_I', 'word_I_title',
       'Diagnosed_Depression', 'Antidepressants', 'NumOfWritings']
    
    relevant_features_name ={'liwc': ['i', 'friend', 'sad','sexual', 'anx','ppron', 'discrep', 'pronoun','negemo', 'past'],
                        'liwc_alike': ['Anxiety', 'I', 'Sadness', 'Negative Emotions','Social Processes', 'Insight','Cognitive Processes', 'Motion', 'Positive Emotions','Ppronouns']}
    vector_df = pd.DataFrame(output, index=df.index)
    #vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)
    #vector_df_norm['Label'] = df['Label']
    #vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df= vector_df.fillna(0)
    #corr = vector_df_norm.corr()
    #corr_label = corr['Label'].sort_values(ascending=False)
    #relevant_features = corr_label[1:15]
    #relevant_features_name = relevant_features.index.values
    re = vector_df[relevant_features_name[type]]
    for i in hand_crafted:
        re[i] = df[i]
    X = re
    y = df['Label']
    return X, y

In [17]:
X_alike_10, y_alike_10 = get_features_crafted_10(liwc_df, liwc_alike_output, 'liwc_alike')

In [18]:
#standardize data
from sklearn.preprocessing import StandardScaler
scaler_alike_10= StandardScaler()
X_alike_norm_10 = scaler_alike_10.fit_transform(X_alike_10)

In [29]:
mod_alike_10 = LogisticRegression(C=64, penalty='l2', class_weight={0: 1/3, 1: 2/3})
mod_alike_10.fit(X_alike_norm_10, y_alike_10)
y_alike_pred_10 = mod_alike_10.predict(X_alike_norm_10)
print(classification_report(y_alike_10, y_alike_pred_10))
print(f1_score(y_alike_10, y_alike_pred_10))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       403
           1       0.71      0.66      0.69        83

    accuracy                           0.90       486
   macro avg       0.82      0.80      0.81       486
weighted avg       0.89      0.90      0.90       486

0.6875000000000001


In [33]:
joblib.dump(mod_alike_10, os.path.join(HOME_DIR, "liwc_alike_10_full_crafted.pkl"))

['/home_remote/liwc_alike_10_full_crafted.pkl']

In [31]:
joblib.dump(scaler_alike_10, os.path.join(HOME_DIR, "scaler_alike_10.pkl"))

['/home_remote/scaler_alike_10.pkl']

In [30]:
X_liwc_10, y_liwc_10 = get_features_crafted_10(liwc_df, input, 'liwc')
scaler_liwc_10 = StandardScaler()
X_liwc_norm_10 = scaler_liwc_10.fit_transform(X_liwc_10)
mod_liwc_10 = LogisticRegression(C=64, penalty='l2', class_weight={0: 1/3, 1: 2/3})
mod_liwc_10.fit(X_liwc_norm_10, y_liwc_10)
y_liwc_pred_10 = mod_liwc_10.predict(X_liwc_norm_10)
print(classification_report(y_liwc_10, y_liwc_pred_10))
print(f1_score(y_liwc_10, y_liwc_pred_10))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       403
           1       0.67      0.60      0.63        83

    accuracy                           0.88       486
   macro avg       0.79      0.77      0.78       486
weighted avg       0.88      0.88      0.88       486

0.6329113924050633


In [32]:
joblib.dump(scaler_liwc_10, os.path.join(HOME_DIR, "scaler_liwc_10.pkl"))
joblib.dump(mod_liwc_10, os.path.join(HOME_DIR, "liwc_10_full_crafted.pkl"))

['/home_remote/liwc_10_full_crafted.pkl']