In [2]:
HOME_DIR = "/home_remote"
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV
#import logisitic_regression
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
import textstat
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')



warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None

In [40]:
positives_df_path = os.path.join(HOME_DIR, "positive_df.pkl")
negatives_df_path = os.path.join(HOME_DIR, "negative_df.pkl")

positives = pd.read_pickle(positives_df_path)
negatives = pd.read_pickle(negatives_df_path)
positives['Date'] = pd.to_datetime(positives['Date'])
negatives['Date'] = pd.to_datetime(negatives['Date'])

### Feature selection

In [67]:
def get_ngrams(text, n):
  n_grams = ngrams(word_tokenize(text), n)
  return [ ' '.join(grams) for grams in n_grams]

def frequency_distribution(grams, word):
    ls = []
    for i in grams:
        count = 0
        for j in i:
            if j == word:
                count += 1
        ls.append(count)
    return ls

In [16]:
#count number of words in pos_tagging
def count_word_pos_tagging(pos_tagging, word):
    count = 0
    for i in pos_tagging:
        if i[1] == word:
            count += 1
    return count

In [57]:
#count number of words in pos_tagging but word in the list
def count_word_list(tokens, list_word):
    count = 0
    for i in tokens:
        if i in list_word:
            count += 1
    return count

In [68]:
def construct_liwc_input_crafted(df, label):

  df['text'] = df['Text']+ df['Title']
  df['Token'] = df['text'].apply(lambda x: word_tokenize(x))
  df['AVG_SEN'] = df['text'].apply(lambda x:  textstat.avg_sentence_length(x))
  df['AVG_PER_WORD'] = df['text'].apply(lambda x:  textstat.avg_letter_per_word(x))
  df['LWF'] = df['text'].apply(lambda x: textstat.linsear_write_formula(x))
  df['FRE'] = df['text'].apply(lambda x: textstat.flesch_reading_ease(x))
  df['DCR'] = df['text'].apply(lambda x: textstat.dale_chall_readability_score(x))
  df['FOG'] = df['text'].apply(lambda x: textstat.gunning_fog(x))
  

  #count number of I, my depression, my anxiety, in the text
  bigrams = df['Text'].apply(lambda x: get_ngrams(x, 2))
  unigrams = df['Text'].apply(lambda x: get_ngrams(x, 1))
  unigrams_title = df['Title'].apply(lambda x: get_ngrams(x, 1))
  depression = frequency_distribution(bigrams, 'my depression')
  anxiety = frequency_distribution(bigrams, 'my anxiety')
  therapist = frequency_distribution(bigrams, 'my therapist')
  count_I = frequency_distribution(unigrams, 'I')
  count_I_title = frequency_distribution(unigrams_title, 'I')
  df['My_Therapist'] = therapist
  df['My_Depression'] = depression
  df['My_Anxiety'] = anxiety
  df['word_I'] = count_I
  df['word_I_title'] = count_I_title
  #count if unigrams contain "Zoloft", "Celexa", "Lexapro", "Paxil", "Pexeva", "Brisdelle", "Luvox"
  antidepression = ["Zoloft", "Celexa", "Lexapro", "Paxil", "Pexeva", "Brisdelle", "Luvox"]
  df['Antidepressants'] = df['Token'].apply(lambda x: count_word_list(x, antidepression))



  #return boolean if the text contains "I was diagnosed with depression" or "I was diagnosed with anxiety" or "I've been diagnosed with depression"
  df['Diagnosed_Depression'] = df['Text'].apply(lambda x: 1 if 'I was diagnosed with depression' in x or 'I was diagnosed with anxiety' in x or 'I\'ve been diagnosed with depression' in x else 0)

  #POS tagging to count number of possessive pronouns, personal pronouns, past tense verbs.
  temp = df['Token'].apply(lambda x: nltk.pos_tag(x))
  df['POS'] = [count_word_pos_tagging(i, 'PRP$') for i in temp]
  df['PRP'] = [count_word_pos_tagging(i, 'PRP') for i in temp]
  df['VBD'] = [count_word_pos_tagging(i, 'VBD') for i in temp]

  # calculate avergae length of word of title per user
  df['Length_Title'] = df['Title'].apply(lambda x: len(word_tokenize(x)))

  #get month of each writing
  df['Month'] = df['Date'].apply(lambda x: x.month)
  #get hour of each writing
  df['Hour'] = df['Date'].apply(lambda x: x.hour)


  result_df = df.groupby('TrainSubjectId').agg({'POS':'mean', 'PRP':'mean', 'VBD':'mean','Length_Title': 'mean', 'Month':'mean','Hour':'mean','LWF': 'mean', 'FRE': 'mean', 'DCR': 'mean', 'FOG': 'mean','AVG_SEN':'mean', 'AVG_PER_WORD': 'mean','My_Depression':'sum','My_Anxiety':'sum','My_Therapist':'sum','word_I':'mean','word_I_title':'mean','Diagnosed_Depression':'sum' ,'Antidepressants':'sum','Text':'count'}).reset_index()
  result_df["Label"] = label
 
  #join text per user
  joined_text_df = df.groupby('TrainSubjectId')['text'].apply(' '.join).reset_index()
  result_df = result_df.merge(joined_text_df, on="TrainSubjectId")

  # number_of_writings_df = df.groupby('TrainSubjectId')['Text'].apply(lambda x: len(x)).reset_index()
  result_df.rename(columns={'Text': 'NumOfWritings'}, inplace=True)

  # #merge number of writings and result_df on trainSubjectId
  # result_df_final = result_df.merge(number_of_writings_df, on="TrainSubjectId")
  
  return result_df

In [69]:
positives_liwc = construct_liwc_input_crafted(positives, 1)

In [71]:
negatives_liwc = construct_liwc_input_crafted(negatives, 0)

In [78]:
#concatenate two dataframes
liwc_df = pd.concat([positives_liwc, negatives_liwc], ignore_index=True)

In [79]:
liwc_df.to_pickle(os.path.join(HOME_DIR, "liwc_df_full_crafted.pkl"))

In [80]:
liwc_df.columns

Index(['TrainSubjectId', 'POS', 'PRP', 'VBD', 'Length_Title', 'Month', 'Hour',
       'LWF', 'FRE', 'DCR', 'FOG', 'AVG_SEN', 'AVG_PER_WORD', 'My_Depression',
       'My_Anxiety', 'My_Therapist', 'word_I', 'word_I_title',
       'Diagnosed_Depression', 'Antidepressants', 'NumOfWritings', 'Label',
       'text'],
      dtype='object')

In [None]:
#extract features and handcrafted features
def get_features_crafted(df,output, type):
    hand_crafted = [
        'POS', 'PRP', 'VBD', 'Length_Title', 'Month', 'Hour',
       'LWF', 'FRE', 'DCR', 'FOG', 'AVG_SEN', 'AVG_PER_WORD', 'My_Depression',
       'My_Anxiety', 'My_Therapist', 'word_I', 'word_I_title',
       'Diagnosed_Depression', 'Antidepressants', 'NumOfWritings']
    
    relevant_features_name ={'liwc': ['i', 'friend', 'sad', 'family', 'feel', 'health',
       'sexual', 'anx', 'body', 'bio', 'ppron', 'filler', 'shehe', 'adverb',
       'swear', 'humans', 'excl', 'assent', 'discrep', 'you', 'pronoun',
       'negemo', 'past'],
                        'liwc_alike': ['Anxiety', 'I', 'Sadness', 'Affective Processes',
       'Sexuality', 'Family', 'Friends', 'Fillers', 'Health', 'Feeling',
       'Humans', 'Biological Processes', 'Time', 'Body', 'Negative Emotions',
       'Social Processes', 'Perceptual Processes', 'Insight',
       'Cognitive Processes', 'Motion', 'Positive Emotions', 'Tentative',
       'Ppronouns']}
    vector_df = pd.DataFrame(output, index=df.index)
    #vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)
    #vector_df_norm['Label'] = df['Label']
    #vector_df_norm['TrainSubjectId'] = df['TrainSubjectId']
    vector_df= vector_df.fillna(0)
    #corr = vector_df_norm.corr()
    #corr_label = corr['Label'].sort_values(ascending=False)
    #relevant_features = corr_label[1:15]
    #relevant_features_name = relevant_features.index.values
    re = vector_df[relevant_features_name[type]]
    for i in hand_crafted:
        re[i] = df[i]
    X = re
    y = df['Label']
    return X, y

In [88]:
def get_features_top10(df, output):
    #df['vector'] = output
    vector_df = pd.DataFrame(output, index=df.index)
    #vector_df_norm = vector_df.div(vector_df.sum(axis=1), axis=0)
    vector_df['Label'] = df['Label']
    vector_df['TrainSubjectId'] = df['TrainSubjectId']
    vector_df = vector_df.fillna(0)
    corr = vector_df.corr()
    corr_label = corr['Label'].sort_values(ascending=False)
    relevant_features = corr_label[1:11]
    relevant_features_name = relevant_features.index.values
    X = vector_df[relevant_features_name]
    #y = vector_df['Label']
    return X

In [None]:
positives[]

In [89]:
top10 = (liwc_df, input)

In [94]:
top10

(        TrainSubjectId       POS       PRP       VBD  Length_Title     Month  \
 0    train_subject1095  0.254178  1.768690  0.758135      0.640281  6.037819   
 1    train_subject1190  1.215488  5.666667  1.760943      1.424242  3.370370   
 2    train_subject1191  1.562500  8.562500  2.187500      0.000000  5.000000   
 3    train_subject1199  1.915584  8.357143  1.733766      1.077922  7.253247   
 4    train_subject1201  0.917923  3.443886  0.720268      0.405360  7.812395   
 ..                 ...       ...       ...       ...           ...       ...   
 481  train_subject9940  1.162791  6.837209  4.209302      5.325581  4.767442   
 482  train_subject9956  0.247179  1.573348  0.493283      4.382590  5.829124   
 483  train_subject9958  0.913717  4.714602  1.704277      3.325959  6.312684   
 484  train_subject9972  0.354545  2.892121  1.917576      8.635152  5.478182   
 485  train_subject9974  0.852833  3.152318  0.721854      2.811626  3.174393   
 
           Hour       LWF 

### Run through LIWC

In [85]:
# Load LIWC dictionary
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(word_tokenize(text)) for text in liwc_df['text']]

In [87]:
# Load LIWC-alike dictionary
%run /home_remote/master_thesis/model_evaluation/liwc_alike.py
liwc_alike_output = [main(text, result) for text in liwc_df['text']]