In [9]:
# Root directory that holds the pickled dataframes and the LIWC dictionary loaded below.
HOME_DIR = "/home_remote"
# NOTE(review): user-specific absolute path; it is not referenced in the cells visible here — confirm it is still needed.
HOME = "/home/thi.tra.my.nguyen"

from liwc import Liwc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk import word_tokenize, ngrams

In [2]:
# Resolve the locations of the two pickled dataframes under HOME_DIR.
positives_df_path, negatives_df_path = (
    os.path.join(HOME_DIR, file_name)
    for file_name in ("positive_df.pkl", "negative_df.pkl")
)

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
positives, negatives = (
    pd.read_pickle(pickle_path)
    for pickle_path in (positives_df_path, negatives_df_path)
)

In [21]:
def construct_liwc_input(df, label):
  """
  Aggregate the writings of each user into a single row suitable for LIWC parsing.

  params: df - The positive/negative dataframe loaded from pickle, one row per writing.
    It is expected to have the columns "TrainSubjectId", "Title" and "Text".
    The dataframe itself is not modified.
  params: label - The label assigned to every row of the result dataframe

  returns: A dataframe with one row per "TrainSubjectId" and the columns
    "TrainSubjectId", "AverageLength" (mean number of "Text" tokens per writing),
    "text" (body and title of all writings of the user, joined by spaces),
    "NumOfWritings" and "Label"
  """
  # Missing bodies/titles are treated as empty strings so tokenizing and joining never see NaN
  body = df["Text"].fillna("")
  title = df["Title"].fillna("")

  # Work on a fresh frame instead of adding helper columns to the caller's dataframe
  writings = df[["TrainSubjectId"]].assign(
    TokenCount=body.apply(lambda text: len(word_tokenize(text))),
    # the separating space keeps the last word of the body from fusing with the first word of the title
    text=body + " " + title,
  )
  grouped_by_subject_id = writings.groupby("TrainSubjectId")

  result_df = pd.DataFrame({
    # average token length of a writing for each user
    "AverageLength": grouped_by_subject_id["TokenCount"].mean(),
    # all writings of a single user joined into a single corpus
    "text": grouped_by_subject_id["text"].apply(" ".join),
    # number of writings for each user
    "NumOfWritings": grouped_by_subject_id["TokenCount"].size(),
  }).rename_axis("TrainSubjectId").reset_index()
  result_df["Label"] = label

  return result_df

In [22]:
# Build the per-user LIWC input for both classes: positives are labelled 1, negatives 0.
input_positives, input_negatives = (
    construct_liwc_input(frame, class_label)
    for frame, class_label in ((positives, 1), (negatives, 0))
)

In [93]:
# Concatenate the two labelled frames and shuffle the rows.
# A fixed seed keeps the row order identical on every re-run, so anything computed
# row-by-row from an earlier run of this cell (e.g. per-row LIWC vectors) stays
# aligned with the rows of `liwc_input`.
SHUFFLE_SEED = 42
liwc_input = pd.concat([input_positives, input_negatives], ignore_index=True)
liwc_input = liwc_input.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [94]:
# Inspect the shuffled, concatenated LIWC input (one row per user).
liwc_input

Unnamed: 0,TrainSubjectId,AverageLength,text,NumOfWritings,Label
0,train_subject4988,61.583333,damn we have a lot of fucking nitwits El...,72,0
1,train_subject7367,39.826087,what the ever loving fuck in light of re...,23,1
2,train_subject9763,101.767606,What's your best natural beauty \n\nThat amaz...,142,1
3,train_subject3804,214.666667,I've heard a of combo with Warlock's Floating...,24,0
4,train_subject2892,28.214286,quarkel and obama but the obama is hiding in ...,14,0
...,...,...,...,...,...
481,train_subject7135,59.025845,I c-can't. It's too powerful. My brain d...,503,1
482,train_subject5469,30.392354,This video contains content from NBC Universa...,1988,0
483,train_subject8344,30.027886,Well it's not as if they really need to try t...,1793,0
484,train_subject3236,13.728000,hahaha looool glad to hear that! yeah pi...,125,0


### LIWC Features

In [30]:
# Load the LIWC dictionary, then tokenize each user's joined text and parse the tokens with it.
# NOTE(review): `input` shadows the Python builtin; the name is kept because a later cell reads it.
liwc = Liwc(os.path.join(HOME_DIR, "master_thesis/LIWC2007_English100131.dic"))
input = [liwc.parse(tokens) for tokens in map(word_tokenize, liwc_input['text'])]

In [98]:
# Attach the parsed LIWC results as a new column, one entry per row.
# NOTE(review): `input` was built positionally from `liwc_input['text']`, so it is only
# aligned if `liwc_input` still has the row order it had at that time. The execution
# counts (In [30] for the parse, In [93] for the shuffle) suggest the frame may have been
# re-shuffled in between — verify with a fresh Restart & Run All.
liwc_input['Vector'] = input

In [77]:
# Inspect the frame again, now including the 'Vector' column.
liwc_input

Unnamed: 0,TrainSubjectId,AverageLength,text,NumOfWritings,Label,Vector
0,train_subject4769,156.193548,Pretty great Vietnamese. Their pho is great! ...,155,0,"{'verb': 36, 'present': 22, 'motion': 4, 'rela..."
1,train_subject8395,20.942857,Thanks a lot. That was done on a Nikon D7000 ...,105,0,"{'verb': 1723, 'funct': 6567, 'auxverb': 894, ..."
2,train_subject7515,48.000000,Title says it all. No harm in 3 years aside f...,59,1,"{'funct': 158, 'conj': 20, 'cogmech': 37, 'cau..."
3,train_subject3615,65.838812,Waco officials seek to quash subpoena issue...,1886,0,"{'funct': 9369, 'pronoun': 1677, 'ipron': 927,..."
4,train_subject6847,58.819767,It is unlikely that it would have been much o...,172,0,"{'verb': 5247, 'funct': 22512, 'auxverb': 3253..."
...,...,...,...,...,...,...
481,train_subject9536,38.312500,Join the Freemasons Sundae fundae ...,16,0,"{'relativ': 351, 'space': 206, 'verb': 339, 'f..."
482,train_subject1190,63.427609,"I'm sure I'll get flamed for this, but true f...",297,1,"{'work': 20, 'funct': 186, 'preps': 49, 'affec..."
483,train_subject7135,59.025845,I c-can't. It's too powerful. My brain d...,503,1,"{'verb': 400, 'funct': 1447, 'auxverb': 226, '..."
484,train_subject5366,11.363636,In the new Project Brutality there is option ...,11,0,"{'funct': 23941, 'pronoun': 4673, 'ppron': 275..."


In [99]:
# Add (or overwrite) a key-value pair in the counter
def add_to_counter(counter, key, value):
    """
    Set `counter[key]` to `value` and return the same counter.

    The counter is modified in place; the returned object is the one passed in,
    not a copy. An existing entry for `key` is replaced, not incremented.
    """
    counter[key] = value
    return counter

# Add AverageLength and NumOfWritings to every row's LIWC counter.
# The counters are updated in place, so `liwc_input['Vector']`, `vector_avg` and
# `vector_num` all end up referencing the same enriched counter objects.
vector = liwc_input['Vector']
average_length = liwc_input['AverageLength']
num_of_writings = liwc_input['NumOfWritings']

# Pair the columns positionally with zip: `vector[i]` is a label lookup and is only
# correct while the frame has a default 0..n-1 index.
vector_avg = [add_to_counter(counter, 'AverageLength', avg) for counter, avg in zip(vector, average_length)]
vector_num = [add_to_counter(counter, 'NumOfWritings', count) for counter, count in zip(vector, num_of_writings)]

In [100]:
# Spot-check the first counter; after the cell above it should carry the LIWC category
# counts plus the 'AverageLength' and 'NumOfWritings' entries.
vector_num[0]

Counter({'verb': 36,
         'present': 22,
         'motion': 4,
         'relativ': 27,
         'funct': 163,
         'adverb': 15,
         'preps': 48,
         'space': 12,
         'article': 33,
         'cogmech': 54,
         'discrep': 9,
         'conj': 15,
         'incl': 9,
         'insight': 6,
         'cause': 7,
         'inhib': 4,
         'negate': 11,
         'excl': 15,
         'affect': 10,
         'negemo': 3,
         'anger': 3,
         'pronoun': 26,
         'ipron': 11,
         'auxverb': 16,
         'future': 2,
         'social': 25,
         'quant': 5,
         'tentat': 13,
         'time': 13,
         'past': 11,
         'posemo': 7,
         'ppron': 15,
         'they': 2,
         'leisure': 6,
         'work': 3,
         'percept': 5,
         'hear': 3,
         'you': 10,
         'filler': 2,
         'humans': 2,
         'bio': 5,
         'ingest': 2,
         'family': 1,
         'shehe': 2,
         'body': 2,
         'swe