In [1]:
import os

import liwc
import numpy as np
import pandas as pd
from empath import Empath
from tqdm import tqdm

from utils import get_liwc_labels, get_empath_labels
# Register tqdm with pandas so that `.progress_apply` (used below) is available
# and shows a progress bar while applying a function row by row.
tqdm.pandas()

In [2]:
# Read the preprocessed music TSV; the file's first column is used as the index.
df = pd.read_csv(
    '../data/amazon_synthetic/music_preprocessed.tsv',
    sep='\t',
    index_col=0,
)

In [3]:
# Preview the first rows to check the load (columns: text, Y, C, T_proxy).
df.head()

Unnamed: 0,text,Y,C,T_proxy
0,"clever,inspired and moving. this is a great al...",0,0,1
1,keith green is a bit of legend in some christi...,0,1,1
2,buy the cd. do not buy the mp3 album. downlo...,1,0,1
3,"if you're looking for a meditative, contemplat...",1,0,1
4,this is his best cd of all. /good choice of ar...,1,1,1


In [16]:
# Load the LIWC dictionary: `parse` is the token parser returned by the liwc
# package and `category_names` is the list of LIWC category labels.
# The dictionary location can be overridden with the LIWC_DICT_PATH environment
# variable so the notebook is not tied to one machine's home directory; when the
# variable is unset, the original absolute path is used unchanged.
parse, category_names = liwc.load_token_parser(
    os.environ.get(
        'LIWC_DICT_PATH',
        '/home/victorialin/Documents/liwc_dict/LIWC2015_English_Flat.dic',
    )
)

In [17]:
# Display the LIWC category labels returned by load_token_parser.
category_names

['function',
 'pronoun',
 'ppron',
 'i',
 'we',
 'you',
 'shehe',
 'they',
 'ipron',
 'article',
 'prep',
 'auxverb',
 'adverb',
 'conj',
 'negate',
 'verb',
 'adj',
 'compare',
 'interrog',
 'number',
 'quant',
 'affect',
 'posemo',
 'negemo',
 'anx',
 'anger',
 'sad',
 'social',
 'family',
 'friend',
 'female',
 'male',
 'cogproc',
 'insight',
 'cause',
 'discrep',
 'tentat',
 'certain',
 'differ',
 'percept',
 'see',
 'hear',
 'feel',
 'bio',
 'body',
 'health',
 'sexual',
 'ingest',
 'drives',
 'affiliation',
 'achiev',
 'power',
 'reward',
 'risk',
 'focuspast',
 'focuspresent',
 'focusfuture',
 'relativ',
 'motion',
 'space',
 'time',
 'work',
 'leisure',
 'home',
 'money',
 'relig',
 'death',
 'informal',
 'swear',
 'netspeak',
 'assent',
 'nonflu',
 'filler']

In [5]:
# Apply get_liwc_labels to every text, with a tqdm progress bar, and store the
# resulting per-row vector in a new `label_count` column.
# `category_names` must still be the LIWC list from load_token_parser at this
# point; a later cell rebinds that name to the Empath categories.
# NOTE(review): the trailing True is an opaque flag of utils.get_liwc_labels;
# the displayed vectors hold 0.0/1.0 entries, so it presumably requests
# presence indicators rather than raw counts. Confirm against utils.
df['label_count'] = df['text'].progress_apply(
    get_liwc_labels,
    args=(category_names, parse, True),
)

100%|██████████| 21290/21290 [00:08<00:00, 2515.29it/s]


In [6]:
# Preview the frame again to check the newly added `label_count` column.
df.head()

Unnamed: 0,text,Y,C,T_proxy,label_count
0,"clever,inspired and moving. this is a great al...",0,0,1,"[1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, ..."
1,keith green is a bit of legend in some christi...,0,1,1,"[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, ..."
2,buy the cd. do not buy the mp3 album. downlo...,1,0,1,"[1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
3,"if you're looking for a meditative, contemplat...",1,0,1,"[1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ..."
4,this is his best cd of all. /good choice of ar...,1,1,1,"[1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."


In [7]:
# Expand the per-row label vectors into a 2-D table with one column per entry
# of `category_names`.
# Reuse df's index instead of a fresh 0..n-1 range: the cells that follow copy
# columns from df and filter with boolean masks built from df, and pandas
# aligns both by index label. Sharing the index keeps every row paired with
# its own source row even if df's index is not exactly 0..n-1.
count_df = pd.DataFrame(
    np.stack(df['label_count'].values, axis=0),
    columns=category_names,
    index=df.index,
)

In [8]:
# Attach the source text and the Y column next to the category columns.
# Values are matched to rows by index label, not by position.
count_df = count_df.assign(text=df['text'], Y=df['Y'])

In [13]:
# List the columns: the LIWC categories followed by `text` and `Y`.
count_df.columns

Index(['function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they',
       'ipron', 'article', 'prep', 'auxverb', 'adverb', 'conj', 'negate',
       'verb', 'adj', 'compare', 'interrog', 'number', 'quant', 'affect',
       'posemo', 'negemo', 'anx', 'anger', 'sad', 'social', 'family', 'friend',
       'female', 'male', 'cogproc', 'insight', 'cause', 'discrep', 'tentat',
       'certain', 'differ', 'percept', 'see', 'hear', 'feel', 'bio', 'body',
       'health', 'sexual', 'ingest', 'drives', 'affiliation', 'achiev',
       'power', 'reward', 'risk', 'focuspast', 'focuspresent', 'focusfuture',
       'relativ', 'motion', 'space', 'time', 'work', 'leisure', 'home',
       'money', 'relig', 'death', 'informal', 'swear', 'netspeak', 'assent',
       'nonflu', 'filler', 'text', 'Y'],
      dtype='object')

# Split the category table into two subsets by the value of df's `C` column.
# The boolean masks come from df, so they are matched to count_df's rows by
# index label.
count_df0 = count_df[df['C'].eq(0)]
count_df1 = count_df[df['C'].eq(1)]

In [11]:
# Preview the C == 0 subset; row labels are kept from the unsplit frame.
count_df0.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,relig,death,informal,swear,netspeak,assent,nonflu,filler,text,Y
0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"clever,inspired and moving. this is a great al...",0
2,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,buy the cd. do not buy the mp3 album. downlo...,1
3,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"if you're looking for a meditative, contemplat...",1
6,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,i just heard john michael talbot this past sun...,1
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"i have enjoyed these songs for decades, and ha...",1


In [12]:
# Preview the C == 1 subset; row labels are kept from the unsplit frame.
count_df1.head()

Unnamed: 0,function,pronoun,ppron,i,we,you,shehe,they,ipron,article,...,relig,death,informal,swear,netspeak,assent,nonflu,filler,text,Y
1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,keith green is a bit of legend in some christi...,0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,this is his best cd of all. /good choice of ar...,1
5,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,excellent meditations. calms and focuses my s...,1
7,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,this is the best meditative and background mus...,0
11,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,i would recommend this album to any christian ...,1


In [14]:
    # Build the Empath lexicon and collect its category names.
    # NOTE: this rebinds `category_names`, replacing the LIWC list loaded
    # earlier. The LIWC cells that read `category_names` must run before this
    # one, or the LIWC dictionary must be reloaded afterwards.
    lexicon = Empath()
    # Iterating the category mapping yields its keys.
    category_names = list(lexicon.cats)

In [15]:
# Display the Empath category names (this name no longer holds the LIWC list).
category_names

['help',
 'office',
 'dance',
 'money',
 'wedding',
 'domestic_work',
 'sleep',
 'medical_emergency',
 'cold',
 'hate',
 'cheerfulness',
 'aggression',
 'occupation',
 'envy',
 'anticipation',
 'family',
 'vacation',
 'crime',
 'attractive',
 'masculine',
 'prison',
 'health',
 'pride',
 'dispute',
 'nervousness',
 'government',
 'weakness',
 'horror',
 'swearing_terms',
 'leisure',
 'suffering',
 'royalty',
 'wealthy',
 'tourism',
 'furniture',
 'school',
 'magic',
 'beach',
 'journalism',
 'morning',
 'banking',
 'social_media',
 'exercise',
 'night',
 'kill',
 'blue_collar_job',
 'art',
 'ridicule',
 'play',
 'computer',
 'college',
 'optimism',
 'stealing',
 'real_estate',
 'home',
 'divine',
 'sexual',
 'fear',
 'irritability',
 'superhero',
 'business',
 'driving',
 'pet',
 'childish',
 'cooking',
 'exasperation',
 'religion',
 'hipster',
 'internet',
 'surprise',
 'reading',
 'worship',
 'leader',
 'independence',
 'movement',
 'body',
 'noise',
 'eating',
 'medieval',
 'zest',
