In [None]:
from empath import Empath
import os
import sys
from collections import defaultdict
import requests
import json
from collections import defaultdict
import os
import pandas as pd
import re
from tqdm import tqdm
tqdm.pandas()

In [None]:
# Base directory; the lexicon/category resources are looked up under dir_root + '/data/...'.
dir_root = './'

In [None]:
class Empathm(Empath): # we extend the original Empath code to be able to load our own category file
    """Empath lexicon whose categories come from tab-separated files.

    The parent ``__init__`` is not called; the attributes used here
    (``cats``, ``categories``, ``invcats``, ``inv_cache``) are built from
    ``categories_file`` plus every ``*.empath`` file found in ``user_dir``.

    Each lexicon line has the form ``<category>\\t<term>\\t<term>...``.
    """

    def __init__(self, categories_file="./data/categories.tsv",user_dir="./data/user/"):
        """Load the lexicon files and build the term -> categories index.

        Args:
            categories_file: path of the main tab-separated category file.
            user_dir: directory scanned for additional ``<name>.empath`` files.

        Raises:
            OSError: if ``categories_file`` or ``user_dir`` cannot be read.
        """
        self.cats = defaultdict(list)
        self.inv_cache = {}
        self.load(categories_file)
        # Sorted so the load order does not depend on the file system.
        for f in sorted(os.listdir(user_dir)):
            parts = f.split(".")
            if len(parts) > 1 and parts[1] == "empath":
                # os.path.join also works when user_dir lacks a trailing separator
                # (plain concatenation produced e.g. "./data/userfoo.empath").
                self.load(os.path.join(user_dir, f))

        self.categories = self.cats.keys()
        self.invcats = defaultdict(list)
        key = tuple(sorted(self.categories))
        # inv_cache is created empty just above, so this lookup only hits if a
        # caller pre-populates it; kept for compatibility with the parent class.
        if key in self.inv_cache:
            self.invcats = self.inv_cache[key]
        else:
            for k in self.categories:
                for t in self.cats[k]:
                    self.invcats[t].append(k)
            self.inv_cache[key] = self.invcats

    def load(self,file):
        """Append the terms of every line in ``file`` to ``self.cats``.

        The first tab-separated column is the category name, the remaining
        columns are its terms; repeated terms within one line are added once.
        """
        # Explicit UTF-8: the lexicons contain non-ASCII (Spanish) terms and the
        # platform default encoding is not UTF-8 everywhere.
        with open(file,"r",encoding="utf-8") as f:
            for line in f:
                cols = line.strip().split("\t")
                name = cols[0]
                # dict.fromkeys de-duplicates like set() but keeps file order.
                for t in dict.fromkeys(cols[1:]):
                    self.cats[name].append(t)

    def analyzePrefix(self,doc,categories=None,tokenizer="default",normalize=False):  # we extend the original analyze method to consider prefixes instead of full matchings
        """Count, per category, how many whitespace tokens of ``doc`` are lexicon terms.

        NOTE(review): despite the name and the comment above, the lookup is an
        exact token match against the lexicon terms; no prefix comparison is
        performed here -- confirm whether that is intended.

        Args:
            doc: text to analyse; split on whitespace.
            categories: unused.
            tokenizer: unused.
            normalize: if true, divide every count by the number of tokens.

        Returns:
            dict mapping every category to its (optionally normalised) count,
            or ``None`` when ``normalize`` is true, ``doc`` has no tokens and at
            least one category is loaded.
        """
        count = {cat: 0.0 for cat in self.categories}
        tokens = 0.0
        for tk in doc.split():
            tokens += 1.0
            # .get() instead of [] so unseen tokens are not inserted into the
            # defaultdict (which would grow with every distinct word analysed).
            for cat in self.invcats.get(tk, ()):
                count[cat] += 1.0
        if normalize and count:
            if tokens == 0:
                return None
            for cat in count:
                count[cat] = count[cat] / tokens
        return count

In [None]:
def process_emphat(root,file,lexicon,name):
    """Score every text of one input file against ``lexicon`` and store the result.

    The output is written to ``root/empath_<name><file>`` as CSV; if that file
    already exists nothing is done. Rows containing NaN are dropped before
    scoring, and the function returns without writing when no rows remain.

    Args:
        root: directory containing ``file``.
        file: input file name; read with ``pd.read_csv`` when it ends in
            ``csv``, otherwise with ``pd.read_pickle``. Must provide ``text``
            and ``tweetId`` columns.
        lexicon: object exposing ``analyzePrefix(text, normalize=True)``.
        name: infix inserted in the output file name.
    """
    out_path = root+"/empath_"+name+file
    if os.path.exists(out_path):
        return

    in_path = root+"/"+file
    if file.endswith('csv'):
        df = pd.read_csv(in_path).dropna()
    else:
        # NOTE(review): unpickling executes arbitrary code; only use trusted files.
        df = pd.read_pickle(in_path).dropna()

    if len(df) == 0:
        return

    print("Processing",file)

    listi = []
    for index, row in tqdm(df.iterrows(), total=len(df)):
        d = lexicon.analyzePrefix(row['text'], normalize=True)
        if d is None:
            # analyzePrefix yields None for texts without tokens (e.g. only
            # whitespace); previously this crashed on d.items().
            continue
        d =  {key:val for key, val in d.items() if val > 0}
        if len(d) > 0: # no need to store empty texts
            d['tweetId'] = row['tweetId']
            listi.append(d)

    cats = pd.DataFrame(listi)
    cats = cats.reindex(sorted(cats.columns), axis=1)
    if 'tweetId' in cats:
        col = cats.pop("tweetId")
        cats.insert(0, col.name, col) # in place
    # Written even when empty: the file's existence marks the input as processed.
    cats.to_csv(out_path,index=False,encoding='utf8')

def loop_process_emphat(datapath,prefix,user_path='./data/users/',categories_file="./data/es_categories.tsv",name=''):
    """Run ``process_emphat`` on every file below ``datapath`` whose name starts with ``prefix``.

    A single ``Empathm`` lexicon is built from ``categories_file`` and
    ``user_path`` and shared by all files; ``name`` is forwarded unchanged.
    """
    shared_lexicon = Empathm(categories_file=categories_file, user_dir=user_path)

    for folder, _, entries in os.walk(datapath):
        for entry in tqdm(entries):
            if not entry.startswith(prefix):
                continue
            process_emphat(folder, entry, shared_lexicon, name)

In [None]:
import nltk
from collections import Counter

# Fetch the NLTK resources needed by the cells below.
for resource in (
    'wordnet',
    'omw',    # Open Multilingual WordNet
    'punkt',  # tokenizer
):
    nltk.download(resource)

from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize

In [None]:
import xml.etree.ElementTree as ET

# --- SentiSense: synset key -> emotion label -------------------------------
sentiSensePath = dir_root + '/data/SentiSense_Synsets_ES_30.xml' # not a resource created as part of this projects, need to be downloaded
tree = ET.parse(sentiSensePath)
root = tree.getroot()

sentiSenseDict={}

for child in root:
    synset = child.attrib.get('synset')
    if synset is None:
        # Elements without a synset attribute cannot be keyed; skip them
        # instead of failing with a TypeError on None[4:].
        continue
    # Drop the first 4 and the last 2 characters of the attribute value.
    # NOTE(review): presumably a fixed prefix and a "-<pos>" suffix -- confirm
    # against the XML.
    key = synset[4:-2]
    sentiSenseDict[key] = child.attrib.get('emotion')

import csv

# --- WordNet lemma -> "<offset>-<pos>" for synsets known to SentiSense ------
wordnetLAS = dir_root + '/data/wn-data-las.csv'

wordnetLASDict = {}

# `with` closes the file once it is read; newline='' is what the csv module
# expects for files it parses.
with open(wordnetLAS,encoding='utf8',newline='') as las_file:
    reader = csv.DictReader(las_file)
    for row in reader:
        if row.get('SYNSET_OFFSET') in sentiSenseDict:
            value = row.get('SYNSET_OFFSET')+"-"+row.get('POS_TAG')
            # A lemma listed more than once keeps the last matching synset.
            wordnetLASDict[row.get('LEMMA')] = value.lower()

In [None]:
def senti_wordnet(root,file,column='text'): 
    """Compute the relative emotion weights of every text in one input file.

    The file is read from ``root/file`` (CSV when the name ends in ``csv``,
    pickle otherwise) and rows containing NaN are dropped. Each token of
    ``column`` is lower-cased, looked up in ``wordnetLASDict`` and, through the
    resulting key without its last two characters, in ``sentiSenseDict``. The
    per-row emotion counts are divided by their row total and written to
    ``root/sentiwordnet_<file>`` indexed by ``tweetId``. Nothing is written
    when the input has no rows.
    """
    records = []
    source = root + '/' + file
    if file.endswith('csv'):
        df = pd.read_csv(source).dropna()
    else:
        df = pd.read_pickle(source).dropna()

    if len(df) == 0:
        return

    print('Processing '+file)

    for _, row in tqdm(df.iterrows()):
        text = row[column]
        tweet_id = row['tweetId']
        emotions = Counter()

        if len(text.strip()) > 1:
            try:
                for token in word_tokenize(text):
                    synset_key = wordnetLASDict.get(token.lower())
                    if synset_key is None:
                        continue
                    label = sentiSenseDict.get(synset_key[0:-2])
                    if label is not None:
                        emotions[label] += 1
            except IndexError:
                # Tokenisation failed: discard any partial counts for this row.
                emotions = {}
        else:
            # Texts of at most one character are recorded without emotions.
            emotions = {}

        emotions['tweetId'] = tweet_id
        records.append(emotions)

    weights = pd.DataFrame(records).set_index('tweetId')
    weights = weights.div(weights.sum(axis=1), axis=0) # relative weight in row

    weights.to_csv(root+'/'+'sentiwordnet_'+file)

def loop_senti_wordnet(datapath,prefix,column='text'):
    """Run ``senti_wordnet`` on every not-yet-processed file below ``datapath``.

    A file qualifies when its name starts with ``prefix`` and no
    ``sentiwordnet_<file>`` output exists next to it.
    """
    for folder, _, entries in os.walk(datapath):
        for entry in tqdm(entries):
            pending = entry.startswith(prefix) and not os.path.exists(folder+'/'+'sentiwordnet_'+entry)
            if pending:
                senti_wordnet(folder, entry, column=column)

In [None]:
# Directory tree walked (os.walk) for input files and per-file outputs.
datapath = "./" 
# Only files whose name starts with this prefix are processed.
prefix = "day_"
# Directory listed by the merge step; it also holds processed_files/ and
# already_processed_files.pickle.
dir_outs = './'

In [None]:
# Run the Empath analysis once per lexicon; the second element is the infix
# that distinguishes the output file names.
for categories_file, name in (
    (dir_root + "/data/es_categories.tsv", ''),
    (dir_root + '/data/es_categories_mental_health.tsv', 'mental_'),
    (dir_root + '/data/es_categories_crisis.tsv', 'crisis_'),
):
    loop_process_emphat(datapath,prefix,categories_file=categories_file,name=name)

# Emotion weights from SentiSense / WordNet.
loop_senti_wordnet(datapath,prefix,column='text')

In [None]:
# depending on the number of files or tweets to process, a unification of day files might be needed -- this loops over all created day_ files
# Group the output files in dir_outs by their name without the last "_" part,
# leaving out files already recorded in already_processed_files.pickle.
file_names = defaultdict(set)

processed_index = dir_outs + 'already_processed_files.pickle'
if os.path.exists(processed_index):
    already_computed = pd.read_pickle(processed_index)
else:
    already_computed = defaultdict(set)

# Inputs, bookkeeping files and earlier results are not merge candidates.
skip_prefixes = ('processed', 'day', '_', 'tweets_filtered', '__alr', 'already')

for ff in tqdm(os.listdir(dir_outs)):
    if ff.endswith('.zip') or ff.startswith(skip_prefixes):
        continue

    group = '_'.join(ff.split('_')[0:-1])
    if group not in already_computed or ff not in already_computed[group]:
        file_names[group].add(ff)

for key,files in file_names.items():
    print(key,len(files))

In [None]:
import pickle

# Merge every group collected in file_names into
# dir_outs/processed_files/<group>.csv, one row per tweetId.
processed_dir = dir_outs + 'processed_files/'
os.makedirs(processed_dir, exist_ok=True)  # to_csv does not create directories

for key,files in tqdm(file_names.items()):
    print(key)

    # Collect the frames and concatenate once (growing a frame inside the loop
    # is quadratic). Sorted so that the row kept for a duplicated tweetId does
    # not depend on set iteration order.
    frames = []
    for ff in tqdm(sorted(files)):
        try:
            frames.append(pd.read_csv(dir_outs + ff))
        except pd.errors.EmptyDataError:
            continue  # empty output file: nothing to merge

    target = processed_dir + key + '.csv'
    if os.path.exists(target):
        previous = pd.read_csv(target)
        # The merged file is written with its index, which comes back as an
        # "Unnamed: N" column; drop it so it does not pile up on every run.
        stale = previous.columns.astype(str).str.startswith('Unnamed:')
        # Appended last: on duplicated tweetIds the freshly read rows win.
        frames.append(previous.loc[:, ~stale])

    if frames:
        dd = pd.concat(frames, ignore_index=True)
        # Guarded: frames without a tweetId column would raise a KeyError.
        if 'tweetId' in dd.columns:
            dd = dd.drop_duplicates(subset=['tweetId']).reset_index(drop=True)
        dd.to_csv(target)

    # here we update the already_processed_files, so in next evaluation there's no need to reprocess this
    already_computed[key].update(files)
    with open(dir_outs + 'already_processed_files.pickle','wb') as fil:
        pickle.dump(already_computed,fil)


In [None]:
import os

def get_representation(df,column,which='median',file=None):
    """Summarise one category column of a per-day feature table as a single number.

    Args:
        df: feature table; for the per-user summary its index must hold the
            tweet ids.
        column: name of the column to summarise.
        which: ``'median'`` or ``'mean'`` of the column, ``'per_tweets'`` for
            the share of rows with a value > 0, or any other value for the
            share of users having at least one tweet with a non-missing value.
        file: CSV with ``tweetId`` and ``userId`` columns; only read for the
            per-user summary.

    Returns:
        The requested statistic; 0 when ``'per_tweets'`` has no rows or the
        per-user summary has no users.
    """
    if which == 'median':
        return df[column].median()

    if which == 'mean':
        return df[column].mean()

    if which == 'per_tweets':
        if len(df[column]) == 0:
            return 0
        return len(df[df[column] > 0]) / len(df[column])

    # Per-user share. Tweets are matched by id: the feature table may hold
    # fewer rows than the day file, so pairing rows by position would attach
    # values to the wrong users.
    users = pd.read_csv(file)
    scored_ids = set(df.index[df[column].notna()])
    has_value = users['tweetId'].isin(scored_ids)

    n_users = users['userId'].nunique()
    if n_users == 0:
        return 0
    return users.loc[has_value, 'userId'].nunique() / n_users

def load_categories(datapath,prefix,which='median',selected_cats=None,summarized=True): 
    """Collect per-file category values from every ``prefix``-named CSV below ``datapath``.

    Args:
        datapath: directory tree to walk.
        prefix: only files whose name starts with this are read; ``.zip``
            files are skipped.
        which: statistic forwarded to ``get_representation`` when
            ``summarized`` is true.
        selected_cats: optional collection of column names to keep; ``None``
            keeps every column.
        summarized: if true store one number per column, otherwise the
            column's non-NaN values as an array.

    Returns:
        ``(values, dates, cats)``: ``values`` maps file name to
        ``{column: value}``, ``dates`` is the sorted list of those file names,
        and ``cats`` is ``selected_cats`` when given, otherwise the set of all
        columns seen.
    """
    values = {}
    cats = set()

    for root, _, files in os.walk(datapath):
        for file in tqdm(files):

            if not file.startswith(prefix):
                continue

            if file.endswith('.zip'):
                continue

            try:
                df = pd.read_csv(root+'/'+file)
            except pd.errors.EmptyDataError:
                # Days without any matching tweet are stored as empty files;
                # skip them instead of aborting the whole scan.
                continue
            if 'tweetId' not in df.columns:
                continue  # not a per-tweet feature table
            df = df.set_index('tweetId')
            cats.update(df.columns)

            dicti = {}
            for c in df.columns:
                if selected_cats is None or c in selected_cats:

                    if summarized:
                        # `file` names the matching input file; it is only read
                        # by get_representation for its per-user summary.
                        dicti[c] = get_representation(df,column=c,which=which,file=root+'/'+'day'+file.replace(prefix,''))
                    else:
                        dicti[c] = df[c].dropna().values

            values[file] = dicti

    dates = sorted(values.keys())

    if selected_cats is not None:
        return values,dates,selected_cats

    return values,dates,cats

def transform_df(values,dates,cat): 
    """Build a table with one column per date holding the values of ``cat``.

    Args:
        values: mapping ``date -> {category: array-like of values}``.
        dates: iterable of keys of ``values`` giving the column order.
        cat: category to extract; dates lacking it are left out.

    Returns:
        DataFrame whose columns are the dates that have ``cat``; shorter
        columns are padded with NaN. Empty DataFrame when no date has ``cat``.
    """
    columns = []
    for d in dates:
        vc = values[d].get(cat)
        if vc is not None:
            columns.append(pd.DataFrame(vc,columns=[d]))

    if not columns:
        return pd.DataFrame()
    # One concat for all columns instead of re-copying the frame per date.
    return pd.concat(columns, axis=1)

In [None]:
# One category x file matrix per family of output files, each saved as
# all_<prefix>_full.csv; the matrix of the last family is displayed.
cati = None

for prefix in ('empath_day', 'sentiwordnet_day', 'empath_crisis', 'empath_mental'):
    valstD, datestD, catstD = load_categories(datapath,prefix,which='per_tweets',selected_cats=cati,summarized=True)

    dataftD = pd.DataFrame(valstD,columns=datestD) # matrix
    dataftD.to_csv(f"{datapath}/all_{prefix}_full.csv")

dataftD

In [None]:
# in case we want to adjust column names

def _normalise_day_label(label, file_prefix):
    """Turn ``<file_prefix>_<Y>-<M>-<D>.csv`` into ``<Y>-<MM>-<DD>``.

    Labels that do not contain a three-part ``-`` separated date (for example
    category names) are returned unchanged.
    """
    if not isinstance(label, str):
        return label
    # lstrip removes the "_" left between the prefix and the date.
    stem = label.replace(file_prefix, '').replace('.csv', '').lstrip('_')
    parts = stem.split('-')
    if len(parts) < 3:
        return label
    return parts[0] + '-' + parts[1].zfill(2) + '-' + parts[2].zfill(2)


for file_prefix, source_csv, target_csv in (
    ('sentiwordnet_day', f"{datapath}/all_sentiwordnet_day_full.csv", f'{datapath}11.Tweets_emotions.csv'),
    ('empath_day', f"{datapath}/all_empath_day_full.csv", f'{datapath}10.Tweets_empath.csv'),
):
    aa = pd.read_csv(source_csv,index_col=0).T.fillna(0)
    # After the transpose the file labels sit on the index while the columns
    # hold the categories. Relabel both axes so the dates are normalised
    # wherever they are; category names pass through untouched.
    relabel = lambda label, _prefix=file_prefix: _normalise_day_label(label, _prefix)
    aa = aa.rename(index=relabel, columns=relabel)
    aa.to_csv(target_csv)