# Annotating CSV files with sentiment info

In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import datetime

## Loading Sentiment Dictionary - DK

In [3]:
#load sentiment dictionary
#each line is tab-separated: word, PoS tag, score
#the encoding is given explicitly so non-ASCII letters are decoded the same way on
#every platform (the default of open() depends on the locale); blank lines are skipped
with open('sentdict_pos_DK.txt', 'r', encoding='utf-8') as f:
    poslist = [el.split('\t') for el in f if el.strip()]

with open('sentdict_neg_DK.txt', 'r', encoding='utf-8') as f:
    neglist = [el.split('\t') for el in f if el.strip()]

#aggregate sentiment words and their scores; keys have the form "<word>-<PoS>",
#which is the same shape as the lemma keys built in read_and_score
words = np.array([line[0]+'-'+line[1] for line in poslist + neglist])
scores = np.array([float(line[2].rstrip()) for line in poslist + neglist])

#plain dict mapping "<word>-<PoS>" to its score; looked up per token in score_doc
sent_dict = dict(zip(words, scores))

## Loading Sentiment Dictionary - BG

In [2]:
#load sentiment dictionary
#each line is tab-separated: word, PoS tag, score
#the encoding is given explicitly so the Cyrillic entries are decoded the same way on
#every platform (the default of open() depends on the locale); blank lines are skipped
with open('sentdict_pos_BG.txt', 'r', encoding='utf-8') as f:
    poslist = [el.split('\t') for el in f if el.strip()]

with open('sentdict_neg_BG.txt', 'r', encoding='utf-8') as f:
    neglist = [el.split('\t') for el in f if el.strip()]

#aggregate sentiment words and their scores; keys have the form "<word>-<PoS>",
#which is the same shape as the lemma keys built in read_and_score
words = np.array([line[0]+'-'+line[1] for line in poslist + neglist])
scores = np.array([float(line[2].rstrip()) for line in poslist + neglist])

#plain dict mapping "<word>-<PoS>" to its score; looked up per token in score_doc
sent_dictbg = dict(zip(words, scores))

## Functions for reading and scoring

In [3]:
def read_and_score(filename, lang, retur = False):
    """Annotate one tab-separated token file with a per-token sentiment column.

    The file is read with every column as object dtype, a "<lemma>-<pos>" key is
    built for each row, the keys are scored with score_doc, and the result is
    written as CSV to the path returned by create_file.

    filename: path of the input file (must have 'lemma' and 'pos' columns).
    lang:     language code passed on to score_doc.
    retur:    when true, the annotated DataFrame is returned (handy while testing);
              otherwise the function returns None.
    """
    tokens = pd.read_csv(filename, delimiter = "\t", dtype = object)

    #one "<lemma>-<pos>" key per row
    keys = (tokens['lemma'] + '-' + tokens['pos']).to_numpy()

    #score the keys and attach the result as a new column
    tokens['sentiment'] = score_doc(keys, lang)

    #write the annotated table next to the other tagged files
    target = create_file(filename)
    tokens.to_csv(target)

    return tokens if retur else None

In [4]:
def create_file(filename, out_root=None):
    """Build the output path for the sentiment-annotated copy of `filename`.

    The result is <out_root>/<parent folder name of filename>/<name>.sent<ext>,
    e.g. "corpus/2015/doc.ana.csv" -> "<out_root>/2015/doc.ana.sent.csv".

    filename: path of the input file; its immediate parent folder name is reused
              as the subfolder of the output path.
    out_root: root folder for the output. Defaults to the notebook-level variable
              `newroot`, which must then be defined before the call.
    Returns the new path as a string; nothing is created on disk.
    """
    if out_root is None:
        #fall back to the global set in the "Creating tagged corpus" cells
        out_root = newroot

    folder, basename = os.path.split(filename)
    year = os.path.basename(folder)
    stem, ext = os.path.splitext(basename)

    return os.path.join(out_root, year, stem + '.sent' + ext)

In [5]:
def score_doc(array, lang):
    """Score each "<lemma>-<pos>" key in `array` with the sentiment dictionary.

    array: sequence of "<lemma>-<pos>" strings, one per token.
    lang:  "dk" (uses sent_dict) or "bg" (uses sent_dictbg).

    Returns a float array of the same length. Keys missing from the dictionary
    get NaN, and so does every position returned by create_intervals (tokens
    between a negation word and the following punctuation).

    Raises ValueError for any other `lang` (previously this surfaced as an
    UnboundLocalError further down).
    """
    #the dictionaries are referenced lazily so that only the one for the
    #requested language has to be loaded in the session
    if lang == "dk":
        lexicon = sent_dict
    elif lang == "bg":
        lexicon = sent_dictbg
    else:
        raise ValueError("unsupported language code: %r (expected 'dk' or 'bg')" % (lang,))

    res = np.array([lexicon.get(lemma, np.nan) for lemma in array], dtype=float)

    #neutralize everything inside a negation scope
    neg_intervals = create_intervals(array, lang)
    if len(neg_intervals) > 0:
        res[neg_intervals] = np.nan
    return res

In [6]:
def create_intervals(array, lang):
    """Return the token positions whose sentiment should be neutralized.

    For every negation word, the positions from the negation itself up to (but
    not including) the next token tagged PUNCT are collected. A negation with no
    punctuation after it contributes nothing. Overlapping scopes may yield
    duplicate positions, which is harmless when the result is used as an index.

    array: sequence of "<lemma>-<pos>" strings, one per token.
    lang:  "dk" or "bg"; selects the list of negation words.

    Returns a 1-D integer numpy array (empty when nothing is negated).
    Raises ValueError for an unsupported `lang`.
    """
    if lang == 'dk':
        negs = {"ikke", "hverken", "ingen", "aldrig"}
    elif lang == 'bg':
        negs = {"не", "нито", "никой", "никога"}
    else:
        raise ValueError("unsupported language code: %r (expected 'dk' or 'bg')" % (lang,))

    neg_positions = []
    punct_positions = []
    for pos, token in enumerate(array):
        if not isinstance(token, str):
            #missing lemma/pos (e.g. NaN after concatenation): neither negation nor punctuation
            continue
        #split on the LAST hyphen: the tag never contains one, but the lemma may
        #(a "-" punctuation token is "--PUNCT")
        word, sep, tag = token.rpartition("-")
        if not sep:
            word, tag = token, ""
        if word in negs:
            neg_positions.append(pos)
        if tag == 'PUNCT':
            punct_positions.append(pos)

    intervals = []
    p = 0  #index of the first punctuation that may still close a negation scope
    for start in neg_positions:
        #both position lists are ascending, so the pointer only moves forward
        while p < len(punct_positions) and punct_positions[p] <= start:
            p += 1
        if p == len(punct_positions):
            break  #no punctuation left: this and all later negations stay open
        intervals.extend(range(start, punct_positions[p]))

    return np.array(intervals, dtype=int)

## Creating tagged corpus - DK

In [12]:
#Create directory for tagged files
newroot = "ParlaMint-DK.TEI.CSV.SENT"
#exist_ok makes this a no-op when the folder is already there (no separate existence check)
os.makedirs(newroot, exist_ok=True)

In [13]:
#Create subdirectories by year for tagged files
for root, dirs, files in os.walk("ParlaMint-DK.TEI.CSV", topdown=False):
    for name in dirs:
        #only folders with a purely numeric name are mirrored under newroot
        if name.isnumeric():
            os.makedirs(os.path.join(newroot, name), exist_ok=True)

In [16]:
#walk the Danish corpus and write a sentiment-annotated copy of every '*ana.csv' file
for root, dirs, files in os.walk("ParlaMint-DK.TEI.CSV", topdown=False):
    for name in tqdm(files):
        if not name.endswith('ana.csv'):
            continue  #skip everything that is not an annotated token file
        name = os.path.join(root, name)
        read_and_score(name, "dk")

100%|███████████████████████████████████████████| 39/39 [00:15<00:00,  2.58it/s]
100%|███████████████████████████████████████████| 79/79 [00:35<00:00,  2.21it/s]
100%|█████████████████████████████████████████| 100/100 [00:51<00:00,  1.94it/s]
100%|█████████████████████████████████████████| 112/112 [01:03<00:00,  1.76it/s]
100%|█████████████████████████████████████████| 102/102 [00:49<00:00,  2.04it/s]
100%|█████████████████████████████████████████| 137/137 [01:14<00:00,  1.84it/s]
100%|█████████████████████████████████████████| 153/153 [01:13<00:00,  2.09it/s]
100%|█████████████████████████████████████████| 109/109 [00:56<00:00,  1.94it/s]
100%|█████████████████████████████████████████| 116/116 [01:08<00:00,  1.70it/s]
0it [00:00, ?it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 24966.10it/s]


## Creating tagged corpus - BG

In [7]:
#Create directory for tagged files
newroot = "ParlaMint-BG.TEI.CSV.SENT"
#exist_ok makes this a no-op when the folder is already there (no separate existence check)
os.makedirs(newroot, exist_ok=True)

In [8]:
#Create subdirectories by year for tagged files
for root, dirs, files in os.walk("ParlaMint-BG.TEI.CSV", topdown=False):
    for name in dirs:
        #only folders with a purely numeric name are mirrored under newroot
        if name.isnumeric():
            os.makedirs(os.path.join(newroot, name), exist_ok=True)

In [9]:
#walk the Bulgarian corpus and write a sentiment-annotated copy of every '*ana.csv' file
for root, dirs, files in os.walk("ParlaMint-BG.TEI.CSV", topdown=False):
    for name in tqdm(files):
        if not name.endswith('ana.csv'):
            continue  #skip everything that is not an annotated token file
        name = os.path.join(root, name)
        read_and_score(name, "bg")
    

100%|███████████████████████████████████████████| 27/27 [00:10<00:00,  2.68it/s]
100%|███████████████████████████████████████████| 87/87 [00:38<00:00,  2.26it/s]
100%|█████████████████████████████████████████| 132/132 [00:46<00:00,  2.85it/s]
100%|█████████████████████████████████████████| 101/101 [00:28<00:00,  3.54it/s]
100%|█████████████████████████████████████████| 122/122 [00:31<00:00,  3.90it/s]
100%|███████████████████████████████████████████| 70/70 [00:24<00:00,  2.86it/s]
100%|█████████████████████████████████████████| 122/122 [00:35<00:00,  3.47it/s]
100%|█████████████████████████████████████████| 132/132 [00:40<00:00,  3.25it/s]
100%|█████████████████████████████████████████| 128/128 [00:40<00:00,  3.17it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 35848.75it/s]


# Aggregating data 

    # see Rheault et al. about representing the divergence in terms of deviation from the mean
    # remember to divide by number of words when assessing sentiment score

## Scoring all speeches - DK

In [31]:
#header-only file; the per-document rows are appended below without a header,
#so their column order has to match this list
big_df = pd.DataFrame({"ID": [], "Sentiment":[], "Word_total":[], "Score":[], "Title":[], "Date":[],  "Body":[],
                       "Term":[],"Session":[], "Meeting":[], "Sitting":[],"Agenda":[], "Subcorpus": [], 
                       "Speaker_role": [],"Speaker_MP": [], "Speaker_Minister": [], "Speaker_party": [],  
                       "Speaker_party_name": [], "Party_status":[], "Speaker_name":[], "Speaker_gender":[], 
                       "Speaker_birth":[], "Topic_title":[], "Question":[], "Debate":[]})
big_df.to_csv("ParlaMint-DK-SENT.csv")

metaroot = "ParlaMint-DK.txt" #root folder of the per-year metadata files
sent_suffix = "ana.sent.csv"

for root, dirs, files in os.walk("ParlaMint-DK.TEI.CSV.SENT", topdown=False):
    files = [name for name in files if name.endswith(sent_suffix)]
    year = os.path.split(root)[1]
    for name in tqdm(files):
        sentfile = os.path.join(root, name)
        #cut off exactly the suffix (and the dot before it); str.rstrip treats its
        #argument as a set of characters and could also eat the end of the base name
        basename = name[:-len(sent_suffix)].rstrip(".")
        metafile = os.path.join(metaroot, year, basename + "-meta.csv")

        df = pd.read_csv(sentfile, dtype = object, index_col = 0)
        #remove punctuation from word count; copy so the column assignment below
        #works on an independent frame rather than a slice
        df = df[df["pos"] != "PUNCT"].copy()
        meta = pd.read_csv(metafile, delimiter = "\t", index_col = 0)

        #per-speech totals: number of tokens with a lemma, summed sentiment (NaN skipped),
        #and the sentiment normalized by speech length
        df['sentiment'] = df['sentiment'].astype(float)
        totals = df.groupby('id')['lemma'].count().astype(float)
        sentiment = df.groupby('id')['sentiment'].sum()
        speech_scores = sentiment/totals

        #TODO: try weighting the negatives more

        #one row per speech, enriched with the speech-level metadata
        new_frame = pd.DataFrame({"Sentiment":sentiment, "Word_total":totals, 
                                  "Score":speech_scores}).reset_index()
        new_joined = new_frame.join(meta.set_index("ID"), on = 'id')
        new_joined.to_csv("ParlaMint-DK-SENT.csv", mode='a', header=False)


100%|█████████████████████████████████████████████| 39/39 [00:04<00:00,  8.83it/s]
100%|█████████████████████████████████████████████| 79/79 [00:08<00:00,  8.78it/s]
100%|███████████████████████████████████████████| 100/100 [00:13<00:00,  7.62it/s]
100%|███████████████████████████████████████████| 112/112 [00:15<00:00,  7.21it/s]
100%|███████████████████████████████████████████| 102/102 [00:12<00:00,  8.03it/s]
100%|███████████████████████████████████████████| 137/137 [00:18<00:00,  7.50it/s]
100%|███████████████████████████████████████████| 153/153 [00:17<00:00,  8.65it/s]
100%|███████████████████████████████████████████| 109/109 [00:14<00:00,  7.54it/s]
100%|███████████████████████████████████████████| 116/116 [00:17<00:00,  6.81it/s]
0it [00:00, ?it/s]


## Add Information

    #add Left-Right annotation?
    #add Government name? E.g. ThorningII, RasmussenII, RasmussenIII, FrederiksenI

In [32]:
#reload the aggregated speech table with every column kept as text
df_dk = pd.read_csv("ParlaMint-DK-SENT.csv", sep = ",", dtype = object)
#drop the last three characters of the date string to get a year-month key
df_dk["YearMonth"] = df_dk["Date"].str.slice(stop = -3)

### Government info based on dates

In [33]:
def date(datelist):
    """Turn a "year-month-day" string into a datetime.datetime at midnight.

    Only the first three hyphen-separated fields are used.
    """
    fields = datelist.split("-")
    return datetime.datetime(*(int(fields[k]) for k in range(3)))

In [34]:
#dates of all rows as a numpy array of strings, to be parsed with date() in the next cell
datearray = df_dk["Date"].to_numpy(dtype= str)
#bare expression: displays the array shape in the notebook
datearray.shape

(398610,)

In [36]:
#parse every date string into a datetime object
datearray = np.array(list(map(date, datearray)))

In [37]:
#dates taken from file
#each pair is the first and last day of a government; the intervals are used with
#inclusive bounds and therefore must not overlap
thstart, thend = datetime.datetime(2014, 2, 3), datetime.datetime(2015, 6, 27) #Thorning-Schmidt II government
#starts the day after the previous government ends (was 2015-05-28, which overlapped
#with Thorning-Schmidt II and relabelled a month of its rows)
ra2start, ra2end = datetime.datetime(2015, 6, 28), datetime.datetime(2016, 11, 27) #Rasmussen II government
ra3start, ra3end = datetime.datetime(2016, 11, 28), datetime.datetime(2019, 6, 5) #Rasmussen III government
frstart, frend = datetime.datetime(2019, 6, 7), datetime.datetime(2022, 6, 7) #Frederiksen I government

In [38]:
#row positions falling inside each government's term (start and end day both included)
thorning = np.where((thstart <= datearray) & (datearray <= thend))
rasmussenii = np.where((ra2start <= datearray) & (datearray <= ra2end))
rasmusseniii = np.where((ra3start <= datearray) & (datearray <= ra3end))
frederiksen = np.where((frstart <= datearray) & (datearray <= frend))

In [39]:
#one-column object array with one row per row of df_dk; filled with government names in the next cell
government = np.empty((len(df_dk), 1), dtype = object) #create array

In [40]:
#write the government name into every row whose date falls in that government's term
for idx, label in (
    (thorning, "Thorning-Schmidt II"),
    (rasmussenii, "Rasmussen II"),
    (rasmusseniii, "Rasmussen III"),
    (frederiksen, "Frederiksen I"),
):
    government[idx, 0] = label

In [41]:
#attach the government labels as a new column
df_dk["Government"] = government

### Political Orientation (Left/Right)

In [42]:
#party codes, as they appear in the Speaker_party column, assigned to each bloc
left = ["S", "RV", "SF", "EL", "ALT"]
right = ["V", "KF", "LA", "DF", "NB"]

In [43]:
#NOTE(review): partyarray is not referenced by the following cells, which build
#their masks from the DataFrame column directly
partyarray = df_dk["Speaker_party"].to_numpy()

In [44]:
#one-column object array with one row per row of df_dk; filled with "Left"/"Right" below
bloc = np.empty((len(df_dk), 1), dtype = object)

In [45]:
#boolean masks: True where the speaker's party code is in the left / right list
lindex = df_dk["Speaker_party"].isin(left)
rindex = df_dk["Speaker_party"].isin(right)

In [46]:
#rows matching neither list keep the array's initial value
bloc[lindex] = "Left"
bloc[rindex] = "Right"

In [47]:
#attach the bloc labels as a new column
df_dk["Bloc"] = bloc

### Save

In [2]:
#overwrite the aggregated file with the enriched table (YearMonth, Government, Bloc added);
#requires df_dk from the cells above to exist in the current session
df_dk.to_csv("ParlaMint-DK-SENT.csv", sep = ",", index = False)

NameError: name 'df_dk' is not defined

## Scoring all speeches - BG

In [113]:
#header-only file; the per-document rows are appended below without a header,
#so their column order has to match this list
big_bg = pd.DataFrame({"ID": [], "Sentiment":[], "Word_total":[], "Score":[], "Negscore":[], "Title":[], 
                       "Date":[],  "Body":[],
                       "Term":[],"Session":[], "Meeting":[], "Sitting":[],"Agenda":[], "Subcorpus": [], 
                       "Speaker_role": [],"Speaker_MP": [], "Speaker_Minister": [], "Speaker_party": [],  
                       "Speaker_party_name": [], "Party_status":[], "Speaker_name":[], "Speaker_gender":[], 
                       "Speaker_birth":[], "Topic_title":[], "Meeting_type":[], "Comission": []})
big_bg.to_csv("ParlaMint-BG-SENT.csv")

#NOTE(review): this rebinding is not used by the loop below
big_bg = pd.DataFrame({"sent":[], "word_total":[], "score":[]})

metaroot = "ParlaMint-BG.txt" #root folder of the per-year metadata files
sent_suffix = "ana.sent.csv"

for root, dirs, files in os.walk("ParlaMint-BG.TEI.CSV.SENT", topdown=False):
    year = os.path.split(root)[1]
    for name in tqdm(files):
        if not name.endswith(sent_suffix):
            continue
        sentfile = os.path.join(root, name)
        #cut off exactly the suffix (and the dot before it); str.rstrip treats its
        #argument as a set of characters and could also eat the end of the base name
        basename = name[:-len(sent_suffix)].rstrip(".")
        metafile = os.path.join(metaroot, year, basename + "-meta.csv")

        df = pd.read_csv(sentfile, dtype = object, index_col = 0)
        #remove punctuation from word count; copy so the column assignments below
        #work on an independent frame rather than a slice
        df = df[df["pos"] != "PUNCT"].copy()
        meta = pd.read_csv(metafile, delimiter = "\t", index_col = 0)

        #per-speech totals: number of tokens with a lemma, summed sentiment (NaN skipped),
        #and the sentiment normalized by speech length
        df['sentiment'] = df['sentiment'].astype(float)
        totals = df.groupby('id')['lemma'].count().astype(float)
        sentiment = df.groupby('id')['sentiment'].sum()
        speech_scores = sentiment/totals

        #weighting negative words more: negative token scores count double
        df["negscore"] = df["sentiment"] #copy
        negative_mask = df["negscore"] < 0
        df.loc[negative_mask, "negscore"] = df.loc[negative_mask, "negscore"]*2
        negscores = df.groupby("id")["negscore"].sum()

        #one row per speech, enriched with the speech-level metadata
        new_frame = pd.DataFrame({"Sentiment":sentiment, "Word_total":totals, 
                                  "Score":speech_scores, "Negscore":negscores}).reset_index()
        new_joined = new_frame.join(meta.set_index("ID"), on = 'id')
        new_joined.to_csv("ParlaMint-BG-SENT.csv", mode='a', header=False)


100%|███████████████████████████████████████████| 28/28 [00:10<00:00,  2.63it/s]
100%|███████████████████████████████████████████| 88/88 [00:12<00:00,  7.25it/s]
100%|█████████████████████████████████████████| 133/133 [00:14<00:00,  9.18it/s]
100%|█████████████████████████████████████████| 102/102 [00:09<00:00, 10.51it/s]
100%|█████████████████████████████████████████| 123/123 [00:10<00:00, 11.45it/s]
100%|███████████████████████████████████████████| 71/71 [00:07<00:00,  9.44it/s]
100%|█████████████████████████████████████████| 123/123 [00:11<00:00, 11.15it/s]
100%|█████████████████████████████████████████| 133/133 [00:13<00:00, 10.13it/s]
100%|█████████████████████████████████████████| 129/129 [00:13<00:00,  9.64it/s]
100%|██████████████████████████████████████████| 1/1 [00:00<00:00, 29330.80it/s]


## Add information

In [2]:
#reload the aggregated speech table with every column kept as text
df_bg = pd.read_csv("ParlaMint-BG-SENT.csv", sep = ",", dtype = object)
#drop the last three characters of the date string to get a year-month key
df_bg["YearMonth"] = df_bg["Date"].str.slice(stop = -3)

#### Correcting party annotation (cyrillic to latin)

In [43]:
#replace the Cyrillic party code with its Latin spelling
rows = df_bg.index[df_bg["Speaker_party"] == 'ГЕРБ']
df_bg.loc[rows, "Speaker_party"] = "GERB"

In [44]:
#replace the Cyrillic party code with the Latin code used for it in this table
rows = df_bg.index[df_bg["Speaker_party"] == 'ДПС']
df_bg.loc[rows, "Speaker_party"] = "MRF"

### Government info based on dates

In [45]:
def date(datelist):
    """Turn a "year-month-day" string into a datetime.datetime at midnight.

    Only the first three hyphen-separated fields are used.
    """
    fields = datelist.split("-")
    return datetime.datetime(*(int(fields[k]) for k in range(3)))

In [46]:
#dates of all rows as a numpy array of strings, to be parsed with date() in the next cell
datearray = df_bg["Date"].to_numpy(dtype= str)
#bare expression: displays the array shape in the notebook
datearray.shape

(210017,)

In [47]:
#parse every date string into a datetime object
datearray = np.array(list(map(date, datearray)))

In [48]:
#dates taken from file 
#each pair is (start, end) of a government; the next cell treats them as half-open
#intervals (start included, end excluded), so a shared boundary day goes to the later one
#NOTE(review): no interval covers 2021-04-15 up to 2021-05-12, so rows dated in that gap
#(and any row outside all intervals) get no Government/Caretaker value -- confirm this is intended
blstart, blend = datetime.datetime(2014, 8, 11), datetime.datetime(2014, 11, 7) #Bliznashki
bb2start, bb2end = datetime.datetime(2014, 11, 7), datetime.datetime(2017, 1, 27) #Borisov II
gestart, geend = datetime.datetime(2017, 1, 27), datetime.datetime(2017, 5, 4) #Gerdzhikov
bb3start, bb3end = datetime.datetime(2017, 5, 4), datetime.datetime(2021, 4, 15) #Borisov III
ya1start, ya1end = datetime.datetime(2021, 5, 12), datetime.datetime(2021, 9, 16) #Yanev I
ya2start, ya2end = datetime.datetime(2021, 9, 16), datetime.datetime(2021, 12, 13) #Yanev II
pestart, peend = datetime.datetime(2021, 12, 13), datetime.datetime(2022, 8, 2) #Petkov

In [49]:
#row positions falling inside each government's term (start day included, end day excluded)
bliznashki = np.where((blstart <= datearray) & (datearray < blend))
borisov2 = np.where((bb2start <= datearray) & (datearray < bb2end))
gerdzhikov = np.where((gestart <= datearray) & (datearray < geend))
borisov3 = np.where((bb3start <= datearray) & (datearray < bb3end))
yanev1 = np.where((ya1start <= datearray) & (datearray < ya1end))
yanev2 = np.where((ya2start <= datearray) & (datearray < ya2end))
petkov = np.where((pestart <= datearray) & (datearray < peend))

In [50]:
#one-column object arrays with one row per row of df_bg; filled in the next cell
government = np.empty((len(df_bg), 1), dtype = object) #create array
caretaker = np.empty((len(df_bg), 1), dtype = object) #True/False flag per row

In [51]:
#for every government: its row positions, its name, and whether it is flagged as caretaker
for idx, label, is_caretaker in (
    (bliznashki, "Bliznashki", True),
    (borisov2, "Borisov II", False),
    (gerdzhikov, "Gerdzhikov", True),
    (borisov3, "Borisov III", False),
    (yanev1, "Yanev I", True),
    (yanev2, "Yanev II", True),
    (petkov, "Petkov", False),
):
    government[idx, 0] = label
    caretaker[idx, 0] = is_caretaker

In [52]:
#attach the government name and the caretaker flag as new columns
df_bg["Government"] = government
df_bg["Caretaker"] = caretaker

### Political Orientation (Left/Right)

In [53]:
#rows carrying the combined code "DB;WCC" are counted under "WCC"
row = df_bg.index[df_bg["Speaker_party"] == "DB;WCC"]
df_bg.loc[row, "Speaker_party"] = "WCC"

In [54]:
#party codes, as they appear in the Speaker_party column, assigned to each bloc
left = ["BSPLB", "ABV", "WCC", "BSPFB", "RUBGWC", "RUTO", ]
right = ["RB", "PF", "BDC-NU", "AP", "GERB", "MRF", "TISP", "GERB-UDF", "RP", "DB", "UP", "VOLYA"]

In [55]:
#NOTE(review): partyarray is not referenced by the following cells, which build
#their masks from the DataFrame column directly
partyarray = df_bg["Speaker_party"].to_numpy()

In [56]:
#one-column object array with one row per row of df_bg; filled with "Left"/"Right" below
bloc = np.empty((len(df_bg), 1), dtype = object)

In [57]:
#boolean masks: True where the speaker's party code is in the left / right list
lindex = df_bg["Speaker_party"].isin(left)
rindex = df_bg["Speaker_party"].isin(right)

In [58]:
#rows matching neither list keep the array's initial value
bloc[lindex] = "Left"
bloc[rindex] = "Right"

In [59]:
#attach the bloc labels as a new column
df_bg["Bloc"] = bloc

### Save to file

In [60]:
#overwrite the aggregated file with the enriched table
#(YearMonth, Government, Caretaker, Bloc added; party codes normalized)
df_bg.to_csv("ParlaMint-BG-SENT.csv", sep = ",", index = False)