<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:10px 5px'> 
Master Thesis Yannik Haller - Sentiment Analysis NAIVE Classifier
</h1>
</div>

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
1. Load required packages and the data
</h2>
</div>

In [1]:
# Import required baseline packages
import re
import os
import glob
import time
import sys
import pandas as pd
import numpy as np
from pprint import pprint

# Change pandas' setting to print out long strings
pd.options.display.max_colwidth = 200

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Spacy (for lemmatization)
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
%matplotlib inline

# TextBlob (for Sentiment Analysis)
from textblob import Blobber
from textblob_de import PatternTagger, PatternAnalyzer

# Enable logging for gensim (optional)
import logging
logging.basicConfig(format = '%(asctime)s : %(levelname)s : %(message)s', level = logging.ERROR)

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

  def _figure_formats_changed(self, name, old, new):


In [2]:
# Set the appropriate working directory
os.chdir('D:\\Dropbox\\MA_data')

In [3]:
# Define a function to read in the fully preprocessed data
def read_preprocessed(language, tokenize = True):
    # Raise an error if an inadmissible language is chosen
    allowed_languages = ['de', 'en', 'fr', 'it']
    if language not in allowed_languages:
        raise ValueError("Invalid language. Expected one of: %s" % allowed_languages)
    
    # Set the appropriate working directory
    os.chdir('D:\\Dropbox\\MA_data')

    # Define the name of the file to load
    filename = "Preprocessed/Sentiment_Analysis/"+language+"_preprocessed_senti.csv"

    # Read in the dataframe containing the text data
    tx_pp = pd.read_csv(filename, index_col = 0, dtype = {'tx': object})

    # Get the articles' index together with an enumeration to identify their position in the list of precleaned articles
    idx = tx_pp.index
    idx = pd.DataFrame(idx, columns = [language+'_idx'])

    # Reduce the dataframe to a list containing the text data
    tx_pp = tx_pp.tx.to_list()

    # Tokenize the data again if tokenize = True (RAM-saving)
    if tokenize:
        tx_pp = retokenize(tx_pp)

    # Return the preprocessed data
    return tx_pp, idx

# Define a function to retokenize the preprocessed text data (RAM-saving)
def retokenize(article_list):
    for i in range(len(article_list)):
        temp_tx = str(article_list[i]).split()
        article_list[i] = temp_tx
    return article_list

In [4]:
# Read in the preprocessed data (tokenized)
it_tx, it_idx = read_preprocessed('it', tokenize = True)

# Take a look at the size of the precleaned data
sys.getsizeof(it_tx)

189032

In [5]:
# Take a look at the preprocessed data
it_tx[0]

['fermati',
 'obbligare',
 'oppresso',
 'mondare',
 'ginocchio',
 'forse',
 'applaudivamo',
 'stadio',
 'ora',
 'acclamare',
 'balcone',
 'ricco',
 'pallone',
 'splendido',
 'medico',
 'togliere',
 'stringere',
 'mano',
 'cosa',
 'molto',
 'baciare',
 'contatto',
 'solere',
 'sfiorarsi',
 'soffiare',
 'vento',
 'soli',
 'solcare',
 'seme',
 'chiederci',
 'perché',
 'convinti',
 'vita',
 'non',
 'perdere',
 'neanche',
 'istante',
 'lunare',
 'pieno',
 'bellezza',
 'verità',
 'tornire',
 'compiere',
 'cantare',
 'non',
 'camminare',
 'mai',
 'solo',
 'dubitiamo',
 'esperto',
 'nullo',
 'lasciamo',
 'pipistrello',
 'maestro',
 'biodiversità',
 'colpa',
 'usciamo',
 'inferno',
 'togliere',
 'respirare',
 'reagiamo',
 'torrente',
 'vita',
 'dissetiamoci',
 'fonte',
 'naturale',
 'sempre',
 'convinto',
 'amore',
 'salvare',
 'unico',
 'rispondere',
 'vincere',
 'felicità',
 'serenità',
 'tornare',
 'sicuri',
 'dio',
 'sconfiggere',
 'anche',
 'coronavirus',
 'rodolfo',
 'fasani',
 'mesocco']

In [6]:
# Take a look at the dataframe containing the according index
it_idx.tail(3)

Unnamed: 0,it_idx
23618,2425113
23619,2425114
23620,2425115


In [7]:
# Retrieve the location of the article in the preprocessed data using the according article id
article_ids = [2425114, 2425115]
location = it_idx[it_idx.it_idx.isin(article_ids)].index.tolist() #23619

# Access the preprocessed text from the articles with the article ids in [2425114, 2425115]
#list(it_tx[i] for i in location)

# Look at the according location of the articles with the article ids in [2425114, 2425115]
location

[23619, 23620]

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
2. Sentiment assessment of the articles
</h2>
</div>

In [8]:
# Read in the sentiment lexicon Sentix (i.e. an Italian lexicon for sentiment analysis) as a dataframe
senti_lex_df = pd.read_csv("Sentiment/Naive/Italian/sentix.txt", sep = '\t', header = None, names = ['lemma','POS','ID','pos_score','neg_score','polarity','intensity'])
# Lowercase the entries in the column 'lemma'
senti_lex_df['lemma'] = senti_lex_df['lemma'].str.lower()
# Relabel the POS Tags to a common standard
senti_lex_df['POS'].replace({'a': 'ADJ', 'n': 'NOUN', 'v': 'VERB', 'r': 'ADV'}, inplace = True)
# Remove exact duplicates
n_duplicates = sum(senti_lex_df.duplicated())
senti_lex_df.drop_duplicates(keep = 'first', inplace = True, ignore_index = True)
print(n_duplicates, "exactly duplicated entries have been removed.")

17647 exactly duplicated entries have been removed.


In [9]:
# Calculate the average polarity of all duplicated words that are assigned with the same POS
senti_lex_df = senti_lex_df.groupby(['POS','lemma'])['polarity'].mean().reset_index()
# Sort the dataframe according to the alphabetical order of the POS-tags, such that first all ADJs appear, which then are followed by ADVs, NOUNs and then VERBs
senti_lex_df.sort_values(['POS','lemma'], inplace = True)
# Remove duplicates, while the first appearing lemma is kept (hence, in case of duplicated lemmas that have different POS-tags, first adjectives are kept, then adverbs, then nouns and then verbs)
n_duplicates = sum(senti_lex_df.duplicated(subset = ['lemma'], keep = 'first'))
print(n_duplicates, "duplicated lemmas with differing POS-tags have been removed.")
senti_lex_df.drop_duplicates(subset = ['lemma'], keep = 'first', inplace = True, ignore_index = True)

675 duplicated lemmas with differing POS-tags have been removed.


In [10]:
# Take a look at the dataframe containing the sentiment lexicon
senti_lex_df

Unnamed: 0,POS,lemma,polarity
0,ADJ,1,-1.000000
1,ADJ,a_bassa_quota,-1.000000
2,ADJ,a_bassa_risoluzione,1.000000
3,ADJ,a_basso_contenuto_tecnologico,1.000000
4,ADJ,a_basso_livello,-1.000000
...,...,...,...
41794,VERB,voler_bene,0.688083
41795,VERB,volerci,1.000000
41796,VERB,vulnerare,-1.000000
41797,VERB,zoomare,1.000000


In [11]:
# Create a dictionary out of the sentiment lexicon
senti_lex_dict = {}
for index, row in senti_lex_df.iterrows():
    senti_lex_dict[row['lemma']] = {'POS': str(row['POS']), 'polarity': float(row['polarity'])}

In [12]:
# Remove unnecessary variables to save RAM
del senti_lex_df

In [13]:
# Define the set of possible negations
negations = ['no', 'non', 'niente', 'nessuno']

In [14]:
# Empirically derived mean sentiment intensity rating increase for booster words (adapted from the VADER module)
# Note: The values have been devided by 4, because we are working with polarities directly (which range from -1 to 1) instead of the unscaled crowd ratings (which range from -4 to 4)
B_INCR = 0.293/4
B_DECR = -0.293/4

# Define the dictionary of booster words
booster_dic = \
    {"assolutamente": B_INCR, "assoluto": B_INCR, "assoluta": B_INCR, "totalmente": B_INCR, "totale": B_INCR, #"absolutely": B_INCR,
     "sorprendente": B_INCR, "mirabolante": B_INCR, "stupefacente": B_INCR, "straordinario": B_INCR, "straordinaria": B_INCR, "strabiliante": B_INCR, #"amazingly": B_INCR,
     "enorme": B_INCR, "esorbitante": B_INCR, "immenso": B_INCR, "immensa": B_INCR, "colossale": B_INCR, #"awfully": B_INCR,
     "completo": B_INCR, "completa": B_INCR, "intero": B_INCR, "intera": B_INCR,    #"completely": B_INCR,
     "considerevole": B_INCR, "ingente": B_INCR, "notevole": B_INCR, "ragguardevole": B_INCR, "rilevante": B_INCR, "apprezzabile": B_INCR, "cospicuo": B_INCR, "cospicua": B_INCR, #"considerably": B_INCR,
     "inequivocabile": B_INCR, "univoco": B_INCR, "univoca": B_INCR, "netto": B_INCR, "netta": B_INCR, "indubitato": B_INCR, "indubitata": B_INCR, #"decidedly": B_INCR,
     "fundamentale": B_INCR, #"deeply": B_INCR,
     "dannato": B_INCR, "dannata": B_INCR, #"effing": B_INCR,
     "oltremodo": B_INCR, "oltremisura": B_INCR, "sommamente": B_INCR, "squisitamente": B_INCR, "straordinariamente": B_INCR, #"enormously": B_INCR,
     #"entirely": B_INCR,
     "particolare": B_INCR, "particolarmente": B_INCR, "speciale": B_INCR, "specialmente": B_INCR, # "especially": B_INCR,
     "insolito": B_INCR, "insolita": B_INCR, "eccezionalmente": B_INCR,  #"exceptionally": B_INCR,
     "estremamente": B_INCR, "estremo": B_INCR, "estrema": B_INCR, #"extremely": B_INCR,
     "favoloso": B_INCR, "favolosa": B_INCR, "fantastico": B_INCR, #"fabulously": B_INCR,
     #"flipping": B_INCR,
     #"flippin": B_INCR,
     #"fricking": B_INCR,
     #"frickin": B_INCR,
     #"frigging": B_INCR,
     #"friggin": B_INCR,
     #"fully": B_INCR,
     #"fucking": B_INCR,
     "molto": B_INCR, "intensamente": B_INCR, "parecchio": B_INCR, "tanto": B_INCR, "massiccio": B_INCR, "massiccia": B_INCR,  #"greatly": B_INCR,
     #"hella": B_INCR,
     "supremo": B_INCR, "suprema": B_INCR, #"highly": B_INCR,
     "immensamente": B_INCR, "immenso": B_INCR, "immensa": B_INCR, #"hugely": B_INCR,
     "incredibile": B_INCR, #"incredibly": B_INCR,
     "intensamente": B_INCR, #"intensely": B_INCR,
     "principalmente": B_INCR, #"majorly": B_INCR,
     "più": B_INCR, #"more": B_INCR,
     "maggior": B_INCR, #"most": B_INCR,
     "particolarmente": B_INCR, "soprattutto": B_INCR, #"particularly": B_INCR,
     "puramente": B_INCR, "esclusivamente": B_INCR, #"purely": B_INCR,
     "abbastanza": B_INCR, "piuttosto": B_INCR, "alquanto": B_INCR, #"quite": B_INCR,
     "davvero": B_INCR, "veramente": B_INCR, #"really": B_INCR,
     "notevolmente": B_INCR, #"remarkably": B_INCR,
     "essenziale": B_INCR, "considerabilmente": B_INCR, #"substantially": B_INCR,
     "accuratamente": B_INCR, "completamente": B_INCR, #"thoroughly": B_INCR,
     #"totally": B_INCR,
     "tremendamente": B_INCR, "enormemente": B_INCR, #"tremendously": B_INCR,
     #"uber": B_INCR,
     "incredibilmente": B_INCR, #"unbelievably": B_INCR,
     "insolitamente": B_INCR, "inusualmente": B_INCR, #"unusually": B_INCR,
     #"utterly": B_INCR,
     #"very": B_INCR,
     #####
     "quasi": B_DECR, "pressoché": B_DECR, #"almost": B_DECR,
     "appena": B_INCR, "malapena": B_INCR, #"barely": B_DECR,
     "stento": B_DECR, #"hardly": B_DECR,
     "abbastanza": B_INCR, #"just enough": B_DECR,
     "alquanto": B_DECR, #"kind of": B_DECR,
     "tipo": B_INCR, #"kinda": B_DECR,
     #"kindof": B_DECR,
     #"kind-of": B_DECR,
     "meno": B_INCR, #"less": B_DECR,
     "piccolo": B_INCR, #"little": B_DECR,
     "esiguo": B_DECR, "esigua": B_DECR, "futile": B_DECR, "insignificante": B_DECR, "marginale": B_DECR,  #"marginally": B_DECR,
     "occasionale": B_DECR, "saltuario": B_DECR, "saltuaria": B_DECR, #"occasionally": B_DECR,
     "parziale": B_DECR, #"partly": B_DECR,
     "scarso": B_DECR, "scarsa": B_DECR, "rado": B_DECR, "rada": B_DECR, "scarsamente": B_DECR, "magro": B_DECR, "magra": B_DECR, #"scarcely": B_DECR,
     "poco": B_DECR, "briciolo": B_DECR, "pizzico": B_DECR, #"slightly": B_DECR,
     "piuttosto": B_DECR #"somewhat": B_DECR,
     #"sort of": B_DECR,
     #"sorta": B_DECR,
     #"sortof": B_DECR,
     #"sort-of": B_DECR
     }

In [15]:
# Set up the Sentiment Classifier class
class NaiveSentimentClassifierIT:
    def __init__(self, senti_lex_dict, negations, booster_dic):
        # Note:
        ## senti_lex_dict has to be a dictionary with entries of the following form: {token: {'POS': token_POStag, 'polarity': token_polarity_score}}
        ## negations has to be a list of negation words
        ## booster_dic has to be a dictionary with entries of the following form: {token: additive_polarity_impact_on_target_token}
        # Store the inputs within the corresponding attribute of self
        self.senti_lex_dict = senti_lex_dict
        self.negations      = negations
        self.booster_dic    = booster_dic
    
    # Set up a function that evaluates the polarity of the articles
    def evaluate(self, tx, idx, name_output_file = 'it_naive_polarity'):
        # Note: 
        ## tx has to be a list of tokenized articles (i.e. a list, whose elements are itself lists containing the tokenized and precleaned articles)
        ## --> (precleaned means lemmatized and filtered, such that only negations, nouns, verbs, adverbs and adjectives are contained)
        ## idx has to be a list containing the ordered article indexes corresponding to the articles in tx

        # Keep track of the processing time
        t = time.time()

        # Create an empty list to store the resulting document polarity scores
        article_polarity = []
        # Set up a loop to go through all articles
        for article in tx:
            # Apply the above defined functions to get the token's polarity scores, score adjustments (through booster words) and score multipliers (through negations)
            self.get_token_postag(article)
            self.get_token_polarity(article)
            self.get_score_adjustment(article)
            self.get_score_multiplier(article)
            self.get_article_polarity()
            # Apply the above defined function to calculate the final polarity of the article and append it to the variable article_polarity
            article_polarity.append(self.document_polarity)
        # Store the article polarities in self
        self.article_polarity = article_polarity

        # Print out the processing time
        print("Processing time to evaluate the article sentiments:", str((time.time() - t)/60), "minutes")

        # Create a correctly indexed dataframe containing the article sentiments
        Naive_tx_polarity = pd.DataFrame(article_polarity, index = idx, columns = ['Naive_polarity'])
        # Save the results to a csv file
        Naive_tx_polarity.to_csv("Sentiment/Naive/"+name_output_file+".csv", index = True)
        # Return the results
        return Naive_tx_polarity

    ## Define all functions needed within the Sentiment Classifier class

    # Define a function to get the POS-tag for each token in an article (given that the token is contained in the sentiment lexicon)
    def get_token_postag(self, article):
        # Create a list to store the results
        token_pos = []
        # Set up a loop to go through all tokens of the article
        for token in article:
            if token in self.senti_lex_dict:
                token_pos.append(self.senti_lex_dict[token]['POS'])
            else:
                token_pos.append('UNKNOWN')
        # Store the resulting list of the token POS-tags in self
        self.token_pos = token_pos

    # Define a function to get the polarity score for each token in an article (given that the token is contained in the sentiment lexicon)
    def get_token_polarity(self, article):
        # Create a list to store the results
        token_polarity = []
        # Set up a loop to go through all tokens of the article
        for token in article:
            if token in self.senti_lex_dict:
                token_polarity.append(self.senti_lex_dict[token]['polarity'])
            else:
                token_polarity.append(0)
        # Store the resulting list of the token polarities in self
        self.token_polarity = token_polarity

    # Define a function to get the score multiplier (caused by negation words) for each token in an article
    def get_score_multiplier(self, article):
        # Define the negation scalar (adapted from VADER)
        neg_scalar = -0.74
        # Define a list of ones of the same length as the number of tokens in the article
        score_multiplier = np.repeat(1, len(article)).tolist()
        # Set up a loop to go through all tokens of the article
        for i in np.arange(1, len(article)-1):
            # Check whether the word is a negation word and assign a neg_scalar to the multiplier at the position of the subsequnet token and a 0.5*neg_scalar at the position of the second token after the negation
            if article[i] in self.negations:
                score_multiplier[i+1] = neg_scalar
                if i < (len(article)-2):
                   score_multiplier[i+2] = 0.5*neg_scalar
        # Store the score_multiplier variable in self
        self.score_multiplier = score_multiplier

    # Define a function to get the score adjustment (caused by intensifier words) for each token in an article
    def get_score_adjustment(self, article):
        # Define a list of zeros of the same length as the number of tokens in the article
        score_adjustment = np.repeat(0, len(article)).tolist()
        # Set up a loop to go through all tokens of the article
        for i in np.arange(1, len(article)-1):
            # If the previous word was a verb and the current word is a booster, then assign an adjustment to the verb
            if article[i] in self.booster_dic and self.token_pos[i-1] == 'VERB':
                score_adjustment[i-1] = score_adjustment[i-1] + self.booster_dic[article[i]]
            # Else, it is assumed that the booster affects the subsequent token
            elif article[i] in self.booster_dic and not self.token_pos[i-1] == 'VERB':
                score_adjustment[i+1] = score_adjustment[i+1] + self.booster_dic[article[i]]
        # Store the score_adjustment variable in self
        self.score_adjustment = score_adjustment

    # Define a function to calculate the final polarity of an article
    def get_article_polarity(self):
        # Create an empty list to store the final token polarity (note: from now on only tokens with a polarity != 0 are kept)
        final_token_polarity = []
        # Set up a loop to calculate each token's final polarity score
        for i in range(len(self.token_polarity)):
            if self.token_polarity[i] > 0:
                final_token_polarity.append((self.token_polarity[i] + self.score_adjustment[i])*self.score_multiplier[i])
            if self.token_polarity[i] < 0:
                final_token_polarity.append((self.token_polarity[i] - self.score_adjustment[i])*self.score_multiplier[i])
        # Calculate the article polarity, which is just the average final token polarity among all tokens that were kept (i.e. the tokens that exhibit a non-zero final polarity score)
        # If the list final_token_polarity is empty, then just assign a polarity of 0
        if len(final_token_polarity) == 0:
            document_polarity = 0
        else:
            document_polarity = np.mean(final_token_polarity)
        # Ensure that the resulting polarity is still in the range between -1 and 1
        if document_polarity > 1: document_polarity = 1
        if document_polarity < -1: document_polarity = -1
        # Store the resulting document polarity in self
        self.document_polarity = document_polarity

In [16]:
# Set up a NaiveSentimentClassifierIT object
NSC_it = NaiveSentimentClassifierIT(senti_lex_dict, negations, booster_dic)
# Evaluate the sentiment of the Italian articles
Naive_tx_polarity = NSC_it.evaluate(it_tx, it_idx.it_idx.values.tolist())

Processing time to evaluate the article sentiments: 0.07300450007120768 minutes


In [17]:
# Take a look at the results
Naive_tx_polarity

Unnamed: 0,Naive_polarity
313578,0.254330
460527,0.066695
460528,0.027784
460529,0.375276
460530,0.077931
...,...
2425111,0.325523
2425112,0.290054
2425113,0.139710
2425114,-0.018780


In [18]:
# Take a look at some summary statistics
share_pos = np.round(np.sum(Naive_tx_polarity['Naive_polarity'] > 0) / len(Naive_tx_polarity),2)
share_neg = np.round(np.sum(Naive_tx_polarity['Naive_polarity'] < 0) / len(Naive_tx_polarity),2)
print('The share of articles with a positive sentiment is', 100*share_pos,'%')
print('The share of articles with a negative sentiment is', 100*share_neg,'%')
np.round(Naive_tx_polarity.describe(), 3)

The share of articles with a positive sentiment is 88.0 %
The share of articles with a negative sentiment is 12.0 %


Unnamed: 0,Naive_polarity
count,23621.0
mean,0.209
std,0.198
min,-0.867
25%,0.093
50%,0.21
75%,0.325
max,1.0


In [19]:
# Read the results back in
Naive_tx_polarity = pd.read_csv("Sentiment/Naive/it_naive_polarity.csv", index_col = 0, dtype = {'Naive_polarity': float})

In [20]:
# Take a look at the read in results
Naive_tx_polarity

Unnamed: 0,Naive_polarity
313578,0.254330
460527,0.066695
460528,0.027784
460529,0.375276
460530,0.077931
...,...
2425111,0.325523
2425112,0.290054
2425113,0.139710
2425114,-0.018780


<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
3. Extend the underlying sentiment lexicon with common covid-related words
</h2>
</div>

According to the naïve polarity scores estimated in the previous section most covid related newspaper articles appear to convey a slightly positive sentiment. However, this observation is against our expectations and we therefore try to figure out what might be driving these sentiment classification outcomes to be predominantly positive in the following. Since the self developped naïve sentiment classifier relies on a lexicon based approach, inspecting the underlying sentiment lexicon (i.e. the sentix lexicon) seems a good starting point thereunto. As such, we check whether the following phrases (which - partly intuitively and partly according to the main LDA topic models - are strongly indicative for an article to be covid-related) are captured by the original sentix lexicon:

- 'covid' ($\rightarrow$ 'covid' in Italian)
- 'corona' ($\rightarrow$ 'covid'/'coronavirus' in Italian)
- 'coronavirus' ($\rightarrow$ 'coronavirus' in Italian)
- 'virus' ($\rightarrow$ 'virus' in Italian)
- 'infection' ($\rightarrow$ 'infezione'/'contagio'/'contaminazione' in Italian)
- 'infect' ($\rightarrow$ 'infettare'/'contagiare' in Italian)
- 'infected ($\rightarrow$ 'infettato' in Italian)
- 'infectious' ($\rightarrow$ 'infettivo'/'contagioso'/'virulento' in Italian)
- 'pandemic' ($\rightarrow$ 'pandemia' in Italian)
- 'epidemic' ($\rightarrow$ 'epidemia' in Italian)
- 'lockdown' ($\rightarrow$ 'lockdown'/'coprifuoco' in Italian)
- 'crisis' ($\rightarrow$ 'crisi' in Italian)
- 'quarantine' ($\rightarrow$ 'quarantena'/'contumacia' in Italian)
- 'hospitalisation' ($\rightarrow$ 'ricovero' in Italian)
- 'disease' ($\rightarrow$ 'malattia'/'infermità'/'morbo' in Italian)

Searching these words in the original sentix lexicon reveals that most of them are not contained, meaning that most words in this list are not assigned an appropriate polarity score and therefore do not influence the overall polarity of the articles in which they appear. Since for all words in the above list it can reasonably be argued that they rather should convey a negative feeling, omitting them results in positively biased polarity scores for articles that contain them. Thus, we chase the following strategy to account for this issue: for each word listed above, we check whether the Italian equivalent or close synonyms of it are contained in the original sentix lexicon in a first step. Thus, the following cases may occur:

 - Case 1 - The word itself as well as close synonyms of it are contained: in this case we check whether the polarity score assigned to the focal word is negative and, if yes, leave the entry unchanged. Otherwise, we proceed as described in case 3.

 - Case 2 - The word itself is contained, but no close synonyms of it: in this case we check whether the polarity score assigned to the focal word is negative and, if yes, leave the entry unchanged. Otherwise, we proceed as described in case 4.

 - Case 3 - A close synonym of the word is contained, but the word itself is not: in this case we check whether the polarity score assigned to the synonym is negative and, if yes, assign the same polarity score to the focal word. Otherwise, if the synonym's polarity score is positive, we proceed as described in case 4.

 - Case 4 - Neither the word itself nor close synonyms of it are contained: in this case we start by checking whether the word's primary word (or a synonym's primary word) is contained in the lexicon and, if yes, assign the same polarity score to the focal word. Otherwise, if the primary word is not contained, we continue by checking whether the German/French equivalent of the word is contained in the German/French Vader sentiment lexicon. If yes, we transform the valence score observed there into a polarity score (by dividing it by 4) and assign this score to the focal Italian word. If this approach also fails, we add the word and its synonyms to the lexicon and assign a polarity score of -1 (note: adding grammatical cases (e.g. dative, genitive) or conjugations theoretically is not necessary here, because the Italian articles are lemmatized before they are passed to the sentiment algorithm. However, since the Italian spacy implemenation used for lemmatizing does not seem to always work properly, we decide to add at least the male and female forms of adjectives).

According to this strategy, we apply the following editings to the sentix lexicon:

- 'covid' ($\rightarrow$ 'covid' in Italian): not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'covid')
- 'corona' ($\rightarrow$ 'covid'/'coronavirus' in Italian): 'covid' not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'covid') / 'coronavirus' not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'coronavirus')
- 'coronavirus' ($\rightarrow$ 'coronavirus' in Italian): not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'coronavirus')
- 'virus' ($\rightarrow$ 'virus' in Italian): already contained, but multiple entries observed with a polarity scores of either -1 or 1 $\rightarrow$ removed all entries except one which exhibits a polarity score assignment of -1 (note: before this adjustment the polarity assigned to the word 'virus' was 0)
- 'infection' ($\rightarrow$ 'infezione'/'contagio'/'contaminazione' in Italian): 'contagio' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -0.5 as suggested by its French equivalent 'contagion' / 'infezione' already contained with a polarity score of 0 $\rightarrow$ re-assigned with a polarity score of -0.5 since it is a close synonym of 'contagio' / 'contaminazione' not contained $\rightarrow$ assigned with a polarity score of -0.5 since it is a close synonym of 'contagio' (added cases: 'contaminazione')
- 'infect' ($\rightarrow$ 'infettare'/'contagiare' in Italian): 'infettare' already contained with a polarity score of 0.41 $\rightarrow$ re-assigned with a polarity score of -0.5 since 'infezione' is its primary word / 'contagiare' already contained with a polarity score of 0.41 $\rightarrow$ re-assigned with a polarity score of -0.5 since 'contagio' is its primary word
- 'infected ($\rightarrow$ 'infettato' in Italian): not contained $\rightarrow$ assigned with a polarity score of -0.5 since 'infezione' is its primary word (added cases: 'infettato', 'infettata')
- 'infectious' ($\rightarrow$ 'infettivo'/'contagioso'/'virulento' in Italian): 'infettivo' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -0.5 since 'infezione' is its primary word (note: added female case 'infettiva') / 'contagioso' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -0.5 since 'contagio' is its primary word (note: added female case 'contagiosa')/ 'virulento' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -0.5 since it is a close synonym of 'contagioso' (note: added female case 'virulenta')
- 'pandemic' ($\rightarrow$ 'pandemia' in Italian): not contained $\rightarrow$ assigned with a polarity score of -0.675 as suggested by its German equivalent 'Pandemie' (added cases: 'pandemia')
- 'epidemic' ($\rightarrow$ 'epidemia' in Italian): not contained $\rightarrow$ assigned with a polarity score of -0.675 as suggested by its German equivalent 'Epidemie' (added cases: 'epidemia')
- 'lockdown' ($\rightarrow$ 'lockdown'/'coprifuoco' in Italian): 'lockdown' not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'lockdown') / 'coprifuoco' not contained $\rightarrow$ assigned with a polarity score of -1 (added cases: 'coprifuoco')
- 'crisis' ($\rightarrow$ 'crisi' in Italian): already contained, but multiple entries observed with a polarity scores of either -1 (2 distinct entries) or -0.25 (1 entry repeated three times) $\rightarrow$ removed all entries except one and re-assigned it with a polarity score of -0.75, which is the average polarity score of the 3 distinct original entries (note: this coincides with the polarity score used by the algorithm before this adjustment was made)
- 'quarantine' ($\rightarrow$ 'quarantena'/'contumacia' in Italian): 'quarantena' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -1 / 'contumacia' not contained $\rightarrow$ assigned with a polarity score of -1 since it is a close synonym of 'quarantena' (added cases: 'contumacia')
- 'hospitalisation' ($\rightarrow$ 'ricovero' in Italian): already contained, but multiple entries observed with a polarity scores of either -1 (1 entry) or 0 (1 entry) $\rightarrow$ removed all entries except one and re-assigned it with a polarity score of -0.5, which is the average polarity score of the 2 distinct original entries (note: this coincides with the polarity score used by the algorithm before this adjustment was made)
- 'disease' ($\rightarrow$ 'malattia'/'infermità'/'morbo' in Italian): 'malattia' already contained, but multiple entries observed with a polarity scores of -1 (4 distinct entries), -0.41 (1 entry repeated 3 times), 0.25 (1 entry repeated 4 times) or 1 (2 distinct entries) $\rightarrow$ removed all entries except one and re-assigned it with a polarity score of -0.27, which is the average polarity score of the 2 distinct original entries (note: this coincides with the polarity score used by the algorithm before this adjustment was made) / 'infermità' already contained with a polarity score of 1 $\rightarrow$ re-assigned with a polarity score of -0.27 since it is a close synonym of 'malattia' / 'morbo' already contained, but multiple entries observed with a polarity scores of either 0.25 (1 entry) or 1 (1 entry) $\rightarrow$ removed all entries except one and re-assigned it with a polarity score of -0.27 since it is a close synonym of 'malattia'

After adjusting the sentiment lexicon as desired we use it to rerun the sentiment analysis. To do so, we have to run the subsequent codes (assuming that at least the first 14 code chunks of this Jupyter notebook have been executed beforehand).

In [21]:
# Read in the manually extended Sentix sentiment lexicon (i.e. an Italian lexicon for sentiment analysis) as a dataframe
senti_lex_df = pd.read_csv("Sentiment/Naive/Italian/sentix_extended.txt", sep = '\t', header = None, names = ['lemma','POS','ID','pos_score','neg_score','polarity','intensity'])
# Lowercase the entries in the column 'lemma'
senti_lex_df['lemma'] = senti_lex_df['lemma'].str.lower()
# Relabel the POS Tags to a common standard
senti_lex_df['POS'].replace({'a': 'ADJ', 'n': 'NOUN', 'v': 'VERB', 'r': 'ADV'}, inplace = True)
# Remove exact duplicates
n_duplicates = sum(senti_lex_df.duplicated())
senti_lex_df.drop_duplicates(keep = 'first', inplace = True, ignore_index = True)
print(n_duplicates, "exactly duplicated entries have been removed.")

17631 exactly duplicated entries have been removed.


In [22]:
# Calculate the average polarity of all duplicated words that are assigned with the same POS
senti_lex_df = senti_lex_df.groupby(['POS','lemma'])['polarity'].mean().reset_index()
# Sort the dataframe according to the alphabetical order of the POS-tags, such that first all ADJs appear, which then are followed by ADVs, NOUNs and then VERBs
senti_lex_df.sort_values(['POS','lemma'], inplace = True)
# Remove duplicates, while the first appearing lemma is kept (hence, in case of duplicated lemmas that have different POS-tags, first adjectives are kept, then adverbs, then nouns and then verbs)
n_duplicates = sum(senti_lex_df.duplicated(subset = ['lemma'], keep = 'first'))
print(n_duplicates, "duplicated lemmas with differing POS-tags have been removed.")
senti_lex_df.drop_duplicates(subset = ['lemma'], keep = 'first', inplace = True, ignore_index = True)

673 duplicated lemmas with differing POS-tags have been removed.


In [23]:
# Take a look at the dataframe containing the sentiment lexicon
senti_lex_df

Unnamed: 0,POS,lemma,polarity
0,ADJ,1,-1.000000
1,ADJ,a_bassa_quota,-1.000000
2,ADJ,a_bassa_risoluzione,1.000000
3,ADJ,a_basso_contenuto_tecnologico,1.000000
4,ADJ,a_basso_livello,-1.000000
...,...,...,...
41806,VERB,voler_bene,0.688083
41807,VERB,volerci,1.000000
41808,VERB,vulnerare,-1.000000
41809,VERB,zoomare,1.000000


In [24]:
# Create a dictionary out of the sentiment lexicon
senti_lex_dict = {}
for index, row in senti_lex_df.iterrows():
    senti_lex_dict[row['lemma']] = {'POS': str(row['POS']), 'polarity': float(row['polarity'])}

In [25]:
# Check whether the newly added words are indeed contained in the lexicon
added_words = ['coronavirus', 'covid', 'virus', 'infezione', 'contagio', 'contaminazione', 'infettare', 'contagiare', 'infettato', 'infettivo', 'contagioso', 
               'virulento', 'pandemia', 'epidemia', 'lockdown', 'coprifuoco', 'crisi', 'quarantena', 'contumacia', 'ricovero', 'malattia', 'infermità', 'morbo']
for word in added_words:
    print(word, ": ", senti_lex_dict[word], sep = '')


coronavirus: {'POS': 'NOUN', 'polarity': -1.0}
covid: {'POS': 'NOUN', 'polarity': -1.0}
virus: {'POS': 'NOUN', 'polarity': -1.0}
infezione: {'POS': 'NOUN', 'polarity': -0.5}
contagio: {'POS': 'NOUN', 'polarity': -0.5}
contaminazione: {'POS': 'NOUN', 'polarity': -0.5}
infettare: {'POS': 'VERB', 'polarity': -0.5}
contagiare: {'POS': 'VERB', 'polarity': -0.5}
infettato: {'POS': 'ADJ', 'polarity': -0.5}
infettivo: {'POS': 'ADJ', 'polarity': -0.5}
contagioso: {'POS': 'ADJ', 'polarity': -0.5}
virulento: {'POS': 'ADJ', 'polarity': -0.5}
pandemia: {'POS': 'NOUN', 'polarity': -0.675}
epidemia: {'POS': 'NOUN', 'polarity': -0.675}
lockdown: {'POS': 'NOUN', 'polarity': -1.0}
coprifuoco: {'POS': 'NOUN', 'polarity': -1.0}
crisi: {'POS': 'NOUN', 'polarity': -0.75}
quarantena: {'POS': 'NOUN', 'polarity': -1.0}
contumacia: {'POS': 'NOUN', 'polarity': -1.0}
ricovero: {'POS': 'NOUN', 'polarity': -0.5}
malattia: {'POS': 'NOUN', 'polarity': -0.27}
infermità: {'POS': 'NOUN', 'polarity': -0.27}
morbo: {'POS'

In [26]:
# Remove unnecessary variables to save RAM
del senti_lex_df

In [27]:
# Set up a NaiveSentimentClassifierIT object
NSC_it = NaiveSentimentClassifierIT(senti_lex_dict, negations, booster_dic)
# Evaluate the sentiment of the Italian articles
Naive_tx_polarity = NSC_it.evaluate(it_tx, it_idx.it_idx.values.tolist(), name_output_file = 'it_naive_polarity_2')

Processing time to evaluate the article sentiments: 0.07138670682907104 minutes


In [28]:
# Take a look at the results
Naive_tx_polarity

Unnamed: 0,Naive_polarity
313578,0.223737
460527,0.066695
460528,0.027784
460529,0.375276
460530,0.077931
...,...
2425111,0.325523
2425112,0.222156
2425113,0.072668
2425114,-0.018780


In [29]:
# Take a look at some summary statistics
share_pos = np.round(np.sum(Naive_tx_polarity['Naive_polarity'] > 0) / len(Naive_tx_polarity),2)
share_neg = np.round(np.sum(Naive_tx_polarity['Naive_polarity'] < 0) / len(Naive_tx_polarity),2)
print('The share of articles with a positive sentiment is', np.round(100*share_pos,2),'%')
print('The share of articles with a negative sentiment is', np.round(100*share_neg,2),'%')
np.round(Naive_tx_polarity.describe(), 3)

The share of articles with a positive sentiment is 86.0 %
The share of articles with a negative sentiment is 14.0 %


Unnamed: 0,Naive_polarity
count,23621.0
mean,0.196
std,0.201
min,-0.867
25%,0.076
50%,0.197
75%,0.315
max,1.0


In [30]:
# Read the results back in
Naive_tx_polarity = pd.read_csv("Sentiment/Naive/it_naive_polarity_2.csv", index_col = 0, dtype = {'Naive_polarity': float})

In [31]:
# Take a look at the read in results
Naive_tx_polarity

Unnamed: 0,Naive_polarity
313578,0.223737
460527,0.066695
460528,0.027784
460529,0.375276
460530,0.077931
...,...
2425111,0.325523
2425112,0.222156
2425113,0.072668
2425114,-0.018780
