<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:10px 5px'> 
Master Thesis Yannik Haller - Sentiment Analysis TEXTBLOB
</h1>
</div>

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
1. Load required packages and the data
</h2>
</div>

In [1]:
# Import required baseline packages
import re
import os
import glob
import time
import sys
import pandas as pd
import numpy as np
from pprint import pprint

# Change pandas' setting to print out long strings
pd.options.display.max_colwidth = 200

# Plotting tools
import matplotlib.pyplot as plt
%matplotlib inline

# TextBlob (for Sentiment Analysis)
from textblob import Blobber
from textblob_de import PatternTagger, PatternAnalyzer

import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

  def _figure_formats_changed(self, name, old, new):


In [2]:
# Set the working directory to the local data folder; the relative paths used in later cells
# (e.g. "Sentiment/TextBlob/...") are resolved against it. NOTE: this is a machine-specific absolute path.
os.chdir('D:\\Dropbox\\MA_data')

In [3]:
# Define a function to read in the fully preprocessed data (note: we are using the preprocessed data in which negations are preserved --> PPII)
def read_preprocessed(language, tokenize = True, data_dir = 'D:\\Dropbox\\MA_data'):
    """Load the preprocessed articles of one language together with their index.

    Parameters
    ----------
    language : str
        One of 'de', 'en', 'fr', 'it'.
    tokenize : bool
        If True, every article is split on whitespace via retokenize();
        if False, every article stays exactly as read from the csv file.
    data_dir : str
        Directory that contains the "Preprocessed/Sentiment_Analysis/" folder.
        The function changes the working directory to it (as it always did);
        the default is the path that was previously hard-coded.

    Returns
    -------
    (list, pandas.DataFrame)
        The articles (column 'tx' of the csv file) and a dataframe with one
        column '<language>_idx' holding the csv index values. The row labels
        of that dataframe are the positions of the articles in the list.

    Raises
    ------
    ValueError
        If language is not one of the allowed languages.
    """
    # Raise an error if an inadmissible language is chosen (before any file access happens)
    allowed_languages = ['de', 'en', 'fr', 'it']
    if language not in allowed_languages:
        raise ValueError("Invalid language. Expected one of: %s" % allowed_languages)

    # Set the appropriate working directory (the relative path below is resolved against it)
    os.chdir(data_dir)

    # Define the name of the file to load
    filename = "Preprocessed/Sentiment_Analysis/"+language+"_preprocessed_senti.csv"

    # Read in the dataframe containing the text data (first csv column becomes the index)
    tx_pp = pd.read_csv(filename, index_col = 0, dtype = {'tx': object})

    # Store the articles' index as a column; the default 0..n-1 row labels identify the position in the article list.
    # The column is built from the raw index values: passing a *named* Index together with a different
    # `columns` label to pd.DataFrame would select a non-existing column and yield only NaN.
    idx = pd.DataFrame({language+'_idx': tx_pp.index.to_numpy()})

    # Reduce the dataframe to a list containing the text data
    tx_pp = tx_pp.tx.to_list()

    # Tokenize the data again if tokenize = True (RAM-saving)
    if tokenize:
        tx_pp = retokenize(tx_pp)

    # Return the preprocessed data
    return tx_pp, idx

# Define a function to retokenize the preprocessed text data (RAM-saving)
def retokenize(article_list):
    """Split every article into whitespace-separated tokens, in place.

    Each entry is converted with str() and split on whitespace; the result
    overwrites the entry at the same position. The very same list object is
    returned, so no second copy of the corpus is created.
    """
    for position, article in enumerate(article_list):
        article_list[position] = str(article).split()
    return article_list

In [4]:
# Read in the preprocessed German articles (tokenize = False: retokenize() is skipped, the articles are kept as read)
de_tx, de_idx = read_preprocessed('de', tokenize = False)

# Size in bytes of the list object itself; sys.getsizeof does NOT include the article strings the list refers to
sys.getsizeof(de_tx)

15474568

In [5]:
# Show the first preprocessed article (one plain string, because the data was loaded with tokenize = False)
de_tx[0]

'rückkehrer stefan meier überragen flames herisau bangen allerdings schluss lukas pfiffnerin vergangen saison tun uhc herisau darin immer wieder rückstand aufholen partie kehren noch jung liga meisterschaft leben team mindestens heimspiel neu trend deutlich führung noch zittern samstag lagen überzeugend ausserrhoder vorne reagieren ausgleich flames tor dreier minute besassen stefan meier saison wasa verteidigen sommer nla stammverein zurückkehren herausragenden stürmer minute sirene hiess fünft mal meier stock spiel sicherheit ball gehen allerdings auch führung verlieren komplette zusammenbruch drohen flames können aber gewicht nicht total verschieben eindrücklich effort niklas hess tragen gastgeber sieg trainer sagen grosses kino schon woche zuvor meier einzig herisauer niederlage pfannenstiel egg treffer erzielen liegen tor assists nun platz skorer gruppe meier bewegen messen cm grösse kg gewicht erstaunlich geschmeidig können ball behaupten weisen wuchtig schuss so laufen aktuell vo

In [6]:
# Show the last three rows of the index dataframe (row label = position in de_tx, column 'de_idx' = article id)
de_idx.tail(3)

Unnamed: 0,de_idx
1934310,2441180
1934311,2441181
1934312,2441182


In [7]:
# Retrieve the positions of articles in de_tx from their article ids:
# keep the rows of de_idx whose 'de_idx' value is in article_ids and take their row labels
article_ids = [2441180, 2441181]
location = de_idx[de_idx.de_idx.isin(article_ids)].index.tolist() # e.g. 1934310 for article id 2441180

# Access the preprocessed text from the articles with the article ids in [2441180, 2441181]
#list(de_tx[i] for i in location)

# Show the positions found for the article ids in [2441180, 2441181]
location

[1934310, 1934311]

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
2. Sentiment Assessment of the Articles
</h2>
</div>

In [8]:
# Define a function that evaluates the polarity of the articles and stores the result to a correctly indexed csv file
def eval_blob_polarity(tx, idx, outputfile_name = 'de_blob_polarity', batchsize = 100000, first_pos = 0):
    """Score every article in tx with TextBlob and save the polarities to a csv file.

    Parameters
    ----------
    tx : list of str
        The precleaned and NOT tokenized articles.
    idx : list
        The correctly ordered index labels, one per article in tx.
    outputfile_name : str
        Name (without extension) of the csv file written to "Sentiment/TextBlob/".
    batchsize : int
        A progress line is printed whenever the running position is a multiple of batchsize.
    first_pos : int
        Position of tx[0] within the full corpus. It only shifts the positions
        shown in the progress messages; it does not select or skip articles.

    Returns
    -------
    pandas.DataFrame
        One column 'Blob_polarity' (first element of TextBlob's sentiment), indexed by idx.

    Raises
    ------
    ValueError
        If tx and idx do not have the same length.
    """
    # Fail fast: a length mismatch would otherwise only surface when the dataframe is built after the whole scoring run
    n_articles = len(tx)
    if len(idx) != n_articles:
        raise ValueError("tx and idx must have the same length (got %d articles and %d index entries)" % (n_articles, len(idx)))

    # Make sure the output folder exists before the long-running loop starts
    output_dir = "Sentiment/TextBlob/"
    os.makedirs(output_dir, exist_ok = True)

    # Initialize a Blobber class, which uses the language specific PatternAnalyzer we imported above to assess text polarity (sentiment from -1 to 1) and subjectivity
    tb = Blobber(pos_tagger = PatternTagger(), analyzer = PatternAnalyzer())

    # Positions are counted from first_pos, so the run ends at first_pos + n_articles
    # (the former check against n_articles - first_pos never matched for first_pos > 0)
    end_pos = first_pos + n_articles

    # Set up a loop to go through all articles and evaluate their polarity with TextBlob
    i_last_batch = first_pos
    pol = []
    t = time.time()
    for i, article in enumerate(tx, start = first_pos + 1):
        pol.append(tb(article).sentiment[0])
        # Report after every full batch and once more after the last (possibly partial) batch
        if i % batchsize == 0 or i == end_pos:
            print("Processing time to evaluate polarity scores of the articles at positions", i_last_batch, "to", i-1, ":", str(round((time.time() - t)/60,2)), "minutes")
            i_last_batch = i
            t = time.time()
    print("DONE! ;)")

    # Create a correctly indexed dataframe
    Blob_tx_polarity = pd.DataFrame(pol, index = idx, columns = ['Blob_polarity'])
    # Save the results to a csv file
    Blob_tx_polarity.to_csv(output_dir+outputfile_name+".csv", index = True)
    # Return the results
    return Blob_tx_polarity

In [9]:
# Apply the previously defined function on the first 1,000,000 German articles (positions 0 to 999999) and save them as batch 1
Blob_tx_polarity_1 = eval_blob_polarity(de_tx[:1000000], de_idx.de_idx.values.tolist()[:1000000], 'de_blob_polarity_batch1', 100000, 0)

Processing time to evaluate polarity scores of the articles at positions 0 to 99999: 72.63 minutes
Processing time to evaluate polarity scores of the articles at positions 100000 to 199999: 71.35 minutes
Processing time to evaluate polarity scores of the articles at positions 200000 to 299999: 73.22 minutes
Processing time to evaluate polarity scores of the articles at positions 300000 to 399999: 75.01 minutes
Processing time to evaluate polarity scores of the articles at positions 400000 to 499999: 67.63 minutes
Processing time to evaluate polarity scores of the articles at positions 500000 to 599999: 84.44 minutes
Processing time to evaluate polarity scores of the articles at positions 600000 to 699999: 97.67 minutes
Processing time to evaluate polarity scores of the articles at positions 700000 to 799999: 111.3 minutes
Processing time to evaluate polarity scores of the articles at positions 800000 to 899999: 97.98 minutes
Processing time to evaluate polarity scores of the articles a

In [10]:
# Apply the previously defined function on the remaining German articles (positions 1000000 onwards) and save them as batch 2;
# first_pos = 1000000 makes the progress messages report positions relative to the full list de_tx
Blob_tx_polarity_2 = eval_blob_polarity(de_tx[1000000:], de_idx.de_idx.values.tolist()[1000000:], 'de_blob_polarity_batch2', 100000, 1000000)

Processing time to evaluate polarity scores of the articles at positions 1000000 to 1099999 : 73.73 minutes
Processing time to evaluate polarity scores of the articles at positions 1100000 to 1199999 : 77.15 minutes
Processing time to evaluate polarity scores of the articles at positions 1200000 to 1299999 : 71.26 minutes
Processing time to evaluate polarity scores of the articles at positions 1300000 to 1399999 : 75.04 minutes
Processing time to evaluate polarity scores of the articles at positions 1400000 to 1499999 : 68.16 minutes
Processing time to evaluate polarity scores of the articles at positions 1500000 to 1599999 : 64.96 minutes
Processing time to evaluate polarity scores of the articles at positions 1600000 to 1699999 : 60.07 minutes
Processing time to evaluate polarity scores of the articles at positions 1700000 to 1799999 : 85.88 minutes
Processing time to evaluate polarity scores of the articles at positions 1800000 to 1899999 : 84.49 minutes
Processing time to evaluate 

In [11]:
# Read both batch files back in (first csv column = index), stack them in the listed order (batch 1, then batch 2)
# and save the combined polarity scores as a single csv file
filenames = ["Sentiment/TextBlob/de_blob_polarity_batch1.csv", "Sentiment/TextBlob/de_blob_polarity_batch2.csv"]
Blob_tx_polarity = pd.concat([pd.read_csv(f, index_col = 0, dtype = {'Blob_polarity': float}) for f in filenames])
Blob_tx_polarity.to_csv("Sentiment/TextBlob/de_blob_polarity.csv", index = True)

In [12]:
# Display the combined polarity scores (index = article id, one 'Blob_polarity' value per article)
Blob_tx_polarity

Unnamed: 0,Blob_polarity
16553,0.310417
16554,0.420000
16555,0.850000
16556,1.000000
16557,0.488889
...,...
2441178,-0.100000
2441179,0.121429
2441180,0.350000
2441181,-0.433333


In [13]:
# Read the concatenated results back in from disk (first csv column is used as the index, scores are parsed as float)
Blob_tx_polarity = pd.read_csv("Sentiment/TextBlob/de_blob_polarity.csv", index_col = 0, dtype = {'Blob_polarity': float})

In [14]:
# Display the reloaded polarity scores to check that they match the frame that was saved
Blob_tx_polarity

Unnamed: 0,Blob_polarity
16553,0.310417
16554,0.420000
16555,0.850000
16556,1.000000
16557,0.488889
...,...
2441178,-0.100000
2441179,0.121429
2441180,0.350000
2441181,-0.433333


In [15]:
# Take a look at some summary statistics
# share_pos / share_neg are the fractions of articles with a polarity above / below zero, rounded to 2 decimals
share_pos = np.round(np.sum(Blob_tx_polarity['Blob_polarity'] > 0) / len(Blob_tx_polarity),2)
share_neg = np.round(np.sum(Blob_tx_polarity['Blob_polarity'] < 0) / len(Blob_tx_polarity),2)
# Round again after scaling to percent: 100*0.29 would otherwise be printed as 28.999999999999996
print('The share of articles with a positive sentiment is', np.round(100*share_pos),'%')
print('The share of articles with a negative sentiment is', np.round(100*share_neg),'%')
# Descriptive statistics of the polarity scores, rounded to 3 decimals
np.round(Blob_tx_polarity.describe(), 3)

The share of articles with a positive sentiment is 69.0 %
The share of articles with a negative sentiment is 27.0 %


Unnamed: 0,Blob_polarity
count,1934313.0
mean,0.177
std,0.401
min,-1.0
25%,-0.028
50%,0.198
75%,0.434
max,1.0
