<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h1 style='margin:10px 5px'> 
Master Thesis Yannik Haller - Assigning Labels to General LDA Models
</h1>
</div>

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
Preparation: load packages and set the appropriate working directory
</h2>
</div>

In [1]:
# Import required baseline packages
import re
import os
import glob
import time
import sys
import pandas as pd
import numpy as np
import datetime

# Show up to 200 characters per cell so that long strings are not cut off when printed
pd.set_option('display.max_colwidth', 200)

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults (font sizes and default figure size)
import matplotlib.pylab as pylab
params = {
    'legend.fontsize': 10,
    'figure.figsize': (8, 6),
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
}
# Push the defaults into matplotlib's global rcParams
pylab.rcParams.update(params)

# Regression and smoothing tools
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold

# Disable warnings
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
warnings.filterwarnings("ignore", category = FutureWarning)

In [2]:
# Set the appropriate working directory.
# All file paths used further down are relative to this directory. The location can be
# overridden with the MA_DATA_DIR environment variable, so the notebook also runs on
# machines where the data is stored elsewhere; if the variable is unset or empty, the
# original location is used unchanged.
os.chdir(os.environ.get('MA_DATA_DIR') or 'D:\\Dropbox\\MA_data')

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
1. Read in and aggregate/merge the CSV files containing the topic assignments of the General LDA models
</h2>
</div>

In [3]:
# Load the topic assignments and topic affiliation probabilities produced by the German LDA model.
# The first CSV column is used as the index; Topic_ID_2 is parsed as float rather than int
# (presumably because it can contain missing values - the merged frame reports NaNs in this column).
de_topics = pd.read_csv(
    "LDA/Topic_Assignment/de_topic_assignment.csv",
    index_col = 0,
    dtype = {
        'Topic_ID_1': int,
        'Affiliation_Prob_1': float,
        'Topic_ID_2': float,
        'Affiliation_Prob_2': float,
    },
)

In [4]:
# Create a column to indicate the main topic for each topic ID (for both, the most and second most likely topic)
# Main topic -> topic IDs of the German LDA model. The table is defined once and applied to
# both ID columns, so the labels of Topic_1 and Topic_2 cannot drift apart.
de_main_topics = {
    'transportation':         [0],
    'international_news':     [1],
    'inconsequential':        [2,7,12,16,19],
    'sports':                 [3,15,20,26],
    'culture':                [4,5],
    'economy_national':       [6,17,18,21],
    'public_affairs':         [8,11],
    'tourism':                [9],
    'politics_national':      [10,27],
    'law_order':              [13,29],
    'economy_international':  [14],
    'public_health':          [22],
    'tragedies_crimes':       [23],
    'politics_international': [24,25,28],
    'COVID':                  [30],
}
## Assign main topics according to the topic IDs
# Topic_ID_1 holds the most likely topic, Topic_ID_2 the second most likely one
for id_column, topic_column in (('Topic_ID_1', 'Topic_1'), ('Topic_ID_2', 'Topic_2')):
    # Start from an all-None object column; rows whose ID is missing or not listed stay None
    de_topics[topic_column] = np.repeat(None, de_topics.shape[0])
    for main_topic, topic_ids in de_main_topics.items():
        de_topics.loc[de_topics[id_column].isin(topic_ids), topic_column] = main_topic

In [5]:
# Load the topic assignments and topic affiliation probabilities produced by the French LDA model.
# The first CSV column is used as the index; Topic_ID_2 is parsed as float rather than int
# (presumably because it can contain missing values - the merged frame reports NaNs in this column).
fr_topics = pd.read_csv(
    "LDA/Topic_Assignment/fr_topic_assignment.csv",
    index_col = 0,
    dtype = {
        'Topic_ID_1': int,
        'Affiliation_Prob_1': float,
        'Topic_ID_2': float,
        'Affiliation_Prob_2': float,
    },
)

In [6]:
# Create a column to indicate the main topic for each topic ID (for both, the most and second most likely topic)
# Main topic -> topic IDs of the French LDA model. The table is defined once and applied to
# both ID columns, so the labels of Topic_1 and Topic_2 cannot drift apart.
fr_main_topics = {
    'transportation':         [0],
    'politics_international': [1,4,10],
    'politics_national':      [2,24],
    'inconsequential':        [3,6,21,23,25],
    'law_order':              [5],
    'economy_international':  [7],
    'culture':                [8,19],
    'economy_national':       [9,11],
    'tourism':                [12],
    'public_affairs':         [13,14,22],
    'public_health':          [15],
    'COVID':                  [16],
    'sports':                 [17,26],
    'international_news':     [18],
    'tragedies_crimes':       [20],
}
## Assign main topics according to the topic IDs
# Topic_ID_1 holds the most likely topic, Topic_ID_2 the second most likely one
for id_column, topic_column in (('Topic_ID_1', 'Topic_1'), ('Topic_ID_2', 'Topic_2')):
    # Start from an all-None object column; rows whose ID is missing or not listed stay None
    fr_topics[topic_column] = np.repeat(None, fr_topics.shape[0])
    for main_topic, topic_ids in fr_main_topics.items():
        fr_topics.loc[fr_topics[id_column].isin(topic_ids), topic_column] = main_topic

In [7]:
# Load the topic assignments and topic affiliation probabilities produced by the Italian LDA model.
# The first CSV column is used as the index; Topic_ID_2 is parsed as float rather than int
# (presumably because it can contain missing values - the merged frame reports NaNs in this column).
it_topics = pd.read_csv(
    "LDA/Topic_Assignment/it_topic_assignment.csv",
    index_col = 0,
    dtype = {
        'Topic_ID_1': int,
        'Affiliation_Prob_1': float,
        'Topic_ID_2': float,
        'Affiliation_Prob_2': float,
    },
)

In [8]:
# Create a column to indicate the main topic for each topic ID (for both, the most and second most likely topic)
# Main topic -> topic IDs of the Italian LDA model. The table is defined once and applied to
# both ID columns, so the labels of Topic_1 and Topic_2 cannot drift apart.
# Note: for Italian articles no topic IDs are assigned to the main topics economy_national,
# economy_international, public_health or international_news.
it_main_topics = {
    'politics_international': [0],
    'sports':                 [1,18],
    'culture':                [2,13],
    'transportation':         [3],
    'inconsequential':        [4,10,11,12,14,16],
    'COVID':                  [5,7],
    'public_affairs':         [6,19],
    'politics_national':      [8],
    'tragedies_crimes':       [9],
    'tourism':                [15],
    'law_order':              [17],
}
## Assign main topics according to the topic IDs
# Topic_ID_1 holds the most likely topic, Topic_ID_2 the second most likely one
for id_column, topic_column in (('Topic_ID_1', 'Topic_1'), ('Topic_ID_2', 'Topic_2')):
    # Start from an all-None object column; rows whose ID is missing or not listed stay None
    it_topics[topic_column] = np.repeat(None, it_topics.shape[0])
    for main_topic, topic_ids in it_main_topics.items():
        it_topics.loc[it_topics[id_column].isin(topic_ids), topic_column] = main_topic

In [9]:
# Stack the German, French and Italian topic assignments into a single dataframe
# and order its rows by the index
topics = pd.concat([de_topics, fr_topics, it_topics]).sort_index()
# Display the combined dataframe
topics

Unnamed: 0,Topic_ID_1,Affiliation_Prob_1,Topic_ID_2,Affiliation_Prob_2,Topic_1,Topic_2
0,1,0.329300,7.0,0.292654,politics_international,economy_international
1,7,0.462487,11.0,0.269677,economy_international,economy_national
2,7,0.325522,11.0,0.232608,economy_international,economy_national
3,7,0.473720,11.0,0.470850,economy_international,economy_national
4,7,0.598861,11.0,0.281546,economy_international,economy_national
...,...,...,...,...,...,...
2441178,17,0.466716,2.0,0.251594,economy_national,inconsequential
2441179,12,0.315961,21.0,0.274781,inconsequential,economy_national
2441180,26,0.377586,29.0,0.296713,sports,law_order
2441181,23,0.433544,13.0,0.272896,tragedies_crimes,law_order


In [10]:
# Count the missing values in each column of the merged dataframe
topics.isna().sum()

Topic_ID_1                0
Affiliation_Prob_1        0
Topic_ID_2            26824
Affiliation_Prob_2    26824
Topic_1                   0
Topic_2               26824
dtype: int64

In [11]:
# Inspect the datatype of each column of the merged dataframe
topics.dtypes

Topic_ID_1              int32
Affiliation_Prob_1    float64
Topic_ID_2            float64
Affiliation_Prob_2    float64
Topic_1                object
Topic_2                object
dtype: object

In [12]:
# Drop the per-language dataframes to save RAM; their rows have been concatenated into `topics`
del de_topics, fr_topics, it_topics

<div class="alert alert-info" style="background-color:#5d3a8e; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:10px 5px'> 
2. Export and re-read the resulting dataframe
</h2>
</div>

In [13]:
# Write the merged topic assignments (including the index) to a csv file
topics.to_csv(
    "LDA/Topic_Assignment/topic_assignment_general.csv",
    index = True,
    encoding = 'utf-8-sig',
)

In [14]:
# Read the exported file back in, using its first column as the index.
# Topic_ID_2 contains missing values, so it cannot be read in as a plain integer;
# it is cast to the nullable integer type "Int32" right after loading.
topics = (
    pd.read_csv("LDA/Topic_Assignment/topic_assignment_general.csv", index_col = 0)
    .astype({'Topic_ID_2': 'Int32'})
)
# Display the dataframe
topics

Unnamed: 0,Topic_ID_1,Affiliation_Prob_1,Topic_ID_2,Affiliation_Prob_2,Topic_1,Topic_2
0,1,0.329300,7,0.292654,politics_international,economy_international
1,7,0.462487,11,0.269677,economy_international,economy_national
2,7,0.325522,11,0.232608,economy_international,economy_national
3,7,0.473720,11,0.470850,economy_international,economy_national
4,7,0.598861,11,0.281546,economy_international,economy_national
...,...,...,...,...,...,...
2441178,17,0.466716,2,0.251594,economy_national,inconsequential
2441179,12,0.315961,21,0.274781,inconsequential,economy_national
2441180,26,0.377586,29,0.296713,sports,law_order
2441181,23,0.433544,13,0.272896,tragedies_crimes,law_order


In [15]:
# Inspect the datatypes of the columns after the re-read
# (Topic_ID_2 should now be reported as the nullable integer type Int32)
topics.dtypes

Topic_ID_1              int64
Affiliation_Prob_1    float64
Topic_ID_2              Int32
Affiliation_Prob_2    float64
Topic_1                object
Topic_2                object
dtype: object