# **Text Mining on Dutch party programmes**

## **Imports**

In [2]:
import os
import matplotlib.pyplot as plt

## **Define paths**

In [1]:
# Input and intermediate data locations (relative paths).
pdf_folder = "data/vkps/total_vkps"  # scanned for *.pdf files by the conversion cell
text_folder = "data/vkps_texts"  # receives one .txt per converted PDF
folia_folder = "data/folias"  # FoLiA output folder, read again by the classification cell

# Output locations.
plot_directory = "outputs/plots/topics_without_non_thematic"  # one PNG per party
csv_directory = "outputs/csvs"  # sentence table and metrics table

## **Convert PDFs to text files**

In [None]:
from parsing.pdf_to_txt import pdf_to_text

PDF_SUFFIX = '.pdf'

# Create the destination up front in case pdf_to_text does not create it itself.
os.makedirs(text_folder, exist_ok=True)

# Sort the listing so the conversion order (and the printed log) is reproducible.
files = sorted(os.listdir(pdf_folder))
for file_name in files:
    if not file_name.endswith(PDF_SUFFIX):
        continue
    pdf_path = os.path.join(pdf_folder, file_name)
    # Swap only the trailing extension: str.replace would also rewrite a
    # '.pdf' that happens to appear earlier in the file name.
    text_name = file_name[:-len(PDF_SUFFIX)] + '.txt'
    text_path = os.path.join(text_folder, text_name)
    pdf_to_text(pdf_path, text_path)
    print(f"Converted {file_name} to text.")


## **Convert text files to FoLiA**

In [4]:
from parsing.txt_to_folia import txt_to_folia
# Hand the text folder and the FoLiA folder to the project helper.
# NOTE(review): presumably this reads every text file in text_folder and writes
# one FoLiA document per file into folia_folder — confirm in parsing/txt_to_folia.py.
txt_to_folia(text_folder, folia_folder)

looping through text files: 100%|██████████| 3/3 [00:09<00:00,  3.27s/it]


## **Classify sentences**

In [3]:
import os
from classifying.folia_to_classification import process_and_concat_files, calculate_metrics, clean_topics

# DataFrame.to_csv raises OSError when the target directory is missing.
# Create it before the long-running classification so the run cannot end
# in a failed write.
os.makedirs(csv_directory, exist_ok=True)

# Build the sentence-level table from the FoLiA folder, then pass it through clean_topics.
df = process_and_concat_files(folia_folder)
df = clean_topics(df)
path_df = os.path.join(csv_directory, "parties_sentences_topics.csv")
df.to_csv(path_df)

# Derive the metrics table; the plotting cell reads its 'party',
# 'primary_topic' and 'relative_difference' columns.
metrics_df = calculate_metrics(df)
path_metrics_df = os.path.join(csv_directory, "metrics.csv")
metrics_df.to_csv(path_metrics_df)

2023-11-06 11:07:56.541459: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 11:07:56.934643: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-06 11:07:56.936390: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
100%|██████████| 2752/2752 [02:59<00:00, 15.31it/s]
100%|██████████| 665/665 [00:43<00:00, 15.41it/s]
100%|██████████| 4842/4842 [04:59<00:00, 16.16it/s]
100%|██████████| 579/579 [00:38<00:00, 15.09it/s]
100%|██████████| 2670/2670 [02:53<00:00, 15.35it/s]
100%|██████████| 2987/2987 [03:04<00:00, 16.17it/s]
100%|██████████| 458/458 [00:34<00:00, 13.22it/s]
100%|██████████| 3089/3089 [03:20<00:00, 15.38it/s]
100%|█

## **Plot party attention profiles**

In [None]:
plt.style.use('ggplot')

# savefig does not create missing directories, so make sure the target exists.
os.makedirs(plot_directory, exist_ok=True)

parties = metrics_df['party'].unique()

# One horizontal bar chart per party, saved as <party>.png and shown inline.
for party in parties:
    party_data = metrics_df[metrics_df['party'] == party]
    party_data_sorted = party_data.sort_values('relative_difference', ascending=True)

    fig, ax = plt.subplots(figsize=(10, 8))
    # Pink for values below zero, light blue otherwise.
    bar_colors = ['#ffb6c1' if x < 0 else '#add8e6' for x in party_data_sorted['relative_difference']]
    bars = ax.barh(party_data_sorted['primary_topic'], party_data_sorted['relative_difference'], color=bar_colors)
    ax.axvline(0, color='black', linewidth=0.8)

    ax.set_xlabel('Relative Difference (%)')
    ax.set_title(f'Relative Difference in Topic Attention for {party}')
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    ax.spines['left'].set_visible(False)
    ax.grid(True, axis='x', linestyle='--', alpha=0.7)
    # Author watermark in the lower-right corner of the axes.
    ax.text(0.95, 0.01, 'Arjan van Dalfsen',
        verticalalignment='bottom', horizontalalignment='right',
        transform=ax.transAxes, color='grey', fontsize=8, alpha=0.5)

    for bar in bars:
        width = bar.get_width()
        points_right = width > 0
        # Anchor the label at the bar tip and nudge it outward by a few points.
        # An offset in points (rather than "width - 5" in data units) keeps the
        # label next to the bar whatever the x-axis scale is.
        ax.annotate(
            f'{width:.0f}%',
            xy=(width, bar.get_y() + bar.get_height() / 2),
            xytext=(3 if points_right else -3, 0),
            textcoords='offset points',
            ha='left' if points_right else 'right',
            va='center',
        )

    figure_path = os.path.join(plot_directory, f'{party}.png')
    fig.tight_layout()
    fig.savefig(figure_path, bbox_inches='tight')
    plt.show()
    # Release the figure; otherwise every party's figure stays alive in pyplot's registry.
    plt.close(fig)

## **Inspect sentences for a given party and topic**

In [14]:
def export_sentences(df, party_name, topic_name, n_or_frac):
    """Return the first sentences of one party on one topic as a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'party', 'primary_topic' and 'sentence' columns.
    party_name : str
        Value to match in the 'party' column.
    topic_name : str
        Value to match in the 'primary_topic' column.
    n_or_frac : int or float
        How many sentences to return. A value in [0, 1] is read as a fraction
        of the matching sentences (rounded down); note that 1 therefore means
        "all of them", not "one sentence". A value above 1 is read as an
        absolute count.

    Returns
    -------
    str
        The selected sentences in their original order, joined by newlines.
        Empty when nothing matches or the selection rounds down to zero.

    Raises
    ------
    ValueError
        If ``n_or_frac`` is negative.
    """
    # A negative value would reach Series.head(n) as a negative n, which means
    # "everything except the last |n| rows" — never what the caller asked for.
    if n_or_frac < 0:
        raise ValueError(f"n_or_frac must be non-negative, got {n_or_frac!r}")

    matches = df[(df['party'] == party_name) & (df['primary_topic'] == topic_name)]

    if n_or_frac <= 1:
        n = int(len(matches) * n_or_frac)
    else:
        n = int(n_or_frac)

    selected_sentences = matches['sentence'].head(n)

    # str.join only accepts strings: skip missing values and coerce the rest.
    return '\n'.join(selected_sentences.dropna().astype(str))

In [None]:
# Example: n_or_frac=1 falls in the "fraction" branch of export_sentences,
# so every NSC sentence classified as 'Law and Crime' is returned.
output_text = export_sentences(df, party_name='NSC', topic_name='Law and Crime', n_or_frac=1)
