# Setup

### Load Packages

In [1]:
# Import Packages and Model Commands
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
import pandas as pd
import nltk
from tqdm import tqdm
import datetime
import sys
from nltk.corpus import stopwords
from locale import setlocale, LC_TIME # Swedish time/date system
import numpy as np

setlocale(LC_TIME, 'sv_SE') # Set the locale to Swedish

# Swedish stopwords
nltk.download('stopwords')
stop_words_swedish = set(stopwords.words('swedish'))

# Download the NLTK sentence tokenizer data
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Sentiment Models

In [2]:
# Load pre-trained model and tokenizer from Recorded Future
tokenizer = BertTokenizerFast.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_fear= BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_violence = BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Violence")

### Load Data

In [3]:
# Load Unclassified Data
unclassified = pd.read_csv("unclassified_chunk_ae.csv")

# Modify Unclassified Data

In [4]:
# Rename Unnamed ID Column
unclassified.rename(columns={'Unnamed: 0': 'Index'}, inplace=True)

### Set Up Dates

In [5]:
# Reformat Date
unclassified['Date'] = pd.to_datetime(unclassified['Date'], format='%d %B %Y')

In [6]:
# Create New 'Year' Column
unclassified['Year'] = unclassified['Date'].dt.year.fillna(0).astype(int)

# Create New 'Month' Column
unclassified['Month'] = unclassified['Date'].dt.month.fillna(0).astype(int)

In [7]:
# Isolate data from 2010 on
unclassified_decade = unclassified[unclassified['Year'] >= 2010]

### Subset Original Data by Time-Relevant Data

In [8]:
# Create dataframes with the time-relevant data and old date format
unprocessed_data = unclassified[unclassified['Index'].isin(unclassified_decade['Index'])]

In [9]:
# Remove NO CONTENT Columns
unprocessed_data = unprocessed_data[unprocessed_data['Text'] != 'NO CONTENT']

In [10]:
# Drop Month and Year Columns
unprocessed_data.drop(columns=['Month', 'Year'], inplace=True)

In [11]:
# Check
unprocessed_data.head(1)

Unnamed: 0,Index,Date,Document Type,Source,PDF Indicator,Text,Content Page,PDF Link
177,22785,2010-01-26,"Lagrådsremiss,Rättsliga dokument",Justitiedepartementet,1,Lagrådsremiss 1 En ny fängelse- och hä...,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/3f4a77...


In [12]:
len(unprocessed_data)

1602

### Split unprocessed_data into 6 Chunks

In [13]:
# Determine the number of parts
num_parts = 6

# Calculate the number of rows in each part
rows_per_part = len(unprocessed_data) // num_parts

# Use numpy array_split to split the DataFrame
split_dataframes = np.array_split(unprocessed_data, num_parts)

# Access each part using index
part1, part2, part3, part4, part5, part6 = split_dataframes

# Sentence Sentiment Analysis


### Cycle Through Data Parts

In [14]:
# Create Backup for unprocessed_data
unprocessed_data_backup = unprocessed_data.copy()

In [74]:
# Each Part is Processed Separately
#unprocessed_data = part1 # Complete
# unprocessed_data = part2 # Complete
# unprocessed_data = part3 # Complete
unprocessed_data = part4
# unprocessed_data = part5
# unprocessed_data = part6

In [75]:
# Function to handle errors during processing
def process_text_with_fail_safes(text, text_id):
    try:
        sentences = nltk.sent_tokenize(text)

        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)

            # Calculate probabilities for classifier_fear
            outputs_fear = classifier_fear(**inputs)
            probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]

            # Calculate probabilities for classifier_violence
            outputs_violence = classifier_violence(**inputs)
            probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]

            texts_list.append(sentence)
            ids_list.append(text_id)

            probabilities_fear_list.append(probabilities_fear)
            probabilities_violence_list.append(probabilities_violence)

    except Exception as e:
        print(f"Error processing text with ID {text_id}: {e}")
        # If there's an error, replace probabilities with 0
        probabilities_fear_list.append([0, 0, 0])
        probabilities_violence_list.append([0, 0, 0])
        texts_list.append("Error processing text")
        ids_list.append(text_id)

In [76]:
# Lists to store probabilities, texts, and IDs for both classifiers
probabilities_fear_list = []
probabilities_violence_list = []
texts_list = []
ids_list = []

In [77]:
for text, text_id in tqdm(zip(unprocessed_data['Text'].tolist(), unprocessed_data['Index'].tolist()),
                          total=len(unprocessed_data), desc="Processing Texts"):
    process_text_with_fail_safes(text, text_id)

Processing Texts:   0%|          | 0/267 [00:00<?, ?it/s]

Processing Texts:  64%|██████▍   | 172/267 [9:01:46<4:21:40, 165.27s/it]  

Error processing text with ID 24375: expected string or bytes-like object


Processing Texts:  67%|██████▋   | 180/267 [9:24:28<5:40:12, 234.63s/it]

Error processing text with ID 24383: expected string or bytes-like object


Processing Texts:  69%|██████▊   | 183/267 [9:24:39<2:18:54, 99.22s/it] 

Error processing text with ID 24386: expected string or bytes-like object


Processing Texts:  91%|█████████ | 243/267 [20:03:20<13:36:39, 2041.64s/it]

Error processing text with ID 24848: expected string or bytes-like object
Error processing text with ID 24849: expected string or bytes-like object


Processing Texts: 100%|██████████| 267/267 [21:56:07<00:00, 295.76s/it]    


In [78]:
# Creating the classified_sentence_data DataFrame
classified_sentence_data = pd.DataFrame({
    'Fear Class 0 Probability': [item[0] for item in probabilities_fear_list],
    'Fear Class 1 Probability': [item[1] for item in probabilities_fear_list],
    'Fear Class 2 Probability': [item[2] for item in probabilities_fear_list],
    'Violence Class 0 Probability': [item[0] for item in probabilities_violence_list],
    'Violence Class 1 Probability': [item[1] for item in probabilities_violence_list],
    'Violence Class 2 Probability': [item[2] for item in probabilities_violence_list],
    'Text': texts_list,
    'ID': ids_list
})

In [79]:
# Creating the classified_sentence_data DataFrame
classified_sentence_data = pd.DataFrame({
    'Fear Class 0 Probability': [item[0] for item in probabilities_fear_list],
    'Fear Class 1 Probability': [item[1] for item in probabilities_fear_list],
    'Fear Class 2 Probability': [item[2] for item in probabilities_fear_list],
    'Violence Class 0 Probability': [item[0] for item in probabilities_violence_list],
    'Violence Class 1 Probability': [item[1] for item in probabilities_violence_list],
    'Violence Class 2 Probability': [item[2] for item in probabilities_violence_list],
    'Text': texts_list,
    'ID': ids_list
})

# Merging additional columns from unprocessed_data based on 'Index' and 'ID'
classified_sentence_data = pd.merge(classified_sentence_data, unprocessed_data[['Index', 'Date', 'Document Type', 'Source', 'PDF Indicator', 'Content Page', 'PDF Link']], 
                                    left_on='ID', right_on='Index', how='left')

In [80]:
# Dropping the redundant 'Index' column
classified_sentence_data.drop(columns=['Index'], inplace=True)

In [81]:
# Reordering the columns
classified_sentence_data = classified_sentence_data[['ID', 'Date',
                                                     'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability',
                                                     'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability',
                                                     'Document Type', 'Source', 'PDF Indicator', 'Text', 'Content Page', 'PDF Link']]

In [82]:
# Group result_df by 'ID'
classified_sentence_data_grouped = classified_sentence_data.groupby('ID')

# Lists to store data for the new DataFrame for fear probabilities
new_ids = []
new_texts = []
new_fear_class_0_probs = []
new_fear_class_1_probs = []
new_fear_class_2_probs = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

# Fear Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Fear Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_fear_class_0_probs.append(selected_row['Fear Class 0 Probability'])
        new_fear_class_1_probs.append(selected_row['Fear Class 1 Probability'])
        new_fear_class_2_probs.append(selected_row['Fear Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)

Processing Groups:   0%|          | 0/267 [00:00<?, ?it/s]

Processing Groups: 100%|██████████| 267/267 [00:00<00:00, 433.34it/s]


In [83]:
# Lists to store data for the new DataFrame for violence probabilities
new_ids = []
new_texts = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

# Violence Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Violence Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_violence_class_0_probs.append(selected_row['Violence Class 0 Probability'])
        new_violence_class_1_probs.append(selected_row['Violence Class 1 Probability'])
        new_violence_class_2_probs.append(selected_row['Violence Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)

Processing Groups: 100%|██████████| 267/267 [00:00<00:00, 476.47it/s]


### FIX MISMATCH IN ARRAY LENGTHS

In [84]:
# Find entries in unprocessed_data['Index'] but not in new_ids
# entries_not_in_new_ids = unprocessed_data[~unprocessed_data['Index'].isin(new_ids)]
# entries_not_in_new_ids

In [85]:
# Subset unprocessed_data so it doesn't include the problematic row(s)
# unprocessed_data = unprocessed_data[~unprocessed_data['Index'].isin(entries_not_in_new_ids['Index'])]

In [86]:
# Create a new DataFrame containing the original unprocessed data but with the representative classification scores
processed_data = pd.DataFrame({
    'ID': new_ids,
    'Date': unprocessed_data['Date'],
    'Fear Class 0 Probability': new_fear_class_0_probs,
    'Fear Class 1 Probability': new_fear_class_1_probs,
    'Fear Class 2 Probability': new_fear_class_2_probs,
    'Violence Class 0 Probability': new_violence_class_0_probs,
    'Violence Class 1 Probability': new_violence_class_1_probs,
    'Violence Class 2 Probability': new_violence_class_2_probs,
    'Document Type': unprocessed_data['Document Type'],
    'Source': unprocessed_data['Source'],
    'Text': unprocessed_data['Text'],
    'PDF Indicator': unprocessed_data['PDF Indicator'],
    'Content Page': unprocessed_data['Content Page'],
    'PDF Link': unprocessed_data['PDF Link']
})

In [87]:
print("Length of new_ids:", len(new_ids))
print("Length of new_fear_class_0_probs:", len(new_fear_class_0_probs))
print("Length of new_fear_class_1_probs:", len(new_fear_class_1_probs))
print("Length of new_fear_class_2_probs:", len(new_fear_class_2_probs))
print("Length of new_violence_class_0_probs:", len(new_violence_class_0_probs))
print("Length of new_violence_class_1_probs:", len(new_violence_class_1_probs))
print("Length of new_violence_class_2_probs:", len(new_violence_class_2_probs))

# Check the length of other arrays as well

print("Length of unprocessed_data['Date']:", len(unprocessed_data['Date']))
print("Length of unprocessed_data['Document Type']:", len(unprocessed_data['Document Type']))
print("Length of unprocessed_data['Source']:", len(unprocessed_data['Source']))
print("Length of unprocessed_data['Text']:", len(unprocessed_data['Text']))
print("Length of unprocessed_data['PDF Indicator']:", len(unprocessed_data['PDF Indicator']))
print("Length of unprocessed_data['Content Page']:", len(unprocessed_data['Content Page']))
print("Length of unprocessed_data['PDF Link']:", len(unprocessed_data['PDF Link']))

Length of new_ids: 267
Length of new_fear_class_0_probs: 267
Length of new_fear_class_1_probs: 267
Length of new_fear_class_2_probs: 267
Length of new_violence_class_0_probs: 267
Length of new_violence_class_1_probs: 267
Length of new_violence_class_2_probs: 267
Length of unprocessed_data['Date']: 267
Length of unprocessed_data['Document Type']: 267
Length of unprocessed_data['Source']: 267
Length of unprocessed_data['Text']: 267
Length of unprocessed_data['PDF Indicator']: 267
Length of unprocessed_data['Content Page']: 267
Length of unprocessed_data['PDF Link']: 267


In [88]:
processed_data

Unnamed: 0,ID,Date,Fear Class 0 Probability,Fear Class 1 Probability,Fear Class 2 Probability,Violence Class 0 Probability,Violence Class 1 Probability,Violence Class 2 Probability,Document Type,Source,Text,PDF Indicator,Content Page,PDF Link
1284,23892,2015-01-19,0.997124,0.001916,0.000959,0.997205,0.001014,0.001781,"Kommittédirektiv,Rättsliga dokument","Finansdepartementet,Regeringen",Kommittédirektiv Tilläggsdirektiv till Ä...,1,https://www.regeringen.se/rattsliga-dokument/k...,https://www.regeringen.se/contentassets/f92423...
1289,23897,2010-12-09,0.926720,0.068160,0.005120,0.998537,0.000505,0.000957,Regeringsuppdrag,Finansdepartementet,Regeringsbeslut II 11 Fi2010...,1,https://www.regeringen.se/regeringsuppdrag/201...,https://www.regeringen.se/contentassets/ff076f...
1363,23971,2010-03-17,0.970289,0.027116,0.002595,0.997199,0.000993,0.001808,"Rättsliga dokument,Skrivelse",Finansdepartementet,Regeringens skrivelse 2009/10:153 Redov...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/b3d401...
1364,23972,2010-04-15,0.985701,0.011494,0.002805,0.996342,0.001223,0.002435,"Rättsliga dokument,Skrivelse",Finansdepartementet,Regeringens skrivelse 2009/10:195 R...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/5e13e6...
1365,23973,2010-04-15,0.841206,0.155265,0.003529,0.996111,0.001277,0.002612,"Rättsliga dokument,Skrivelse",Finansdepartementet,1 Regeringens skrivelse 2009/10:102 Ut...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/0e73f3...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2489,25097,2010-10-14,0.986507,0.011598,0.001894,0.995839,0.001434,0.002726,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss Förfaranderegler för altern...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/b9bc47...
2490,25098,2010-10-15,0.987704,0.010358,0.001938,0.994348,0.002070,0.003582,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Vissa tekniska mervärdessk...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/93766c...
2491,25099,2010-11-11,0.987210,0.011505,0.001284,0.997094,0.000960,0.001946,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Ändrade bestämmelser o...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/4b4015...
2492,25100,2010-11-19,0.978293,0.017912,0.003796,0.995808,0.001422,0.002770,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Skattefrihet för alkol...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/667b17...


In [89]:
# STEP 6: CLASSIFY ARTICLES AS A WHOLE WITH NO STOPWORDS

# Lists to store probabilities, texts, and IDs for both classifiers
probabilities_fear_list_2 = []
probabilities_violence_list_2 = []
texts_list_2 = []
ids_list_2 = []

text_entries_2 = unprocessed_data['Text'].tolist()
ids_2 = unprocessed_data['Index'].tolist()

for text, text_id in tqdm(zip(text_entries_2, ids_2), total=len(text_entries_2), desc="Processing Texts"):
    try:
        # Lowercase the entire text
        text_lower = text.lower()

        # Tokenize and remove stop words
        tokens = nltk.word_tokenize(text_lower)
        tokens_filtered = [word for word in tokens if word.isalnum() and word not in stop_words_swedish]

        # Reconstruct the text
        text_filtered = ' '.join(tokens_filtered)

        # Process the entire text at once
        inputs = tokenizer(text_filtered, return_tensors="pt", truncation=True, max_length=512)

        # Calculate probabilities for classifier_fear
        outputs_fear = classifier_fear(**inputs)
        probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]
        probabilities_fear_list_2.append(probabilities_fear)

        # Calculate probabilities for classifier_violence
        outputs_violence = classifier_violence(**inputs)
        probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]
        probabilities_violence_list_2.append(probabilities_violence)

        texts_list_2.append(text_filtered)
        ids_list_2.append(text_id)
    except Exception as e:
        # If an error occurs, print the error and append zeros to the probability lists
        print(f"Error processing text with ID {text_id}: {e}")
        probabilities_fear_list_2.append([0.0, 0.0, 0.0])
        probabilities_violence_list_2.append([0.0, 0.0, 0.0])
        texts_list_2.append("Error processing text")
        ids_list_2.append(text_id)

# Now, probabilities_fear_list_2 and probabilities_violence_list_2 contain probabilities
# for classifier_fear and classifier_violence respectively, for each text.

# Creating the classified_sentence_data DataFrame
classified_sentence_data_2 = pd.DataFrame({
    'Fear Class 0 Probability_2': [item[0] for item in probabilities_fear_list_2],
    'Fear Class 1 Probability_2': [item[1] for item in probabilities_fear_list_2],
    'Fear Class 2 Probability_2': [item[2] for item in probabilities_fear_list_2],
    'Violence Class 0 Probability_2': [item[0] for item in probabilities_violence_list_2],
    'Violence Class 1 Probability_2': [item[1] for item in probabilities_violence_list_2],
    'Violence Class 2 Probability_2': [item[2] for item in probabilities_violence_list_2],
    'Text_2': texts_list_2,
    'ID_2': ids_list_2
})


Processing Texts:  64%|██████▍   | 172/267 [06:39<03:53,  2.46s/it]

Error processing text with ID 24375: 'float' object has no attribute 'lower'


Processing Texts:  67%|██████▋   | 180/267 [07:05<05:03,  3.49s/it]

Error processing text with ID 24383: 'float' object has no attribute 'lower'


Processing Texts:  69%|██████▊   | 183/267 [07:08<02:51,  2.04s/it]

Error processing text with ID 24386: 'float' object has no attribute 'lower'


Processing Texts:  91%|█████████ | 243/267 [09:19<00:48,  2.03s/it]

Error processing text with ID 24848: 'float' object has no attribute 'lower'
Error processing text with ID 24849: 'float' object has no attribute 'lower'


Processing Texts: 100%|██████████| 267/267 [10:18<00:00,  2.32s/it]


In [90]:
# STEP 7: MERGE DATA

# Columns to be joined from classified_sentence_data_2
columns_to_join = ['ID_2', 'Fear Class 0 Probability_2', 'Fear Class 1 Probability_2', 'Fear Class 2 Probability_2',
                    'Violence Class 0 Probability_2', 'Violence Class 1 Probability_2', 'Violence Class 2 Probability_2']

# Merge processed_data with the selected columns from classified_sentence_data_2
fully_classified_data = pd.merge(processed_data, classified_sentence_data_2[columns_to_join], how='left', left_on='ID', right_on='ID_2')

# Drop the redundant 'ID_2' column
fully_classified_data.drop(columns=['ID_2'], inplace=True)

In [91]:
# SAVE
fully_classified_data.to_csv('part4.csv', index=False)

### Test Data

In [92]:
# Load Unclassified Data
test = pd.read_csv("part4.csv")

In [93]:
test

Unnamed: 0,ID,Date,Fear Class 0 Probability,Fear Class 1 Probability,Fear Class 2 Probability,Violence Class 0 Probability,Violence Class 1 Probability,Violence Class 2 Probability,Document Type,Source,Text,PDF Indicator,Content Page,PDF Link,Fear Class 0 Probability_2,Fear Class 1 Probability_2,Fear Class 2 Probability_2,Violence Class 0 Probability_2,Violence Class 1 Probability_2,Violence Class 2 Probability_2
0,23892,2015-01-19,0.997124,0.001916,0.000959,0.997205,0.001014,0.001781,"Kommittédirektiv,Rättsliga dokument","Finansdepartementet,Regeringen",Kommittédirektiv Tilläggsdirektiv till Ä...,1,https://www.regeringen.se/rattsliga-dokument/k...,https://www.regeringen.se/contentassets/f92423...,0.995913,0.002912,0.001176,0.999380,0.000277,0.000343
1,23897,2010-12-09,0.926720,0.068160,0.005120,0.998537,0.000505,0.000957,Regeringsuppdrag,Finansdepartementet,Regeringsbeslut II 11 Fi2010...,1,https://www.regeringen.se/regeringsuppdrag/201...,https://www.regeringen.se/contentassets/ff076f...,0.975695,0.019489,0.004816,0.996882,0.001371,0.001747
2,23971,2010-03-17,0.970289,0.027116,0.002595,0.997199,0.000993,0.001808,"Rättsliga dokument,Skrivelse",Finansdepartementet,Regeringens skrivelse 2009/10:153 Redov...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/b3d401...,0.988013,0.008220,0.003767,0.997215,0.001168,0.001617
3,23972,2010-04-15,0.985701,0.011494,0.002805,0.996342,0.001223,0.002435,"Rättsliga dokument,Skrivelse",Finansdepartementet,Regeringens skrivelse 2009/10:195 R...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/5e13e6...,0.991046,0.006013,0.002941,0.998041,0.000867,0.001092
4,23973,2010-04-15,0.841206,0.155265,0.003529,0.996111,0.001277,0.002612,"Rättsliga dokument,Skrivelse",Finansdepartementet,1 Regeringens skrivelse 2009/10:102 Ut...,1,https://www.regeringen.se/rattsliga-dokument/s...,https://www.regeringen.se/contentassets/0e73f3...,0.987737,0.008680,0.003583,0.998221,0.000832,0.000948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,25097,2010-10-14,0.986507,0.011598,0.001894,0.995839,0.001434,0.002726,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss Förfaranderegler för altern...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/b9bc47...,0.988581,0.007922,0.003496,0.996288,0.001471,0.002241
263,25098,2010-10-15,0.987704,0.010358,0.001938,0.994348,0.002070,0.003582,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Vissa tekniska mervärdessk...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/93766c...,0.988235,0.008384,0.003382,0.997186,0.001197,0.001617
264,25099,2010-11-11,0.987210,0.011505,0.001284,0.997094,0.000960,0.001946,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Ändrade bestämmelser o...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/4b4015...,0.984703,0.010831,0.004466,0.997005,0.001206,0.001789
265,25100,2010-11-19,0.978293,0.017912,0.003796,0.995808,0.001422,0.002770,"Lagrådsremiss,Rättsliga dokument",Finansdepartementet,Lagrådsremiss 1 Skattefrihet för alkol...,1,https://www.regeringen.se/rattsliga-dokument/l...,https://www.regeringen.se/contentassets/667b17...,0.988969,0.007676,0.003355,0.995607,0.001699,0.002695
