# Packages

In [1]:
# Import Packages and Model Commands
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
import pandas as pd
import nltk
from tqdm import tqdm
import datetime
from nltk.corpus import stopwords

# Swedish stopwords
nltk.download('stopwords')
stop_words_swedish = set(stopwords.words('swedish'))  # a set gives fast membership tests

# Download the NLTK sentence tokenizer data.
# Recent NLTK releases look up the 'punkt_tab' resource for sent_tokenize/word_tokenize,
# while older releases use the 'punkt' models, so both are requested to keep the
# tokenizer calls further down working on either.
nltk.download("punkt_tab")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Models

In [2]:
# Pre-trained checkpoints from Recorded Future. The tokenizer is taken from the fear
# checkpoint; the two sequence classifiers are loaded from their own checkpoints.
fear_checkpoint = "RecordedFuture/Swedish-Sentiment-Fear"
violence_checkpoint = "RecordedFuture/Swedish-Sentiment-Violence"

tokenizer = BertTokenizerFast.from_pretrained(fear_checkpoint)
classifier_fear = BertForSequenceClassification.from_pretrained(fear_checkpoint)
classifier_violence = BertForSequenceClassification.from_pretrained(violence_checkpoint)

# Data

In [3]:
# Load Data
# The CSV is addressed by a relative path, so it is resolved against the notebook's
# current working directory.
data = pd.read_csv("unclassified_data.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'unclassified_data.csv'

In [7]:
# Keep only the rows whose 'Text' value differs from the literal string 'NO CONTENT'
has_text = data['Text'] != 'NO CONTENT'
data = data[has_text]

In [24]:
# Give the 'Unnamed: 0' column the name 'Index'
data = data.rename(columns={'Unnamed: 0': 'Index'})

In [25]:
# Show the first row of data
data.head(1)

Unnamed: 0,Index,Date,Document Type,Source,PDF Indicator,Text,Content Page,PDF Link
0,0,02 november 2023,Remiss,Utbildningsdepartementet,1,2023 -04-26 U2023/01467 ...,https://www.regeringen.se/remisser/2023/11/inb...,https://www.regeringen.se/contentassets/c9981c...


# Test set

In [45]:
# Use the first 5 rows of data as the test set
unprocessed_data = data.iloc[:5]

In [46]:
# Number of rows in the test set
len(unprocessed_data)

5

In [47]:
# Show the first row of the test set
unprocessed_data.head(1)

Unnamed: 0,Index,Date,Document Type,Source,PDF Indicator,Text,Content Page,PDF Link
0,0,02 november 2023,Remiss,Utbildningsdepartementet,1,2023 -04-26 U2023/01467 ...,https://www.regeringen.se/remisser/2023/11/inb...,https://www.regeringen.se/contentassets/c9981c...


# Classify Test Data

### Set up lists for sentence classification

In [77]:
# Accumulators for the sentence-level classification: class probabilities from each
# classifier, the classified sentence, and the ID of the document it belongs to.
probabilities_fear_list, probabilities_violence_list = [], []
texts_list, ids_list = [], []

In [78]:
# Pull the document texts ('Text' column) and their identifiers ('Index' column)
# out of the test set as plain Python lists
text_entries = unprocessed_data['Text'].to_list()
ids = unprocessed_data['Index'].to_list()

### Sentence classification

In [79]:
# Score every sentence of every document with both classifiers.
# - Sentences are split with the Swedish Punkt model because the documents are Swedish;
#   nltk.sent_tokenize falls back to the English model when no language is given.
# - Inference runs under torch.no_grad(): no gradients are needed here, so autograd
#   bookkeeping is skipped, which saves memory and time.
with torch.no_grad():
    for text, text_id in tqdm(zip(text_entries, ids), total=len(text_entries), desc="Processing Texts"):
        sentences = nltk.sent_tokenize(text, language="swedish")

        for sentence in sentences:
            # Sentences longer than 512 tokens are truncated to 512
            inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)

            # Class probabilities from classifier_fear (softmax over the logits)
            outputs_fear = classifier_fear(**inputs)
            probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]
            probabilities_fear_list.append(probabilities_fear)

            # Class probabilities from classifier_violence for the same inputs
            outputs_violence = classifier_violence(**inputs)
            probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]
            probabilities_violence_list.append(probabilities_violence)

            # Remember which sentence was scored and which document it came from
            texts_list.append(sentence)
            ids_list.append(text_id)

# Now, probabilities_fear_list and probabilities_violence_list contain probabilities
# for classifier_fear and classifier_violence respectively, for each sentence.

Processing Texts: 100%|██████████| 5/5 [00:49<00:00,  9.98s/it]


In [82]:
# Assemble the sentence-level results: one row per classified sentence with the three
# class probabilities from each classifier, the sentence text and its document ID.
sentence_columns = {}
for class_label, column_name in enumerate(('Fear Class 0 Probability',
                                           'Fear Class 1 Probability',
                                           'Fear Class 2 Probability')):
    sentence_columns[column_name] = [probs[class_label] for probs in probabilities_fear_list]
for class_label, column_name in enumerate(('Violence Class 0 Probability',
                                           'Violence Class 1 Probability',
                                           'Violence Class 2 Probability')):
    sentence_columns[column_name] = [probs[class_label] for probs in probabilities_violence_list]
sentence_columns['Text'] = texts_list
sentence_columns['ID'] = ids_list

classified_sentence_data = pd.DataFrame(sentence_columns)

In [84]:
# Attach the document-level columns to every sentence row by matching the sentence's
# 'ID' with the 'Index' column of unprocessed_data; the left join keeps every sentence row.
metadata_columns = ['Index', 'Date', 'Document Type', 'Source', 'PDF Indicator', 'Content Page', 'PDF Link']
classified_sentence_data = classified_sentence_data.merge(
    unprocessed_data[metadata_columns],
    how='left',
    left_on='ID',
    right_on='Index',
)

In [86]:
# 'Index' duplicates 'ID' after the merge, so it is removed
classified_sentence_data = classified_sentence_data.drop(columns=['Index'])

In [87]:
# Put the columns in presentation order: identifiers, fear scores, violence scores,
# then the document metadata and text
sentence_column_order = [
    'ID', 'Date',
    'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability',
    'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability',
    'Document Type', 'Source', 'PDF Indicator', 'Text', 'Content Page', 'PDF Link',
]
classified_sentence_data = classified_sentence_data[sentence_column_order]

In [88]:
# Show the first classified sentence row
classified_sentence_data.head(1)

Unnamed: 0,ID,Date,Fear Class 0 Probability,Fear Class 1 Probability,Fear Class 2 Probability,Violence Class 0 Probability,Violence Class 1 Probability,Violence Class 2 Probability,Document Type,Source,PDF Indicator,Text,Content Page,PDF Link
0,0,02 november 2023,0.996882,0.002071,0.001046,0.999418,0.000276,0.000306,Remiss,Utbildningsdepartementet,1,2023 -04-26 U2023/01467 ...,https://www.regeringen.se/remisser/2023/11/inb...,https://www.regeringen.se/contentassets/c9981c...


### Classify articles by sentence

In [89]:
# Group the classified sentences by the 'ID' of the document they belong to
classified_sentence_data_grouped = classified_sentence_data.groupby(by='ID')

In [90]:
# Accumulators for the per-document results: document IDs, representative texts, and
# the three class probabilities for each of the fear and violence classifiers
new_ids, new_texts = [], []
new_fear_class_0_probs, new_fear_class_1_probs, new_fear_class_2_probs = [], [], []
new_violence_class_0_probs, new_violence_class_1_probs, new_violence_class_2_probs = [], [], []

In [91]:
# Fear Classification
# Collapse the sentence-level fear scores into one representative score per document:
# take the document's k sentences with the lowest 'Fear Class 0 Probability' and average
# their three class probabilities. k depends on how many sentences the document has.
# Each entry is (exclusive upper bound on the sentence count, k); documents with 200 or
# more sentences use k = 5.
fear_top_k_by_size = ((10, 1), (50, 2), (100, 3), (200, 4))

# The loop variable is named doc_id so the built-in id() is not shadowed for the rest
# of the notebook.
for doc_id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    top_k = next((k for limit, k in fear_top_k_by_size if num_rows < limit), 5)

    # nsmallest returns the rows ordered from lowest to highest class-0 probability
    selected_rows = group.nsmallest(top_k, 'Fear Class 0 Probability')

    new_ids.append(doc_id)
    new_texts.append(selected_rows['Text'].values[0])  # text of the lowest class-0 sentence
    new_fear_class_0_probs.append(selected_rows['Fear Class 0 Probability'].mean())
    new_fear_class_1_probs.append(selected_rows['Fear Class 1 Probability'].mean())
    new_fear_class_2_probs.append(selected_rows['Fear Class 2 Probability'].mean())

Processing Groups: 100%|██████████| 5/5 [00:00<00:00, 180.28it/s]


In [92]:
# Start the ID and text accumulators afresh for the violence pass and (re)create the
# three violence probability accumulators
new_ids, new_texts = [], []
new_violence_class_0_probs, new_violence_class_1_probs, new_violence_class_2_probs = [], [], []

In [93]:
# Violence Classification
# Collapse the sentence-level violence scores into one representative score per document:
# take the document's k sentences with the lowest 'Violence Class 0 Probability' and
# average their three class probabilities. k depends on how many sentences the document has.
# Each entry is (exclusive upper bound on the sentence count, k); documents with 200 or
# more sentences use k = 5.
violence_top_k_by_size = ((10, 1), (50, 2), (100, 3), (200, 4))

# The loop variable is named doc_id so the built-in id() is not shadowed for the rest
# of the notebook.
for doc_id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    top_k = next((k for limit, k in violence_top_k_by_size if num_rows < limit), 5)

    # nsmallest returns the rows ordered from lowest to highest class-0 probability
    selected_rows = group.nsmallest(top_k, 'Violence Class 0 Probability')

    new_ids.append(doc_id)
    new_texts.append(selected_rows['Text'].values[0])  # text of the lowest class-0 sentence
    new_violence_class_0_probs.append(selected_rows['Violence Class 0 Probability'].mean())
    new_violence_class_1_probs.append(selected_rows['Violence Class 1 Probability'].mean())
    new_violence_class_2_probs.append(selected_rows['Violence Class 2 Probability'].mean())

Processing Groups: 100%|██████████| 5/5 [00:00<00:00, 430.12it/s]


In [100]:
# Create a new DataFrame containing the original unprocessed data but with the representative classification scores

# One row per document ID with the representative sentence-level scores. All of these
# lists were filled while iterating the same groupby('ID') object, so they line up with
# each other by position.
representative_scores = pd.DataFrame({
    'ID': new_ids,
    'Fear Class 0 Probability': new_fear_class_0_probs,
    'Fear Class 1 Probability': new_fear_class_1_probs,
    'Fear Class 2 Probability': new_fear_class_2_probs,
    'Violence Class 0 Probability': new_violence_class_0_probs,
    'Violence Class 1 Probability': new_violence_class_1_probs,
    'Violence Class 2 Probability': new_violence_class_2_probs,
})

# Document-level columns, keyed by the same identifier ('Index' renamed to 'ID')
document_data = unprocessed_data[
    ['Index', 'Date', 'Document Type', 'Source', 'Text', 'PDF Indicator', 'Content Page', 'PDF Link']
].rename(columns={'Index': 'ID'})

# Combine scores and documents by ID rather than by row position. The scores follow the
# sorted group-key order, while unprocessed_data keeps its own row order and index labels,
# so a positional combination could pair scores with the wrong document or fail on a
# length mismatch. The left join keeps every document; one without any classified
# sentence gets NaN scores.
processed_data = document_data.merge(representative_scores, how='left', on='ID')

# Same column order as before
processed_data = processed_data[[
    'ID', 'Date',
    'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability',
    'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability',
    'Document Type', 'Source', 'Text', 'PDF Indicator', 'Content Page', 'PDF Link',
]]

### Classify articles as wholes without stopwords. This abandons the sentences approach and just looks at non-stopwords across the entire article

In [102]:
# Accumulators for the whole-text classification: class probabilities from each
# classifier, the filtered text that was classified, and the document ID
probabilities_fear_list_2, probabilities_violence_list_2 = [], []
texts_list_2, ids_list_2 = [], []

In [103]:
# Document texts ('Text') and identifiers ('Index') of the test set as plain lists
text_entries_2 = unprocessed_data['Text'].to_list()
ids_2 = unprocessed_data['Index'].to_list()

In [104]:
# Score each document as a whole after removing stop words.
# - Tokenization uses the Swedish Punkt model because the documents are Swedish;
#   nltk.word_tokenize falls back to the English model when no language is given.
# - Inference runs under torch.no_grad(): no gradients are needed here, so autograd
#   bookkeeping is skipped, which saves memory and time.
with torch.no_grad():
    for text, text_id in tqdm(zip(text_entries_2, ids_2), total=len(text_entries_2), desc="Processing Texts"):
        # Lowercase the entire text
        text_lower = text.lower()

        # Tokenize, then keep only alphanumeric tokens that are not Swedish stop words
        tokens = nltk.word_tokenize(text_lower, language="swedish")
        tokens_filtered = [word for word in tokens if word.isalnum() and word not in stop_words_swedish]

        # Reconstruct the text
        text_filtered = ' '.join(tokens_filtered)

        # Process the entire text at once.
        # NOTE: truncation=True with max_length=512 means that only the first 512 tokens
        # of the filtered text reach the classifiers.
        inputs = tokenizer(text_filtered, return_tensors="pt", truncation=True, max_length=512)

        # Class probabilities from classifier_fear (softmax over the logits)
        outputs_fear = classifier_fear(**inputs)
        probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]
        probabilities_fear_list_2.append(probabilities_fear)

        # Class probabilities from classifier_violence for the same inputs
        outputs_violence = classifier_violence(**inputs)
        probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]
        probabilities_violence_list_2.append(probabilities_violence)

        # Remember the filtered text that was scored and its document ID
        texts_list_2.append(text_filtered)
        ids_list_2.append(text_id)

# Now, probabilities_fear_list_2 and probabilities_violence_list_2 contain probabilities
# for classifier_fear and classifier_violence respectively, for each text.

Processing Texts: 100%|██████████| 5/5 [00:08<00:00,  1.64s/it]


In [105]:
# Assemble the whole-text results: one row per document with the three class
# probabilities from each classifier, the filtered text and the document ID.
# All column names carry a '_2' suffix.
whole_text_columns = {}
for class_label, column_name in enumerate(('Fear Class 0 Probability_2',
                                           'Fear Class 1 Probability_2',
                                           'Fear Class 2 Probability_2')):
    whole_text_columns[column_name] = [probs[class_label] for probs in probabilities_fear_list_2]
for class_label, column_name in enumerate(('Violence Class 0 Probability_2',
                                           'Violence Class 1 Probability_2',
                                           'Violence Class 2 Probability_2')):
    whole_text_columns[column_name] = [probs[class_label] for probs in probabilities_violence_list_2]
whole_text_columns['Text_2'] = texts_list_2
whole_text_columns['ID_2'] = ids_list_2

classified_sentence_data_2 = pd.DataFrame(whole_text_columns)

In [106]:
# Display the whole-text classification results
classified_sentence_data_2

Unnamed: 0,Fear Class 0 Probability_2,Fear Class 1 Probability_2,Fear Class 2 Probability_2,Violence Class 0 Probability_2,Violence Class 1 Probability_2,Violence Class 2 Probability_2,Text_2,ID_2
0,0.976913,0.018334,0.004753,0.996448,0.001466,0.002086,2023 utbildningsdepartementet utbildningsminis...,0
1,0.943017,0.040467,0.016517,0.996085,0.002019,0.001896,regeringsbeslut 1 2023 klimat näringslivsdepar...,1
2,0.985671,0.010385,0.003944,0.996642,0.001547,0.001811,remiss 2023 arbetsmarknadsdepartementet enhete...,2
3,0.990124,0.007137,0.002739,0.997457,0.001298,0.001245,remiss 2023 01488 försvarsdepartementet enhete...,3
4,0.984264,0.011228,0.004507,0.997125,0.001188,0.001687,remiss 2023 reviderat socialdepartementet enhe...,4


### Merge the whole-text classifications and processed_data

In [108]:
# Columns taken from classified_sentence_data_2 for the join: the ID key followed by
# the fear and violence probability columns
columns_to_join = ['ID_2']
columns_to_join += ['Fear Class 0 Probability_2', 'Fear Class 1 Probability_2', 'Fear Class 2 Probability_2']
columns_to_join += ['Violence Class 0 Probability_2', 'Violence Class 1 Probability_2', 'Violence Class 2 Probability_2']

In [109]:
# Left-join the whole-text scores onto processed_data, matching 'ID' with 'ID_2'
whole_text_scores = classified_sentence_data_2[columns_to_join]
fully_classified_data = processed_data.merge(
    whole_text_scores,
    how='left',
    left_on='ID',
    right_on='ID_2',
)

In [111]:
# 'ID_2' duplicates 'ID' after the merge, so it is removed
fully_classified_data = fully_classified_data.drop(columns=['ID_2'])

### Clean up and organize the data in fully_classified_data

In [114]:
# Shorten the probability column names: 'Sen' marks the sentence-based scores and
# 'Whole' marks the whole-text scores (the columns that carried the '_2' suffix)
short_column_names = {
    'Fear Class 0 Probability': 'Fear 0 Sen',
    'Fear Class 1 Probability': 'Fear 1 Sen',
    'Fear Class 2 Probability': 'Fear 2 Sen',
    'Violence Class 0 Probability': 'Violence 0 Sen',
    'Violence Class 1 Probability': 'Violence 1 Sen',
    'Violence Class 2 Probability': 'Violence 2 Sen',
    'Fear Class 0 Probability_2': 'Fear 0 Whole',
    'Fear Class 1 Probability_2': 'Fear 1 Whole',
    'Fear Class 2 Probability_2': 'Fear 2 Whole',
    'Violence Class 0 Probability_2': 'Violence 0 Whole',
    'Violence Class 1 Probability_2': 'Violence 1 Whole',
    'Violence Class 2 Probability_2': 'Violence 2 Whole',
}
fully_classified_data = fully_classified_data.rename(columns=short_column_names)

# Keywords

### Define keywords

In [119]:

# Combined list of Swedish words related to immigration, integration, assimilation, Middle Eastern cultures, and languages
keywords = [
    "Invandring", "Migrationspolitik", "Asylsökande", "Flyktingar", "Immigrant", "Utvandring",
    "Integration", "Integrationspolitik", "Mångkultur", "Integrationstjänster", "Integrationssvårigheter", "Integrationsprocess",
    "Assimilation", "Anpassning", "Kulturell assimilering", "Kulturell anpassning", "Språklig assimilering", "Social assimilering",
    "Arabisk", "Syrisk", "Irakisk", "Iransk", "Palestinsk", "Libanesisk", "Turkisk", "Kurdisk", "Persisk",
    "Araber", "Syrier", "Irakier", "Iranier", "Palestinier", "Libaneser", "Turkar", "Kurder"
]

# Additional words related to immigration, integration, refugees, migration, and assimilation
# ("Uppehållstillstånd" was previously misspelled as "Upphållstillstånd", a string that
# does not occur in correctly spelled text and therefore never matched.)
additional_keywords = [
    "Invandring", "Integration", "Flykting", "Asyl", "Migrationsverket", "Anhöriginvandring", "Utlänning",
    "Samhällsintegration", "Språkundervisning", "Mångfald", "Tolerans", "Diskriminering", "Rasism", "Inkludering",
    "Immigrationslagar", "Gränskontroll", "Uppehållstillstånd", "Integrationspolitik",
    "Skyddsbehövande", "Internflykting", "Utvisning", "Assimilering", "Återvandring",
    "Anpassning", "Kulturkrock", "Etnicitet", "Terrorism", "Muslim", "Islam", "Segregation", "Assimilation",
    "Syrien", "Iran", "Turkiet", "Irak", "Palestina", "Libanon", "Mellanöstern"
]

# Merge both lists, dropping repeats while keeping first-seen order. dict.fromkeys is used
# instead of set() because the iteration order of a set of strings can change between
# interpreter runs, which would reorder the keyword columns from one run to the next.
keywords = list(dict.fromkeys(keywords + additional_keywords))

# Dictionary to store keyword frequencies (one entry per keyword, initially an empty list)
keyword_frequencies = {keyword: [] for keyword in keywords}

### Dummies for keywords

In [121]:
# Build one 0/1 indicator per keyword (1 when the keyword occurs in the row's 'Text',
# ignoring case; missing text counts as 0) and store it in keyword_frequencies
document_texts = fully_classified_data['Text']
for keyword in keywords:
    contains_keyword = document_texts.str.contains(keyword, case=False, na=False)
    keyword_frequencies[keyword] = contains_keyword.astype(int)

# Turn the indicators into a DataFrame with one column per keyword
keyword_df = pd.DataFrame(keyword_frequencies)

# Place the keyword columns next to the classification columns
processed_data_keyword_coded = pd.concat([fully_classified_data, keyword_df], axis='columns')

### Reformat time

In [123]:
# Separate Months and Years for processed_data_keyword_coded
# Swedish month names (lower case) mapped to the English month names
month_mapping = {
    'januari': 'January',
    'februari': 'February',
    'mars': 'March',
    'april': 'April',
    'maj': 'May',
    'juni': 'June',
    'juli': 'July',
    'augusti': 'August',
    'september': 'September',
    'oktober': 'October',
    'november': 'November',
    'december': 'December'
}


def convert_swedish_to_english(date_string):
    """Translate the month of a Swedish 'day month year' date string to English.

    Args:
        date_string: Date such as '02 november 2023'. The three parts may be separated
            by any amount of whitespace and the month may use any letter case.

    Returns:
        The date as 'day Month year' with the English month name,
        e.g. '02 November 2023'. Day and year are passed through unchanged.

    Raises:
        ValueError: If the string does not consist of exactly three parts.
        KeyError: If the month is not a key of month_mapping.
    """
    # split() without an argument collapses runs of whitespace and ignores leading and
    # trailing whitespace; split(' ') would yield empty parts there and break the unpacking.
    day, month, year = date_string.split()
    month = month_mapping[month.lower()]
    return f"{day} {month} {year}"

# Turn the Swedish 'Date' strings of processed_data_keyword_coded into datetime values:
# translate the month name to English, then parse as day, full month name, four-digit year
english_dates = processed_data_keyword_coded['Date'].apply(convert_swedish_to_english)
processed_data_keyword_coded['Date'] = pd.to_datetime(english_dates, format='%d %B %Y')

# Split the month number and the year out of the parsed dates into their own columns
parsed_dates = processed_data_keyword_coded['Date']
processed_data_keyword_coded['Month'] = parsed_dates.dt.month
processed_data_keyword_coded['Year'] = parsed_dates.dt.year

### Mark content with keywords present

In [125]:
# 'Keyword Present' is 1 for a row when at least one of its keyword indicator columns
# is non-zero, and 0 otherwise
keyword_flags = processed_data_keyword_coded[keywords]
processed_data_keyword_coded['Keyword Present'] = keyword_flags.any(axis=1).astype(int)

In [126]:
# Add a collection of the keywords in each text to each row of processed_data_keyword_coded
def _keywords_in_text(text):
    """Return the keywords that occur in text, compared case-insensitively.

    A value that is not a string (for example NaN for a missing text) yields an empty
    list instead of raising, in line with the na=False handling used when the keyword
    indicator columns are built.
    """
    if not isinstance(text, str):
        return []
    lowered_text = text.lower()  # lower-case once instead of once per keyword
    return [keyword for keyword in keywords if keyword.lower() in lowered_text]


processed_data_keyword_coded['Keywords'] = processed_data_keyword_coded['Text'].apply(_keywords_in_text)

# Calculate F-V Scores

### Sentence-based calculation

In [132]:
# Sentence FVS = (1 - mean of the sentence-based fear and violence class-0 probabilities) * 100
sentence_class_0_mean = (processed_data_keyword_coded['Fear 0 Sen'] + processed_data_keyword_coded['Violence 0 Sen']) / 2
processed_data_keyword_coded['Sentence FVS'] = (1 - sentence_class_0_mean) * 100

### Article-based calculation

In [135]:
# Article FVS = (1 - mean of the whole-text fear and violence class-0 probabilities) * 100
article_class_0_mean = (processed_data_keyword_coded['Fear 0 Whole'] + processed_data_keyword_coded['Violence 0 Whole']) / 2
processed_data_keyword_coded['Article FVS'] = (1 - article_class_0_mean) * 100

### Averaged calculation

In [137]:
# Average FVS = mean of the sentence-based and the article-based F-V Scores of each row
fvs_sum = processed_data_keyword_coded['Sentence FVS'] + processed_data_keyword_coded['Article FVS']
processed_data_keyword_coded['Average FVS'] = fvs_sum / 2

# Save to .csv

In [None]:
# Save processed data to CSV
# output_file_name is not assigned in any visible cell of this notebook, so a default
# file name is used when it has not been defined; a value set beforehand is kept.
output_file_name = globals().get('output_file_name', 'processed_data_keyword_coded.csv')

# index=False leaves the DataFrame's row index out of the file
processed_data_keyword_coded.to_csv(output_file_name, index=False)

print(f"Processed data has been saved to {output_file_name}")