# Settings and Data

### Load General Packages and Settings

In [1]:
# Import Packages and Model Commands
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
import pandas as pd
import nltk
from tqdm import tqdm
import datetime

# Download the NLTK sentence tokenizer data
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [154]:
# Load pre-trained model and tokenizer from Recorded Future
tokenizer = BertTokenizerFast.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_fear= BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_violence = BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Violence")

### Load Data

In [155]:
# Load Data
unprocessed_data = pd.read_csv("swe_gov_docs_1_2_test.csv")
unprocessed_data = unprocessed_data.rename(columns = {"Unnamed: 0": "Index"})

# Classification

### Classify Sentences

In [156]:
# Lists to store probabilities, texts, and IDs for both classifiers
probabilities_fear_list = []
probabilities_violence_list = []
texts_list = []
ids_list = []

In [157]:
# Extract texts and IDs from the desired columns (assuming column names are "text_column" and "id_column")
text_entries = unprocessed_data['Text'].tolist()
ids = unprocessed_data['Index'].tolist()

In [158]:
for text, text_id in tqdm(zip(text_entries, ids), total=len(text_entries), desc="Processing Texts"):
    sentences = nltk.sent_tokenize(text)
    
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
        
        # Calculate probabilities for classifier_fear
        outputs_fear = classifier_fear(**inputs)
        probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]
        probabilities_fear_list.append(probabilities_fear)
        
        # Calculate probabilities for classifier_violence
        outputs_violence = classifier_violence(**inputs)  # Use classifier_violence here
        probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]
        probabilities_violence_list.append(probabilities_violence)
        
        texts_list.append(sentence)
        ids_list.append(text_id)

# Now, probabilities_fear_list and probabilities_violence_list contain probabilities
# for classifier_fear and classifier_violence respectively, for each sentence.

Processing Texts: 100%|██████████| 40/40 [1:02:35<00:00, 93.88s/it] 


In [159]:
# Creating the classified_sentence_data DataFrame
classified_sentence_data = pd.DataFrame({
    'Fear Class 0 Probability': [item[0] for item in probabilities_fear_list],
    'Fear Class 1 Probability': [item[1] for item in probabilities_fear_list],
    'Fear Class 2 Probability': [item[2] for item in probabilities_fear_list],
    'Violence Class 0 Probability': [item[0] for item in probabilities_violence_list],
    'Violence Class 1 Probability': [item[1] for item in probabilities_violence_list],
    'Violence Class 2 Probability': [item[2] for item in probabilities_violence_list],
    'Text': texts_list,
    'ID': ids_list
})
# Merging additional columns from unprocessed_data based on 'Index' and 'ID'
classified_sentence_data = pd.merge(classified_sentence_data, unprocessed_data[['Index', 'Source', 'Date', 'Document_Type', 'URL']], 
                                    left_on='ID', right_on='Index', how='left')

# Dropping the redundant 'Index' column
classified_sentence_data.drop(columns=['Index'], inplace=True)

# Reordering the columns
classified_sentence_data = classified_sentence_data[['ID', 'Date',
                                                     'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability',
                                                     'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability',
                                                     'Text', 'Source', 'Document_Type', 'URL']]

### Classify Articles

In [160]:
# Group result_df by 'ID'
classified_sentence_data_grouped = classified_sentence_data.groupby('ID')

In [161]:
# Lists to store data for the new DataFrame for fear probabilities
new_ids = []
new_texts = []
new_fear_class_0_probs = []
new_fear_class_1_probs = []
new_fear_class_2_probs = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

In [162]:
# Fear Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Fear Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_fear_class_0_probs.append(selected_row['Fear Class 0 Probability'])
        new_fear_class_1_probs.append(selected_row['Fear Class 1 Probability'])
        new_fear_class_2_probs.append(selected_row['Fear Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)

Processing Groups:   0%|          | 0/40 [00:00<?, ?it/s]

Processing Groups: 100%|██████████| 40/40 [00:00<00:00, 484.52it/s]


In [163]:
# Lists to store data for the new DataFrame for violence probabilities
new_ids = []
new_texts = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

In [164]:
# Violence Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Violence Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_violence_class_0_probs.append(selected_row['Violence Class 0 Probability'])
        new_violence_class_1_probs.append(selected_row['Violence Class 1 Probability'])
        new_violence_class_2_probs.append(selected_row['Violence Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)

Processing Groups: 100%|██████████| 40/40 [00:00<00:00, 407.20it/s]


In [165]:
# Create a new DataFrame containing the original unprocessed data but with the representative classification scores
processed_data = pd.DataFrame({
    'ID': new_ids,
    'Fear Class 0 Probability': new_fear_class_0_probs,
    'Fear Class 1 Probability': new_fear_class_1_probs,
    'Fear Class 2 Probability': new_fear_class_2_probs,
    'Violence Class 0 Probability': new_violence_class_0_probs,
    'Violence Class 1 Probability': new_violence_class_1_probs,
    'Violence Class 2 Probability': new_violence_class_2_probs,
    'Date': unprocessed_data['Date'],
    'DocumentType': unprocessed_data['Document_Type'],
    'Source': unprocessed_data['Source'],
    'Text': unprocessed_data['Text'],
    'URL': unprocessed_data['URL']
})

### Download Classified Data

In [166]:
# Save the processed DataFrame as a CSV file
processed_data.to_csv('processed_data.csv', index=False)

# Keywords

### Load Data

In [2]:
# Load the CSV file into a DataFrame
processed_data = pd.read_csv('processed_data.csv')

### Define Keywords

In [3]:
# Combined list of Swedish words related to immigration, integration, assimilation, Middle Eastern cultures, and languages
keywords = [
    "Invandring", "Migrationspolitik", "Asylsökande", "Flyktingar", "Immigrant", "Utvandring",
    "Integration", "Integrationspolitik", "Mångkultur", "Integrationstjänster", "Integrationssvårigheter", "Integrationsprocess",
    "Assimilation", "Anpassning", "Kulturell assimilering", "Kulturell anpassning", "Språklig assimilering", "Social assimilering",
    "Arabisk", "Syrisk", "Irakisk", "Iransk", "Palestinsk", "Libanesisk", "Turkisk", "Kurdisk", "Persisk",
    "Araber", "Syrier", "Irakier", "Iranier", "Palestinier", "Libaneser", "Turkar", "Kurder"
]

# Additional words related to immigration, integration, refugees, migration, and assimilation
additional_keywords = [
    "Invandring", "Integration", "Flykting", "Asyl", "Migrationsverket", "Anhöriginvandring", "Utlänning", 
    "Samhällsintegration", "Språkundervisning", "Mångfald", "Tolerans", "Diskriminering", "Rasism", "Inkludering", 
    "Immigrationslagar", "Gränskontroll", "Upphållstillstånd", "Integrationspolitik", 
    "Skyddsbehövande", "Internflykting", "Utvisning", "Assimilering", "Återvandring", 
    "Anpassning", "Kulturkrock", "Etnicitet", "Terrorism", "Muslim", "Islam", "Segregation", "Assimilation",
    "Syrien", "Iran", "Turkiet", "Irak", "Palestina", "Libanon", "Mellanöstern"
]

# Remove duplicates and add additional_keywords to the original list
keywords = list(set(keywords + additional_keywords))

# Dictionary to store keyword frequencies
keyword_frequencies = {keyword: [] for keyword in keywords}

### Calculate the Frequency of Keywords by Content Piece

In [4]:
# Use tqdm to create a progress bar for the loop
for keyword in tqdm(keywords, desc="Processing Keywords"):
    # Iterate through each keyword and check its presence in each text entry
    keyword_occurrences = processed_data['Text'].str.contains(keyword, case=False, na=False) # case insensitive because case = False
    keyword_frequencies[keyword] = keyword_occurrences.astype(int)

# Create a new DataFrame to store the keyword frequencies
keyword_df = pd.DataFrame(keyword_frequencies)

# Concatenate the keyword frequencies DataFrame with the original DataFrame
processed_data_keyword_coded = pd.concat([processed_data, keyword_df], axis=1)

Processing Keywords: 100%|██████████| 63/63 [00:02<00:00, 21.61it/s]


# Dates

In [6]:
# Separate Months and Years for processed_data_with_keywords
# Custom mapping for Swedish month names to English month names
month_mapping = {
    'januari': 'January',
    'februari': 'February',
    'mars': 'March',
    'april': 'April',
    'maj': 'May',
    'juni': 'June',
    'juli': 'July',
    'augusti': 'August',
    'september': 'September',
    'oktober': 'October',
    'november': 'November',
    'december': 'December'
}

# Function to convert Swedish month names to English
def convert_swedish_to_english(date_string):
    day, month, year = date_string.split(' ')
    month = month_mapping[month.lower()]
    return f"{day} {month} {year}"

# Convert Times on processed_data_keyword_coded
# Apply the conversion function to the 'Date' column
processed_data_keyword_coded['Date'] = processed_data_keyword_coded['Date'].apply(convert_swedish_to_english)

# Convert 'Date' column to datetime format
processed_data_keyword_coded['Date'] = pd.to_datetime(processed_data_keyword_coded['Date'], format='%d %B %Y')

# Extract month and year into new columns
processed_data_keyword_coded['Month'] = processed_data_keyword_coded['Date'].dt.month
processed_data_keyword_coded['Year'] = processed_data_keyword_coded['Date'].dt.year


# Keywords Present?, FV Scores, and Specifying Keywords

In [8]:
# STEP 9: ADD INDICATORS SHOWING IF ANY KEYWORD IS PRESENT IN AN ARTICLE, THE F-V SCORE FOR EACH ARTICLE, AND THE SPECIFIC KEYWORDS PRESENT IN EACH ARTICLE

# Check each row for the presence of any keywords; if one is there, make 'Keyword Present' into 1; otherwise make it 0
processed_data_keyword_coded['Keyword Present'] = processed_data_keyword_coded[keywords].any(axis=1).astype(int)

# Calculate F-V Score for each row of the processed_data_keyword_coded data
processed_data_keyword_coded['F-V Score'] = (1 - ((processed_data_keyword_coded['Fear Class 0 Probability'] + processed_data_keyword_coded['Violence Class 0 Probability']) / 2)) * 100

# Add a collection of the keywords in each text to each row of processed_data_keyword_coded
processed_data_keyword_coded['Keywords'] = processed_data_keyword_coded['Text'].apply(lambda text: [keyword for keyword in keywords if keyword.lower() in text.lower()])

In [9]:
processed_data_keyword_coded

Unnamed: 0,ID,Fear Class 0 Probability,Fear Class 1 Probability,Fear Class 2 Probability,Violence Class 0 Probability,Violence Class 1 Probability,Violence Class 2 Probability,Date,DocumentType,Source,...,Internflykting,Rasism,Assimilation,Samhällsintegration,Integration,Month,Year,Keyword Present,F-V Score,Keywords
0,0,0.996384,0.002557,0.001059,0.999454,0.000237,0.000309,2023-10-25,Ärendeförteckning,"Arbetsmarknadsdepartementet,Finansdepartemente...",...,0,0,0,0,0,10,2023,0,0.208095,[]
1,1,0.985025,0.013341,0.001633,0.997682,0.000794,0.001524,2023-10-25,Kommenterad dagordning,Landsbygds- och infrastrukturdepartementet,...,0,0,0,0,0,10,2023,0,0.864657,[]
2,2,0.994334,0.003724,0.001943,0.997236,0.001277,0.001487,2023-10-25,Remiss,Justitiedepartementet,...,0,0,0,0,0,10,2023,1,0.421503,[Diskriminering]
3,3,0.754918,0.238354,0.006728,0.982671,0.003774,0.013556,2023-10-25,"Departementsserien och promemorior,Rättsliga d...",Justitiedepartementet,...,0,0,0,0,0,10,2023,1,13.120582,"[Migrationsverket, Utlänning, Utvisning, Anpas..."
4,4,0.852181,0.085894,0.061925,0.998921,0.000461,0.000619,2023-10-25,Regeringsuppdrag,"Justitiedepartementet,Regeringen",...,0,0,0,0,0,10,2023,0,7.444933,[]
5,5,0.99627,0.002704,0.001027,0.998764,0.000592,0.000644,2023-06-27,Remiss,Arbetsmarknadsdepartementet,...,0,0,0,0,1,6,2023,1,0.248309,"[Migrationsverket, Segregation, Diskriminering..."
6,6,0.995365,0.003028,0.001607,0.998342,0.000822,0.000836,2023-09-08,Remiss,Försvarsdepartementet,...,0,0,0,0,0,9,2023,0,0.314632,[]
7,7,0.093567,0.856172,0.050261,0.693625,0.25279,0.053585,2014-03-25,"Departementsserien och promemorior,Rättsliga d...",Kulturdepartementet,...,0,1,1,0,1,3,2014,1,60.640395,"[Asyl, Gränskontroll, Assimilering, Utlänning,..."
8,8,0.845011,0.113078,0.041911,0.992598,0.002403,0.005,2023-10-24,"Proposition,Rättsliga dokument","Landsbygds- och infrastrukturdepartementet,Reg...",...,0,0,0,0,1,10,2023,1,8.119575,"[Gränskontroll, Anpassning, Tolerans, Flykting..."
9,9,0.985345,0.008876,0.005779,0.999192,0.000291,0.000517,2023-10-24,"Proposition,Rättsliga dokument",Socialdepartementet,...,0,0,0,0,0,10,2023,0,0.773138,[]


# Output processed_data_keyword_coded to .csv

In [None]:
# STEP 10: OUTPUT VARIABLE .CSV FILES

# Save processed data to CSV
processed_data_keyword_coded.to_csv(output_file_name, index=False)

print(f"Processed data has been saved to {output_file_name}")