# Settings and Data

### Load General Packages and Settings

In [153]:
# Import Packages and Model Commands
from transformers import BertForSequenceClassification, BertTokenizerFast
import torch
import pandas as pd
import nltk
from tqdm import tqdm
import datetime

# Download the NLTK sentence tokenizer data
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryanh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [154]:
# Load pre-trained model and tokenizer from Recorded Future
tokenizer = BertTokenizerFast.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_fear= BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Fear")
classifier_violence = BertForSequenceClassification.from_pretrained("RecordedFuture/Swedish-Sentiment-Violence")

### Load Data

In [155]:
# Load Data
unprocessed_data = pd.read_csv("swe_gov_docs_1_2_test.csv")
unprocessed_data = unprocessed_data.rename(columns = {"Unnamed: 0": "Index"})

# Classification

### Classify Sentences

In [156]:
# Lists to store probabilities, texts, and IDs for both classifiers
probabilities_fear_list = []
probabilities_violence_list = []
texts_list = []
ids_list = []

In [157]:
# Extract texts and IDs from the desired columns (assuming column names are "text_column" and "id_column")
text_entries = unprocessed_data['Text'].tolist()
ids = unprocessed_data['Index'].tolist()

In [158]:
for text, text_id in tqdm(zip(text_entries, ids), total=len(text_entries), desc="Processing Texts"):
    sentences = nltk.sent_tokenize(text)
    
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
        
        # Calculate probabilities for classifier_fear
        outputs_fear = classifier_fear(**inputs)
        probabilities_fear = torch.nn.functional.softmax(outputs_fear.logits, dim=1).tolist()[0]
        probabilities_fear_list.append(probabilities_fear)
        
        # Calculate probabilities for classifier_violence
        outputs_violence = classifier_violence(**inputs)  # Use classifier_violence here
        probabilities_violence = torch.nn.functional.softmax(outputs_violence.logits, dim=1).tolist()[0]
        probabilities_violence_list.append(probabilities_violence)
        
        texts_list.append(sentence)
        ids_list.append(text_id)

# Now, probabilities_fear_list and probabilities_violence_list contain probabilities
# for classifier_fear and classifier_violence respectively, for each sentence.

Processing Texts: 100%|██████████| 40/40 [1:02:35<00:00, 93.88s/it] 


In [159]:
# Creating the classified_sentence_data DataFrame
classified_sentence_data = pd.DataFrame({
    'Fear Class 0 Probability': [item[0] for item in probabilities_fear_list],
    'Fear Class 1 Probability': [item[1] for item in probabilities_fear_list],
    'Fear Class 2 Probability': [item[2] for item in probabilities_fear_list],
    'Violence Class 0 Probability': [item[0] for item in probabilities_violence_list],
    'Violence Class 1 Probability': [item[1] for item in probabilities_violence_list],
    'Violence Class 2 Probability': [item[2] for item in probabilities_violence_list],
    'Text': texts_list,
    'ID': ids_list
})
# Merging additional columns from unprocessed_data based on 'Index' and 'ID'
classified_sentence_data = pd.merge(classified_sentence_data, unprocessed_data[['Index', 'Source', 'Date', 'Document_Type', 'URL']], 
                                    left_on='ID', right_on='Index', how='left')

# Dropping the redundant 'Index' column
classified_sentence_data.drop(columns=['Index'], inplace=True)

# Reordering the columns
classified_sentence_data = classified_sentence_data[['ID', 'Date',
                                                     'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability',
                                                     'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability',
                                                     'Text', 'Source', 'Document_Type', 'URL']]

### Classify Articles

In [160]:
# Group result_df by 'ID'
classified_sentence_data_grouped = classified_sentence_data.groupby('ID')

In [161]:
# Lists to store data for the new DataFrame for fear probabilities
new_ids = []
new_texts = []
new_fear_class_0_probs = []
new_fear_class_1_probs = []
new_fear_class_2_probs = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

In [162]:
# Fear Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Fear Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_fear_class_0_probs.append(selected_row['Fear Class 0 Probability'])
        new_fear_class_1_probs.append(selected_row['Fear Class 1 Probability'])
        new_fear_class_2_probs.append(selected_row['Fear Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Fear Class 0 Probability')
        avg_class_0_prob = selected_rows['Fear Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Fear Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Fear Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_fear_class_0_probs.append(avg_class_0_prob)
        new_fear_class_1_probs.append(avg_class_1_prob)
        new_fear_class_2_probs.append(avg_class_2_prob)

Processing Groups:   0%|          | 0/40 [00:00<?, ?it/s]

Processing Groups: 100%|██████████| 40/40 [00:00<00:00, 484.52it/s]


In [163]:
# Lists to store data for the new DataFrame for violence probabilities
new_ids = []
new_texts = []
new_violence_class_0_probs = []
new_violence_class_1_probs = []
new_violence_class_2_probs = []

In [164]:
# Violence Classification
# Iterate through grouped_df
for id, group in tqdm(classified_sentence_data_grouped, desc="Processing Groups"):
    num_rows = len(group)
    # If there are less than 10 rows, select the row with the lowest Class 0 probability
    if num_rows < 10:
        min_index = group['Violence Class 0 Probability'].idxmin()
        selected_row = group.loc[min_index]
        new_ids.append(id)
        new_texts.append(selected_row['Text'])
        new_violence_class_0_probs.append(selected_row['Violence Class 0 Probability'])
        new_violence_class_1_probs.append(selected_row['Violence Class 1 Probability'])
        new_violence_class_2_probs.append(selected_row['Violence Class 2 Probability'])
    # If there are 10-49 rows, select the two rows with the lowest Class 0 probabilities
    elif 10 <= num_rows < 50:
        selected_rows = group.nsmallest(2, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 50-99 rows, select the three rows with the lowest Class 0 probabilities
    elif 50 <= num_rows < 100:
        selected_rows = group.nsmallest(3, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 100-199 rows, select the four rows with the lowest Class 0 probabilities
    elif 100 <= num_rows < 200:
        selected_rows = group.nsmallest(4, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)
    # If there are 200 or more rows, select the five rows with the lowest Class 0 probabilities
    else:
        selected_rows = group.nsmallest(5, 'Violence Class 0 Probability')
        avg_class_0_prob = selected_rows['Violence Class 0 Probability'].mean()
        avg_class_1_prob = selected_rows['Violence Class 1 Probability'].mean()
        avg_class_2_prob = selected_rows['Violence Class 2 Probability'].mean()
        new_ids.append(id)
        new_texts.append(selected_rows['Text'].values[0])  # Select the text from the first row
        new_violence_class_0_probs.append(avg_class_0_prob)
        new_violence_class_1_probs.append(avg_class_1_prob)
        new_violence_class_2_probs.append(avg_class_2_prob)

Processing Groups: 100%|██████████| 40/40 [00:00<00:00, 407.20it/s]


In [165]:
# Create a new DataFrame containing the original unprocessed data but with the representative classification scores
processed_data = pd.DataFrame({
    'ID': new_ids,
    'Fear Class 0 Probability': new_fear_class_0_probs,
    'Fear Class 1 Probability': new_fear_class_1_probs,
    'Fear Class 2 Probability': new_fear_class_2_probs,
    'Violence Class 0 Probability': new_violence_class_0_probs,
    'Violence Class 1 Probability': new_violence_class_1_probs,
    'Violence Class 2 Probability': new_violence_class_2_probs,
    'Date': unprocessed_data['Date'],
    'DocumentType': unprocessed_data['Document_Type'],
    'Source': unprocessed_data['Source'],
    'Text': unprocessed_data['Text'],
    'URL': unprocessed_data['URL']
})

### Download Classified Data

In [166]:
# Save the processed DataFrame as a CSV file
processed_data.to_csv('processed_data.csv', index=False)

# Identify and Sorty By Keywords

### Load Data

In [167]:
# Load the CSV file into a DataFrame
processed_data = pd.read_csv('processed_data.csv')

### Define Keywords

In [168]:
# Combined list of Swedish words related to immigration, integration, assimilation, Middle Eastern cultures, and languages
keywords = [
    "Invandring", "Migrationspolitik", "Asylsökande", "Flyktingar", "Immigrant", "Utvandring",
    "Integration", "Integrationspolitik", "Mångkultur", "Integrationstjänster", "Integrationssvårigheter", "Integrationsprocess",
    "Assimilation", "Anpassning", "Kulturell assimilering", "Kulturell anpassning", "Språklig assimilering", "Social assimilering",
    "Arabisk", "Syrisk", "Irakisk", "Iransk", "Palestinsk", "Libanesisk", "Turkisk", "Kurdisk",
    "Arabiska", "Syriska", "Persiska", "Kurdiska", "Turkiska", "Skräckvälde", "Skräckvälde"
]

# Additional words related to immigration, integration, refugees, migration, and assimilation
additional_keywords = [
    "Invandring", "Integration", "Flykting", "Asyl", "Migrationsverket", "Anhöriginvandring", "Utlänning", 
    "Samhällsintegration", "Språkundervisning", "Mångfald", "Tolerans", "Diskriminering", "Rasism", "Inkludering", 
    "Immigrationslagar", "Gränskontroll", "Upphållstillstånd", "Integrationspolitik", 
    "Skyddsbehövande", "Internflykting", "Utvisning", "HBTQ-flykting", "Assimilering", "Återvandring", 
    "Anpassning", "Kulturkrock", "Etnicitet", "Terrorism", "Muslim", "Muslimer", "Islam", "Segregation", "Assimilation"
]

# Remove duplicates and add additional_keywords to the original list
keywords = list(set(keywords + additional_keywords))

# Dictionary to store keyword frequencies
keyword_frequencies = {keyword: [] for keyword in keywords}

### Calculate the Frequency of Keywords by Content Piece

In [169]:
# Use tqdm to create a progress bar for the loop
for keyword in tqdm(keywords, desc="Processing Keywords"):
    # Iterate through each keyword and check its presence in each text entry
    keyword_occurrences = processed_data['Text'].str.contains(keyword, case=False, na=False)
    keyword_frequencies[keyword] = keyword_occurrences.astype(int)

# Create a new DataFrame to store the keyword frequencies
keyword_df = pd.DataFrame(keyword_frequencies)

# Concatenate the keyword frequencies DataFrame with the original DataFrame
processed_data_keyword_coded = pd.concat([processed_data, keyword_df], axis=1)

Processing Keywords: 100%|██████████| 60/60 [00:03<00:00, 19.88it/s]


### Table for Keyword Frequency

In [170]:
# Calculate totals for each word frequency column
word_frequencies_totals = processed_data_keyword_coded[keywords].sum()

# Sort word frequencies from max to min
sorted_word_frequencies = word_frequencies_totals.sort_values(ascending=False)

# View top 5 keywords
sorted_word_frequencies.head(5)

Migrationsverket    9
Flykting            8
Anpassning          7
Diskriminering      7
Integration         7
dtype: int64

### Create a Dataframe Containing Articles Containing Keywords and Add the Keyword Tags

In [171]:
# Create a boolean mask indicating rows where "Text" entries contain keywords
keyword_mask = processed_data['Text'].str.contains('|'.join(keywords), case=False, na=False)

# Create a new DataFrame containing only the rows with "Text" entries containing keywords
processed_data_with_keywords = processed_data[keyword_mask]

In [172]:
# Create a new DataFrame containing only the rows without "Text" entries containing keywords
processed_data_without_keywords = processed_data[~keyword_mask]

In [173]:
processed_data_with_keywords['Keywords'] = processed_data_with_keywords['Text'].apply(lambda text: [keyword for keyword in keywords if keyword.lower() in text.lower()])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data_with_keywords['Keywords'] = processed_data_with_keywords['Text'].apply(lambda text: [keyword for keyword in keywords if keyword.lower() in text.lower()])


# Sentiment Analysis

### Add a Feature/Metric called "Negativity Score"

"F-V Score" is [1 - (the average of Fear Class 0 Probability and Violence Class 0 Probability)] * 100.
The F-V Score is bounded [0, 100] with scores closer to 100 indicating more fearful/violent sentiment

In [174]:
# Calculate F-V Score for each row of the keyword data
processed_data_with_keywords['F-V Score'] = (1 - ((processed_data_with_keywords['Fear Class 0 Probability'] + processed_data_with_keywords['Violence Class 0 Probability']) / 2)) * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data_with_keywords['F-V Score'] = (1 - ((processed_data_with_keywords['Fear Class 0 Probability'] + processed_data_with_keywords['Violence Class 0 Probability']) / 2)) * 100


In [175]:
# Calculate F-V Score for each row of the without keyword data
processed_data_without_keywords['F-V Score'] = (1 - ((processed_data_without_keywords['Fear Class 0 Probability'] + processed_data_without_keywords['Violence Class 0 Probability']) / 2)) * 100

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  processed_data_without_keywords['F-V Score'] = (1 - ((processed_data_without_keywords['Fear Class 0 Probability'] + processed_data_without_keywords['Violence Class 0 Probability']) / 2)) * 100


In [176]:
# Rearrange columns in the w keyword data
processed_data_with_keywords = processed_data_with_keywords[['ID', 'F-V Score', 'Keywords', 'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability', 'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability', 'Date', 'DocumentType', 'Source', 'Text', 'URL']]

In [177]:
# Rearrange columns in the wo keyword data
processed_data_without_keywords = processed_data_without_keywords[['ID', 'F-V Score', 'Fear Class 0 Probability', 'Fear Class 1 Probability', 'Fear Class 2 Probability', 'Violence Class 0 Probability', 'Violence Class 1 Probability', 'Violence Class 2 Probability', 'Date', 'DocumentType', 'Source', 'Text', 'URL']]

### Compare F-V Scores Across Keyword Groups

In [178]:
# Assuming 'F-V Score' is the column name in both DataFrames
average_fv_score_with_keywords = processed_data_with_keywords['F-V Score'].mean()
average_fv_score_without_keywords = processed_data_without_keywords['F-V Score'].mean()

# Calculate the disparity in average scores
score_disparity = abs(average_fv_score_with_keywords - average_fv_score_without_keywords)

# Compare the average F-V Scores in a statement
if average_fv_score_with_keywords > average_fv_score_without_keywords:
    print(f"The average 'F-V Score' for rows with keywords ({average_fv_score_with_keywords:.2f}) is {score_disparity:.2f} higher than rows without keywords ({average_fv_score_without_keywords:.2f}).")
elif average_fv_score_with_keywords < average_fv_score_without_keywords:
    print(f"The average 'F-V Score' for rows with keywords ({average_fv_score_with_keywords:.2f}) is {score_disparity:.2f} lower than rows without keywords ({average_fv_score_without_keywords:.2f}).")
else:
    print("The average 'F-V Score' for rows with keywords is equal to rows without keywords.")

The average 'F-V Score' for rows with keywords (12.62) is 11.28 higher than rows without keywords (1.34).


### F-V Scores Over Time

In [179]:
# Separate Months and Years for processed_data_with_keywords
# Custom mapping for Swedish month names to English month names
month_mapping = {
    'januari': 'January',
    'februari': 'February',
    'mars': 'March',
    'april': 'April',
    'maj': 'May',
    'juni': 'June',
    'juli': 'July',
    'augusti': 'August',
    'september': 'September',
    'oktober': 'October',
    'november': 'November',
    'december': 'December'
}

# Function to convert Swedish month names to English
def convert_swedish_to_english(date_string):
    day, month, year = date_string.split(' ')
    month = month_mapping[month.lower()]
    return f"{day} {month} {year}"

In [180]:
# Apply the conversion function to the 'Date' column
processed_data_with_keywords['Date'] = processed_data_with_keywords['Date'].apply(convert_swedish_to_english)

# Convert 'Date' column to datetime format
processed_data_with_keywords['Date'] = pd.to_datetime(processed_data_with_keywords['Date'], format='%d %B %Y')

# Extract month and year into new columns
processed_data_with_keywords['Month'] = processed_data_with_keywords['Date'].dt.month
processed_data_with_keywords['Year'] = processed_data_with_keywords['Date'].dt.year

In [182]:
# Separate Months and Years for processed_data_without_keywords
# Apply the conversion function to the 'Date' column
processed_data_without_keywords['Date'] = processed_data_without_keywords['Date'].apply(convert_swedish_to_english)

# Convert 'Date' column to datetime format
processed_data_without_keywords['Date'] = pd.to_datetime(processed_data_without_keywords['Date'], format='%d %B %Y')

# Extract month and year into new columns
processed_data_without_keywords['Month'] = processed_data_without_keywords['Date'].dt.month
processed_data_without_keywords['Year'] = processed_data_without_keywords['Date'].dt.year

In [183]:
# Group by 'Month' and 'Year' and calculate the mean 'F-V Score' for processed_data_without_keywords
avg_fv_score_without_keywords_over_time = processed_data_without_keywords.groupby(['Month', 'Year'])['F-V Score'].mean().reset_index()

# Group by 'Month' and 'Year' and calculate the mean 'F-V Score' for processed_data_with_keywords
avg_fv_score_with_keywords_over_time = processed_data_with_keywords.groupby(['Month', 'Year'])['F-V Score'].mean().reset_index()

# Print the average F-V Scores for both DataFrames
print("\nAverage F-V Score for Articles Without Keywords:")
print(avg_fv_score_without_keywords_over_time)

print("Average F-V Score for Articles With Keywords:")
print(avg_fv_score_with_keywords_over_time)


Average F-V Score for Articles Without Keywords:
   Month  Year  F-V Score
0      5  2023   0.332220
1      7  2023   0.394948
2      8  2023   0.223389
3      9  2023   0.293168
4     10  2023   2.285819
Average F-V Score for Articles With Keywords:
   Month  Year  F-V Score
0      3  2014  60.640395
1      6  2023   0.248309
2      7  2023   0.290840
3     10  2023  10.622259


### Any Keyword Frequency by Month and Year

In [184]:
# Convert Times on processed_data_keyword_coded
# Apply the conversion function to the 'Date' column
processed_data_keyword_coded['Date'] = processed_data_keyword_coded['Date'].apply(convert_swedish_to_english)

# Convert 'Date' column to datetime format
processed_data_keyword_coded['Date'] = pd.to_datetime(processed_data_keyword_coded['Date'], format='%d %B %Y')

# Extract month and year into new columns
processed_data_keyword_coded['Month'] = processed_data_keyword_coded['Date'].dt.month
processed_data_keyword_coded['Year'] = processed_data_keyword_coded['Date'].dt.year

In [193]:
# Check each row for the presence of any keywords; if one is there, make 'Keyword Present' into 1; otherwise make it 0
processed_data_keyword_coded['Keyword Present'] = processed_data_keyword_coded[keywords].any(axis=1).astype(int)

In [200]:
# Group the data by 'Year' and 'Month', then calculate the mean of 'Keyword Present' column and count the number of rows in each group
keyword_present_over_time = processed_data_keyword_coded.groupby(['Year', 'Month'])['Keyword Present'].agg(['mean', 'size']).reset_index()

# Rename the columns for clarity
keyword_present_over_time.columns = ['Year', 'Month', 'Keyword Present Proportion', 'Number of Rows']

# 'result' now contains the table showing the proportion of rows with a 1 in 'Keyword Present' and the number of rows for each month within each year
print(keyword_present_over_time)

   Year  Month  Keyword Present Proportion  Number of Rows
0  2014      3                    1.000000               2
1  2023      5                    0.000000               2
2  2023      6                    1.000000               2
3  2023      7                    0.666667               6
4  2023      8                    0.000000               2
5  2023      9                    0.000000               4
6  2023     10                    0.500000              22


# Create Dataframes to Save Everything

In [203]:
type(keyword_present_over_time)

pandas.core.frame.DataFrame

In [204]:
type(avg_fv_score_without_keywords_over_time)

pandas.core.frame.DataFrame

In [206]:
type(avg_fv_score_with_keywords_over_time)

pandas.core.frame.DataFrame

In [207]:
average_fv_score_with_keywords

12.620295631258108

In [None]:
# Calculate F-V Score for each row of the processed_data_keyword_coded data
processed_data_keyword_coded['F-V Score'] = (1 - ((processed_data_keyword_coded['Fear Class 0 Probability'] + processed_data_keyword_coded['Violence Class 0 Probability']) / 2)) * 100

In [216]:
# Add a collection of the keywords in each text to each row of processed_data_keyword_coded
processed_data_keyword_coded['Keywords'] = processed_data_keyword_coded['Text'].apply(lambda text: [keyword for keyword in keywords if keyword.lower() in text.lower()])

In [217]:
processed_data_keyword_coded

Unnamed: 0,ID,Fear Class 0 Probability,Fear Class 1 Probability,Fear Class 2 Probability,Violence Class 0 Probability,Violence Class 1 Probability,Violence Class 2 Probability,Date,DocumentType,Source,...,Tolerans,Migrationsverket,Inkludering,Turkiska,Integrationspolitik,Month,Year,Keyword Present,F-V Score,Keywords
0,0,0.996384,0.002557,0.001059,0.999454,0.000237,0.000309,2023-10-25,Ärendeförteckning,"Arbetsmarknadsdepartementet,Finansdepartemente...",...,0,0,0,0,0,10,2023,0,0.208095,[]
1,1,0.985025,0.013341,0.001633,0.997682,0.000794,0.001524,2023-10-25,Kommenterad dagordning,Landsbygds- och infrastrukturdepartementet,...,0,0,0,0,0,10,2023,0,0.864657,[]
2,2,0.994334,0.003724,0.001943,0.997236,0.001277,0.001487,2023-10-25,Remiss,Justitiedepartementet,...,0,0,0,0,0,10,2023,1,0.421503,[Diskriminering]
3,3,0.754918,0.238354,0.006728,0.982671,0.003774,0.013556,2023-10-25,"Departementsserien och promemorior,Rättsliga d...",Justitiedepartementet,...,0,1,0,0,0,10,2023,1,13.120582,"[Utlänning, Anpassning, Utvisning, Migrationsv..."
4,4,0.852181,0.085894,0.061925,0.998921,0.000461,0.000619,2023-10-25,Regeringsuppdrag,"Justitiedepartementet,Regeringen",...,0,0,0,0,0,10,2023,0,7.444933,[]
5,5,0.99627,0.002704,0.001027,0.998764,0.000592,0.000644,2023-06-27,Remiss,Arbetsmarknadsdepartementet,...,0,1,0,0,0,6,2023,1,0.248309,"[Segregation, Diskriminering, Integration, Mig..."
6,6,0.995365,0.003028,0.001607,0.998342,0.000822,0.000836,2023-09-08,Remiss,Försvarsdepartementet,...,0,0,0,0,0,9,2023,0,0.314632,[]
7,7,0.093567,0.856172,0.050261,0.693625,0.25279,0.053585,2014-03-25,"Departementsserien och promemorior,Rättsliga d...",Kulturdepartementet,...,0,0,1,0,1,3,2014,1,60.640395,"[Invandring, Rasism, Utlänning, Segregation, A..."
8,8,0.845011,0.113078,0.041911,0.992598,0.002403,0.005,2023-10-24,"Proposition,Rättsliga dokument","Landsbygds- och infrastrukturdepartementet,Reg...",...,1,0,0,0,0,10,2023,1,8.119575,"[Anpassning, Integration, Flyktingar, Gränskon..."
9,9,0.985345,0.008876,0.005779,0.999192,0.000291,0.000517,2023-10-24,"Proposition,Rättsliga dokument",Socialdepartementet,...,0,0,0,0,0,10,2023,0,0.773138,[]


Date: 10/24/23
Ryan Wolff