# All Imports

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
import csv

In [3]:
tokenizer = BertTokenizer.from_pretrained('ProsusAI/finbert')
model = BertForSequenceClassification.from_pretrained('ProsusAI/finbert')

In [4]:
# import csv

# def read_multiline_string_from_csv(filename, column_name):
#     with open(filename, 'r+', newline='', encoding="utf8") as csvfile:
#         reader = csv.DictReader(csvfile)
#         header = reader.fieldnames
#         header.append('score')  # Add 'score' as a new column header
#         csvfile.seek(0)
#         writer = csv.DictWriter(csvfile, fieldnames=header)

#         # If 'score' column already exists, remove it temporarily
#         if 'score' in header:
#             header.remove('score')

#         # Write the updated header
#         writer.writeheader()

#         for row in reader:
#             txt = row[column_name]
#             if txt is not None:
#                 sentiment_score = finbert(txt)

#                 # Add 'score' column with its value at the end of the row
#                 row['score'] = sentiment_score
#                 writer.writerow(row)
#                 print(sentiment_score)


# Finbert

In [None]:
def finbert(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens = False, return_tensors = 'pt')

    input_id_chunks = tokens['input_ids'][0].split(510)
    attention_mask_chunks = tokens['attention_mask'][0].split(510)

    input_id_chunks, attention_mask_chunks = get_input_ids_and_attention_mask_chunk(tokens)

    input_ids = torch.stack(input_id_chunks)
    attention_mask = torch.stack(attention_mask_chunks)

    input_dict = {
        'input_ids' : input_ids.long(),
        'attention_mask' : attention_mask.int()
    }

    outputs = model(**input_dict)
    probabilities = torch.nn.functional.softmax(outputs[0], dim = -1 )
    mean_probabilities = probabilities.mean(dim = 0)

    return torch.argmax(mean_probabilities).item()

In [None]:
def get_input_ids_and_attention_mask_chunk(tokens):
    chunksize = 512
    input_id_chunks = list(tokens['input_ids'][0].split(chunksize - 2))
    attention_mask_chunks = list(tokens['attention_mask'][0].split(chunksize - 2))

    for i in range(len(input_id_chunks)):
        input_id_chunks[i] = torch.cat([
            torch.tensor([101]), input_id_chunks[i], torch.tensor([102])
        ])

        attention_mask_chunks[i] = torch.cat([
            torch.tensor([1]), attention_mask_chunks[i], torch.tensor([1])
        ])

        pad_length = chunksize - input_id_chunks[i].shape[0]

        if pad_length > 0:
            input_id_chunks[i] = torch.cat([
                input_id_chunks[i], torch.Tensor([0] * pad_length)
            ])
            attention_mask_chunks[i] = torch.cat([
                attention_mask_chunks[i], torch.Tensor([0] * pad_length)
            ])

    return input_id_chunks, attention_mask_chunks

In [None]:
# # Example usage:
# import pandas as pd
# filename = 'final_news_data.csv'  # Replace 'data.csv' with the path to your CSV file
# column_name = 'Article Body'  # Replace 'column_name' with the name of the column containing the multi-line string
# df = pd.read_csv(filename)
# # multiline_string = read_multiline_string_from_csv(filename, column_name)
# # Apply sentiment analysis and create a new column 'score'
# column_data = df['column_name']
# print(column_data)
# # df['score'] = df['Article Body'].apply(finbert(column_name))
# # # Save the updated DataFrame to a new CSV file
# # output_file_path = 'output_file.csv'  # Replace with your desired output file path
# # df.to_csv(output_file_path, index=False)

In [None]:
# import pandas as pd
# # Assuming you have a CSV file named 'your_data.csv'
# file_path = 'final_news_data.csv'
# # Load the data into a pandas DataFrame
# df = pd.read_csv(file_path)
# column_data = df['Article Body']
# print(column_data)

In [None]:
# import pandas as pd



# # Assuming you have a CSV file named 'your_data.csv'
# file_path = 'merged_data1.csv'

# # Load the data into a pandas DataFrame
# df = pd.read_csv(file_path)

# # Drop rows with NaN values in the 'Article Body' column
# df_without_nan = df.dropna(subset=['Article Body'])

# # Replace 'column_name' with the actual column name you want to extract
# column_data_without_nan = df_without_nan['Article Body']

# # Apply the finbert function to each row in the 'Article Body' column
# df_without_nan['score'] = df_without_nan['Article Body'].apply(finbert)

# # Save the DataFrame back to the CSV file with the new 'score' column
# df_without_nan.to_csv(file_path, index=False)

# # Print or use the extracted data as needed
# print(df_without_nan['score'])


In [None]:
# import pandas as pd

# # Assuming you have a CSV file named 'merged_data.csv'
# file_path = 'RI_TOI.csv'

# # Load the data into a pandas DataFrame
# df = pd.read_csv(file_path)

# # Drop rows with NaN values in the 'Article Body' column
# df_without_nan = df.dropna(subset=['Article Body'])

# # Replace 'column_name' with the actual column name you want to extract
# column_data_without_nan = df_without_nan['Article Body']

# # Apply the finbert function to each row in the 'Article Body' column
# df_without_nan['score'] = df_without_nan['Article Body'].apply(finbert)

# # Specify the new file path for the output CSV
# output_file_path = 'RI_TOI_output_data.csv'

# # Save the DataFrame to a new CSV file with the new 'score' column
# df_without_nan.to_csv(output_file_path, index=False)

# # Print or use the extracted data as needed
# print(df_without_nan['score'])


# Apply Finbert on Article Body and Generate Output CSV

In [None]:
import pandas as pd
from tqdm import tqdm

# Assuming you have a CSV file named 'merged_data.csv'
file_path = 'merged_data_f.csv'

# Load the data into a pandas DataFrame
df = pd.read_csv(file_path)

# Drop rows with NaN values in the 'Article Body' column
df_without_nan = df.dropna(subset=['Article Body'])

# Replace 'column_name' with the actual column name you want to extract
column_data_without_nan = df_without_nan['Article Body']

# Create an empty list to store scores
scores = []

# Iterate over each row in the 'Article Body' column and calculate scores
for text in tqdm(column_data_without_nan, desc="Calculating scores", unit=" articles"):
    score = finbert(text)
    scores.append(score)

# Add the scores to the DataFrame
df_without_nan['score'] = scores

# Specify the new file path for the output CSV
output_file_path = 'merged_data_f_output.csv'

# Save the DataFrame to a new CSV file with the new 'score' column
df_without_nan.to_csv(output_file_path, index=False)

# Print or use the extracted data as needed
print(df_without_nan['score'])


Calculating scores:   0%|          | 0/1141 [00:00<?, ? articles/s]Token indices sequence length is longer than the specified maximum sequence length for this model (758 > 512). Running this sequence through the model will result in indexing errors
Calculating scores: 100%|██████████| 1141/1141 [56:54<00:00,  2.99s/ articles]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_without_nan['score'] = scores


0       1
19      2
20      0
22      0
25      0
       ..
1553    0
1554    2
1555    2
1556    2
1558    2
Name: score, Length: 1141, dtype: int64


In [None]:
# import pandas as pd

# # Assuming you have a CSV file named 'data.csv'
# file_path = 'RI_TOI_output_data.csv'

# # Load the data into a pandas DataFrame
# df = pd.read_csv(file_path)

# # Specify the column name for which you want to find non-null values
# column_name = 'not_null'

# # Get the indices of rows where the specified column is not null
# non_null_indices = df[df[column_name].notnull()].index

# # Print the row numbers with non-null values for the specified column
# print("Row numbers with non-null values in column '{}':".format(column_name))
# print(non_null_indices)
