In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, pipeline

# Input CSV with the text to classify.
file_path = "processed_wsb.csv"
df = pd.read_csv(file_path)

# Single source of truth for the checkpoint, so the pipeline and the
# standalone tokenizer can never point at different models.
MODEL_NAME = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"

# Run on the first CUDA GPU when one is available; otherwise fall back to CPU
# (device=-1) instead of failing on machines without a GPU.
pipe = pipeline(
    "text-classification",
    model=MODEL_NAME,
    device=0 if torch.cuda.is_available() else -1,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def analyze_sentiment(text):
    """Classify the sentiment of a single text with the module-level ``pipe``.

    Truncation to 512 tokens is performed by the pipeline's own tokenization
    step. The previous encode -> decode -> re-tokenize round trip tokenized
    every text twice, and because decoding is not guaranteed to reproduce a
    string that re-tokenizes to the same ids, the second (untruncated) pass
    could still exceed the length limit.

    Args:
        text: The string to classify.

    Returns:
        The first prediction returned by the pipeline for ``text``; callers
        in this script read its ``'label'`` and ``'score'`` entries.
    """
    # Tokenizer keyword arguments given at call time are forwarded to the
    # pipeline's preprocessing step.
    return pipe(text, truncation=True, max_length=512)[0]

# Fail fast when the expected text column is missing from the input file.
if 'MergedColumn' not in df.columns:
    raise ValueError("The file does not contain a 'MergedColumn' column.")

# Convert every cell with str(), exactly as the per-row version did.
# NOTE(review): cells that pandas loaded as missing values are therefore
# classified as the literal text "nan" -- confirm this is intended.
texts = [str(value) for value in df['MergedColumn']]

# Hand the whole list to the pipeline in one call so it can batch the work on
# the device, instead of invoking it once per row (the library warns about
# sequential GPU use in that case). Truncation to 512 tokens happens inside
# the pipeline's tokenization step.
BATCH_SIZE = 32
if texts:
    results = pipe(texts, batch_size=BATCH_SIZE, truncation=True, max_length=512)
else:
    # Nothing to classify; skip the model call for an empty frame.
    results = []

# Add sentiment and confidence as new columns (one prediction per row,
# in row order).
df['SentimentLabel'] = [prediction['label'] for prediction in results]
df['Confidence'] = [prediction['score'] for prediction in results]

# Save the updated DataFrame back to a CSV
output_file = "labeled_wsb.csv"
df.to_csv(output_file, index=False)

print(f"Processed file saved successfully to {output_file}")


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Processed file saved successfully to labeled_wsb.csv
