In [15]:
! pip install pandas scikit-learn datasets transformers



In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer

In [None]:
# Forward slashes are accepted by open() on Windows as well as on POSIX systems;
# the previous backslash-separated path only resolved on Windows.
file_path = "FinancialPhraseBank-v1.0/FinancialPhraseBank-v1.0/Sentences_66Agree.txt"

# Use cp1252 encoding instead of utf-8
with open(file_path, 'r', encoding='cp1252') as f:
    lines = f.readlines()

# Inspect
print(lines[:5])

# The only annotations accepted after the trailing "@" of a record.
VALID_LABELS = {"positive", "negative", "neutral"}

sentences = []
labels = []
skipped = 0  # non-empty lines that did not end in a recognised "@label"

for line in lines:
    line = line.strip()
    if not line:
        continue

    # Each record has the form "<sentence>@<label>". Splitting on the LAST "@"
    # keeps any "@" inside the sentence intact and reads the label only from
    # the end of the line, so a sentence that merely mentions "@positive"
    # cannot override its real annotation.
    text, sep, label = line.rpartition("@")
    if not sep or label not in VALID_LABELS:
        skipped += 1  # malformed line: count it instead of dropping it silently
        continue

    sentences.append(text.strip())
    labels.append(label)

print(f"Loaded {len(sentences)} sentences.")
if skipped:
    print(f"Skipped {skipped} malformed lines.")

# Build the DataFrame and encode the string labels as integer class ids.
# NOTE(review): the tokenizer loaded further down comes from the
# "yiyanghkust/finbert-tone" checkpoint, whose model card appears to list
# 0=neutral, 1=positive, 2=negative. If that checkpoint's classification head
# is reused later, confirm this id order is the intended one.
label_to_id = {"negative": 0, "neutral": 1, "positive": 2}

df = pd.DataFrame({"text": sentences, "label": labels}).assign(
    label=lambda frame: frame["label"].map(label_to_id)
)

# Inspect: first rows, then the number of examples per class id.
print(df.head())
print(df["label"].value_counts())


# Split into train/test: 80/20, stratified on the label so both splits keep
# the same class proportions; random_state fixes the shuffle for re-runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)


# Convert to Hugging Face Dataset
# After the shuffled split the frames no longer have a default 0..n-1 index.
# preserve_index=False stops from_pandas from storing that leftover index as
# an extra "__index_level_0__" column next to "text" and "label".
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")


def preprocess(batch):
    """Tokenize the ``text`` entries of ``batch``.

    Sequences longer than 128 tokens are truncated and shorter ones are padded
    up to 128, so every encoded example has the same length.

    Returns whatever the tokenizer call produces for ``batch['text']``.
    """
    encoded = tokenizer(
        batch['text'],
        max_length=128,
        padding='max_length',
        truncation=True,
    )
    return encoded


# Tokenize both splits in batches (train first, then test).
train_dataset, test_dataset = (
    split.map(preprocess, batched=True)
    for split in (train_dataset, test_dataset)
)



['According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .@neutral\n', 'Technopolis plans to develop in stages an area of no less than 100,000 square meters in order to host companies working in computer technologies and telecommunications , the statement said .@neutral\n', 'With the new production plant the company would increase its capacity to meet the expected increase in demand and would improve the use of raw materials and therefore increase the production profitability .@positive\n', "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .@positive\n", "For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .@positive\n"]
Loaded 

config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/3373 [00:00<?, ? examples/s]

Map:   0%|          | 0/844 [00:00<?, ? examples/s]