In [1]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
import torch

In [2]:
# read the input CSV into the DataFrame that the rest of the notebook scores
data = pd.read_csv('../raw_data/final_data.csv')

In [5]:
# (rows, columns) of the loaded frame
data.shape

(69137, 16)

In [10]:
# FinBERT tone checkpoint: fetch the tokenizer and the 3-label classifier,
# then wrap both in a text-classification pipeline
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
finbert = BertForSequenceClassification.from_pretrained(
    'yiyanghkust/finbert-tone',
    num_labels=3,
)
nlp = pipeline(
    "text-classification",
    model=finbert,
    tokenizer=tokenizer,
)

In [11]:
# define a function to classify a single tweet
def classify_tweet(tweet):
    """Score one tweet with the module-level ``nlp`` text-classification pipeline.

    Parameters
    ----------
    tweet : str
        Text to classify.

    Returns
    -------
    pd.Series
        Four positional values: the 'Positive', 'Negative' and 'Neutral'
        scores (0.0 for any label missing from the pipeline output), followed
        by the label with the highest score.
    """
    # Without top_k the pipeline reports only its single best label, which
    # would leave the other two score columns at the 0.0 fallback below.
    # top_k=None asks for the score of every label. Truncation keeps very long
    # inputs within the standard 512-token BERT limit instead of erroring.
    results = nlp(tweet, top_k=None, truncation=True, max_length=512)
    # Accept both a flat [{label, score}, ...] result and a nested
    # [[{label, score}, ...]] result for a single input.
    if results and isinstance(results[0], list):
        results = results[0]
    label_scores = {result['label']: result['score'] for result in results}
    positive_score = label_scores.get('Positive', 0.0)
    negative_score = label_scores.get('Negative', 0.0)
    neutral_score = label_scores.get('Neutral', 0.0)
    top_score_label = max(label_scores, key=label_scores.get)
    return pd.Series([positive_score, negative_score, neutral_score, top_score_label])

In [12]:
# trial run: draw a 1,000-row sample (fixed random_state) and score only
# those tweets, storing the three scores and the winning label as new columns
small_df = data.sample(n=1000, random_state=1)
small_df[
    ['positive', 'negative', 'neutral', 'sentiment']
] = small_df['tweet'].apply(classify_tweet)

In [16]:
# score every tweet in the full frame (classify_tweet runs once per row) and
# attach the three scores plus the winning label as new columns
data[
    ['positive', 'negative', 'neutral', 'sentiment']
] = data['tweet'].apply(classify_tweet)

In [17]:
# persist the scored frame as CSV, without the index column
data.to_csv(
    '../raw_data/sentimental_data.csv',
    index=False,
)

In [23]:
# number of rows assigned to each sentiment label
data['sentiment'].value_counts()

Neutral     52641
Positive    10603
Negative     5893
Name: sentiment, dtype: int64