In [None]:
import pandas as pd
import json
import os

def load_data(file_path):
    """Load tweet annotations from a JSON file into a DataFrame.

    The file is expected to hold a JSON array of objects, each carrying the
    keys ``Tweet_ID``, ``Target_Ticker`` and ``Sentiment``.

    Args:
        file_path: Path of the JSON annotation file.

    Returns:
        pandas.DataFrame with the columns ``tweet_id``, ``stock_ticker`` and
        ``sentiment``, one row per annotation in file order. The three
        columns are present even when the file contains an empty array.

    Raises:
        KeyError: If an annotation lacks one of the expected keys.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    records = [
        {
            'tweet_id': item['Tweet_ID'],
            'stock_ticker': item['Target_Ticker'],
            'sentiment': item['Sentiment'],
        }
        for item in data
    ]
    # Passing the columns explicitly keeps the schema stable for empty input.
    return pd.DataFrame(records, columns=['tweet_id', 'stock_ticker', 'sentiment'])

def get_tweet_text(row):
    """Fetch the text of one tweet through the ``snscrape`` command line tool.

    Written to be used with ``DataFrame.apply(..., axis=1)``.

    Args:
        row: Row object (e.g. a pandas Series) with a ``tweet_id`` entry.
            ``row.name`` is used only in the progress message.

    Returns:
        The ``content`` field of the JSON emitted by snscrape, or ``None``
        when the tweet could not be fetched or parsed. In that case the error
        is printed rather than raised.
    """
    # Local import keeps this function self-contained within the cell.
    import subprocess

    tweet_id = row['tweet_id']
    try:
        # Pass an argument list (no shell) so the ID taken from the data file
        # can never be interpreted as shell syntax.
        completed = subprocess.run(
            ['snscrape', '--jsonl', '--max-results', '1', 'twitter-tweet', str(tweet_id)],
            stdout=subprocess.PIPE,
            text=True,
            timeout=60,  # one stuck request must not stall the whole apply()
            check=False,
        )
        tweet_json = json.loads(completed.stdout)
        content = tweet_json['content']
    except Exception as e:
        print(f'Error fetching tweet {tweet_id}: {str(e)}')  # Console log for errors
        return None

    # Logged outside the try block and without reading any global frame, so
    # the message is correct for every DataFrame this is applied to and a
    # logging problem cannot turn a successful fetch into None.
    print(f'Successfully fetched tweet: {tweet_id} (row {row.name})')  # Log progress
    return content

# Read both annotated splits from their JSON files.
train_data = load_data('TweetFinSent_Train.json')
test_data = load_data('TweetFinSent_Test.json')

# Attach the tweet text to every row of each split (train first, then test).
for banner, frame in (
    ('Fetching tweets for train_data...', train_data),
    ('Fetching tweets for test_data...', test_data),
):
    print(banner)  # Console log
    frame['tweet_text'] = frame.apply(get_tweet_text, axis=1)

# Write both splits to CSV once all fetching has finished.
for frame, csv_path in ((train_data, 'train_data.csv'), (test_data, 'test_data.csv')):
    frame.to_csv(csv_path, index=False)

print('Data saved to CSV.')


In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import joblib
import requests
from bs4 import BeautifulSoup
from sklearn.ensemble import RandomForestClassifier

# Read the exported splits and discard rows that have no tweet text.
train_data = pd.read_csv('train_data.csv').dropna(subset=['tweet_text'])
test_data = pd.read_csv('test_data.csv').dropna(subset=['tweet_text'])

# TF-IDF features feeding a class-balanced random forest.
tfidf_step = TfidfVectorizer(stop_words='english')
forest_step = RandomForestClassifier(class_weight='balanced', random_state=42)
pipeline = Pipeline([('tfidf', tfidf_step), ('classifier', forest_step)])

# Fit on the training split.
X_train, y_train = train_data['tweet_text'], train_data['sentiment']
pipeline.fit(X_train, y_train)

# Score on the held-out split.
X_test, y_test = test_data['tweet_text'], test_data['sentiment']
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print(report)

# Persist the fitted pipeline so it can be reloaded for prediction.
filename = 'finalized_model.sav'
joblib.dump(pipeline, filename)

# Deployment - Function to get Google finance news and predict sentiment
def get_google_finance_news(stock_ticker):
    """Scrape Google News results for a ticker and classify each snippet.

    The fitted pipeline is loaded on every call from the path held in the
    module-level ``filename`` variable.

    Args:
        stock_ticker: Ticker symbol (or any search term). The search query
            sent is ``"<stock_ticker> stock"``.

    Returns:
        List of dicts with the keys ``news_text`` and ``predicted_sentiment``,
        one per ``<div class="st">`` element found in the response. Empty
        when no such element is present.

    Raises:
        requests.RequestException: On network failure, timeout, or when the
            server answers with an HTTP error status.
    """
    # Let requests build the query string so tickers containing spaces, '&',
    # '#', ... are encoded instead of corrupting the URL.
    page = requests.get(
        'https://www.google.com/search',
        params={'q': f'{stock_ticker} stock', 'tbm': 'nws'},
        timeout=10,  # never hang the notebook on an unresponsive server
    )
    # Fail loudly on 4xx/5xx instead of parsing an error page into "no news".
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): the 'st' class depends on Google's current markup - confirm it still matches.
    news_texts = [news.get_text() for news in soup.find_all('div', class_='st')]

    # joblib.load unpickles the file: only load model files you produced yourself.
    model = joblib.load(filename)
    if not news_texts:
        # Nothing to classify; some estimators reject an empty batch.
        return []

    # One batched predict call instead of one call per headline.
    predictions = model.predict(news_texts)
    return [
        {'news_text': text, 'predicted_sentiment': sentiment}
        for text, sentiment in zip(news_texts, predictions)
    ]

# Demo: classify the current news snippets for one ticker and print each result.
stock_ticker = "LCID" # You can replace this with any stock ticker of your choice
news_sentiments = get_google_finance_news(stock_ticker)
for scored_headline in news_sentiments:
    print(scored_headline)


Model Accuracy: 0.5914634146341463
              precision    recall  f1-score   support

    NEGATIVE       0.83      0.20      0.32        97
     NEUTRAL       0.57      0.81      0.67       395
    POSITIVE       0.61      0.45      0.52       328

    accuracy                           0.59       820
   macro avg       0.67      0.48      0.50       820
weighted avg       0.62      0.59      0.57       820



In [23]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
import joblib
import requests
from bs4 import BeautifulSoup

# Load the data from CSV
try:
    train_data = pd.read_csv('train_data.csv')
    test_data = pd.read_csv('test_data.csv')
except Exception as e:
    print(f"Error loading data: {e}")
    # Re-raise rather than exit(): inside a notebook exit() shuts the kernel
    # down, whereas re-raising stops this cell and keeps the traceback.
    raise

# Preprocessing: remove rows with missing tweet_text
train_data = train_data.dropna(subset=['tweet_text'])
test_data = test_data.dropna(subset=['tweet_text'])

# Derive features and labels in this cell so that it does not rely on
# variables left behind in the kernel by an earlier cell.
X_train = train_data['tweet_text']
y_train = train_data['sentiment']
X_test = test_data['tweet_text']
y_test = test_data['sentiment']

# Vectorization and Model building with Random Forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
])

# Define the hyperparameters grid (3 * 2 * 3 * 4 * 3 = 216 candidates)
param_grid = {
    'tfidf__max_features': [5000, 10000, None],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Grid Search for hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Use the best estimator
best_pipeline = grid_search.best_estimator_

# Evaluation (best effort: a failure here is reported but does not stop the cell)
try:
    y_pred = best_pipeline.predict(X_test)
    print("Model Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
except Exception as e:
    print(f"Error during evaluation: {e}")

# Save the model to disk
filename = 'finalized_model.sav'
joblib.dump(best_pipeline, filename)

# Load the model once (outside the function)
try:
    model = joblib.load(filename)
except Exception as e:
    print(f"Error loading the model: {e}")
    # Same reasoning as above: report the error and stop the cell without
    # shutting the kernel down.
    raise

# Deployment - Function to get Google finance news and predict sentiment
def get_google_finance_news(stock_ticker, model):
    """Scrape Google News results for a ticker and classify each headline.

    Best effort: any failure while downloading, parsing or predicting is
    printed and an empty list is returned instead of raising.

    Args:
        stock_ticker: Ticker symbol (or any search term). The search query
            sent is ``"<stock_ticker> stock"``.
        model: Fitted estimator whose ``predict`` accepts a list of strings.

    Returns:
        List of dicts with the keys ``news_text`` and ``predicted_sentiment``,
        one per matching element. Empty when nothing matched or an error
        occurred.
    """
    try:
        # Let requests build the query string so tickers containing spaces,
        # '&', '#', ... are encoded instead of corrupting the URL.
        page = requests.get(
            'https://www.google.com/search',
            params={'q': f'{stock_ticker} stock', 'tbm': 'nws'},
            timeout=10,  # never hang the notebook on an unresponsive server
        )
        # Report 4xx/5xx through the except branch instead of silently
        # parsing an error page into "no news".
        page.raise_for_status()
        soup = BeautifulSoup(page.content, 'html.parser')
        # Note that the class for news items might change, so this may not work in the future
        news_items = soup.find_all('div', class_='BNeawe vvjwJb AP7Wnd')
        news_texts = [news.get_text() for news in news_items]
        if not news_texts:
            # Nothing to classify; some estimators reject an empty batch.
            return []

        # One batched predict call instead of one call per headline.
        predictions = model.predict(news_texts)
        return [
            {'news_text': text, 'predicted_sentiment': sentiment}
            for text, sentiment in zip(news_texts, predictions)
        ]
    except Exception as e:
        print(f"Error while scraping and predicting: {e}")
        return []

# Demo: classify the current news headlines for one ticker with the loaded model.
stock_ticker = "LCID" # You can replace this with any stock ticker of your choice
news_sentiments = get_google_finance_news(stock_ticker, model)
for scored_headline in news_sentiments:
    print(scored_headline)


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Model Accuracy: 0.5890243902439024
              precision    recall  f1-score   support

    NEGATIVE       0.72      0.24      0.36        97
     NEUTRAL       0.58      0.80      0.67       395
    POSITIVE       0.58      0.44      0.50       328

    accuracy                           0.59       820
   macro avg       0.63      0.49      0.51       820
weighted avg       0.60      0.59      0.57       820

{'news_text': 'Why Wall Street Is Underestimating Lucid Stock by 70%+', 'predicted_sentiment': 'NEUTRAL'}
{'news_text': 'Why Lucid Group Stock Keeps Going Up', 'predicted_sentiment': 'POSITIVE'}
{'news_text': 'Luxury EV Maker Rallies On Deal With British Icon; Startup Peer Sinks', 'predicted_sentiment': 'NEUTRAL'}
{'news_text': 'Saudi Investment Fund Buys More Lucid Stock. Shares Rise.', 'predicted_sentiment': 'POSITIVE'}
{'news_text': 'Better EV Stock: Lucid Group vs. Canoo', 'predicted_sentiment': 'POSITIVE'}
{'ne

In [21]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Load pre-trained model and tokenizer
# num_labels must match the three sentiment classes (NEGATIVE / NEUTRAL /
# POSITIVE, encoded as 0 / 1 / 2 for training). Without it the classification
# head is built with the library default of two outputs, and label id 2 is
# out of range for the loss.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the dataset
# NOTE: X_train / X_test are not defined in this cell; they must already exist
# in the kernel from a previously executed cell.
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)

# Convert to PyTorch Dataset
class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to an indexable collection of
    per-example values; ``labels`` is an indexable collection with one entry
    per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Build one tensor per encoding field for the requested example,
        # then add the label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Explicit label -> class id mapping. An unexpected label now raises KeyError
# instead of being silently counted as POSITIVE by an else-branch.
LABEL_TO_ID = {"NEGATIVE": 0, "NEUTRAL": 1, "POSITIVE": 2}
train_labels = [LABEL_TO_ID[label] for label in y_train]
test_labels = [LABEL_TO_ID[label] for label in y_test]

train_dataset = TweetDataset(train_encodings, train_labels)
test_dataset = TweetDataset(test_encodings, test_labels)

# Train the model
training_args = TrainingArguments(
    output_dir='./results', # Directory for output files
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Evaluate the model (once; a second identical call only repeats the same
# pass over the evaluation set)
results = trainer.evaluate()


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`