In [68]:
#import libraries

import os
import numpy as np
import csv
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.dates import DateFormatter
import torch
import math
import time
import re
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm
from transformers import pipeline
from tqdm import tqdm


In [None]:
# Device index: 0 when CUDA is available, otherwise -1.
# NOTE(review): presumably follows the transformers `pipeline(device=...)`
# convention (-1 = CPU); `device` is not used anywhere in this cell.
device = 0 if torch.cuda.is_available() else -1

# Load the per-tweet sentiment results and reduce the timestamp to a calendar date
# so that rows can later be grouped per day.
df = pd.read_csv('sentiment_analysis_results.csv')
df['Date'] = pd.to_datetime(df['Date']).dt.date

# Split the single score column into a positive and a negative column: each row
# keeps its score in the column matching its 'Sentiment' label and gets 0 in the other.
# Series.where is the vectorised equivalent of the former row-wise df.apply(lambda ...).
df['positive_score'] = df['Sentiment Score'].where(df['Sentiment'] == 'POSITIVE', 0)
df['negative_score'] = df['Sentiment Score'].where(df['Sentiment'] == 'NEGATIVE', 0)

# Topic-score columns that are converted to top-3 indicator flags further down.
label_columns = ['automotive', 'nonsense', 'energy', 'lifestyle', 'environment', 'travel', 'finance', 'science', 'politics']


def set_top3_labels(row):
    scores = row[label_columns].astype(float)  # Ensure all scores are float
    if scores.isnull().all():
        return row
    top3_indices = scores.nlargest(3).index
    for col in label_columns:
        row[col] = 1 if col in top3_indices else 0
    return row

# Convert every row's topic scores into top-3 indicator flags.
df = df.apply(set_top3_labels, axis=1)

# Daily aggregation: mean of the two sentiment score columns and of each topic
# flag (for a flag, the mean is the share of that day's rows carrying it).
daily_mean_spec = {name: 'mean' for name in ['positive_score', 'negative_score', *label_columns]}
grouped_df = df.groupby('Date').agg(daily_mean_spec).reset_index()

# Give the two sentiment columns their report names; topic columns keep theirs.
grouped_df = grouped_df.rename(columns={
    'positive_score': 'Average Positive Score',
    'negative_score': 'Average Negative Score',
})

# Show the daily table, then persist it for the next stage.
print(grouped_df)
grouped_df.to_csv('data_prep1.csv', index=False)

In [None]:
# Ticker whose price history is joined to the daily tweet aggregates.
stock_name = 'TSLA'

# Reload the daily aggregates; the CSV stores dates as text, so convert back to date objects.
grouped_df = pd.read_csv('data_prep1.csv')
grouped_df['Date'] = pd.to_datetime(grouped_df['Date']).dt.date

# Build the path with os.path.join: the former 'data\stock_yfinance_data.csv' literal
# contained the invalid escape '\s' and only resolved on Windows.
df_stock_price = pd.read_csv(os.path.join('data', 'stock_yfinance_data.csv'))

# .copy() makes the filtered rows an independent frame, so the column assignments
# below (and in get_tech_ind) do not write into a slice of the full price table.
df_stock_price = df_stock_price[df_stock_price['Stock Name'] == stock_name].copy()
df_stock_price['Date'] = pd.to_datetime(df_stock_price['Date']).dt.date
def get_tech_ind(data):
    """Add technical-indicator columns derived from the 'Close' column.

    The columns are written onto ``data`` itself (the frame is modified in
    place) and the same object is returned.

    Args:
        data: DataFrame with a numeric 'Close' column, rows in chronological order.

    Returns:
        ``data`` with these columns added:
        MA7 / MA20   - 7- and 20-row rolling means of Close
        MACD         - 12-span EMA of Close minus 26-span EMA of Close
        20SD         - 20-row rolling standard deviation of Close
        upper_band / lower_band - MA20 plus / minus two 20SD
        EMA          - exponentially weighted mean of Close with com=0.5
        logmomentum  - log of Close divided by the previous row's Close
    """
    close = data['Close']
    data['MA7'] = close.rolling(window=7).mean()
    data['MA20'] = close.rolling(window=20).mean()
    # MACD is the fast (12) EMA minus the slow (26) EMA of the same price series.
    # The previous version subtracted an EMA of 'Open' from the slow EMA of 'Close'
    # and mixed adjust=True with adjust=False.
    fast_ema = close.ewm(span=12, adjust=False).mean()
    slow_ema = close.ewm(span=26, adjust=False).mean()
    data['MACD'] = fast_ema - slow_ema
    data['20SD'] = close.rolling(window=20).std()
    data['upper_band'] = data['MA20'] + (data['20SD'] * 2)
    data['lower_band'] = data['MA20'] - (data['20SD'] * 2)
    data['EMA'] = close.ewm(com=0.5).mean()
    data['logmomentum'] = np.log(close / close.shift(1))
    return data
# Price history of the selected ticker, enriched with technical indicators.
label_df = get_tech_ind(df_stock_price)

# All trading dates in ascending order, used to look up the matching trading day.
nearest_trading_days = label_df['Date'].sort_values().reset_index(drop=True)


def _last_trading_day_on_or_before(day):
    """Return the latest trading date that is not after `day` (a missing value if there is none)."""
    eligible_days = nearest_trading_days[nearest_trading_days <= day]
    return eligible_days.max()


grouped_df['Nearest Trading Day'] = grouped_df['Date'].map(_last_trading_day_on_or_before)

# Left-join the price rows onto the tweet rows via the matched trading day, so every
# tweet date is kept; overlapping 'Date' columns become Date_x (tweets) / Date_y (prices).
final_df = grouped_df.merge(label_df, left_on='Nearest Trading Day', right_on='Date', how='left')
print(final_df.columns)

In [None]:



# Feature columns to lag: everything except the ticker label and the three date keys.
final_df = final_df.drop(columns="Stock Name")
price_columns = list(final_df.columns)
for date_key in ('Date_x', 'Date_y', 'Nearest Trading Day'):
    price_columns.remove(date_key)
print(price_columns)

# For every feature column add a 'Prev_<name>' column holding the value of the row above.
# NOTE(review): rows for non-trading dates inherit the previous trading day's prices
# through the merge, so on such rows Prev_Close can equal Close - confirm this is intended.
final_df = final_df.assign(
    **{f'Prev_{col}': final_df[col].shift(1) for col in price_columns}
)

# The first row has no predecessor, so all of its Prev_* values are NaN; discard it.
final_df = final_df.iloc[1:]

# Keep only Date_y, the Close target and the lagged features: drop every
# same-row feature except 'Close', plus the two remaining helper date columns.
price_columns.extend(['Date_x', 'Nearest Trading Day'])
price_columns.remove('Close')
final_df = final_df.drop(columns=price_columns)

# Show the modelling table, then persist it for the next stage.
print(final_df)
final_df.to_csv('data_prep2.csv', index=False)

In [None]:
# Reload the modelling table. Date_y is parsed as a real date so that ordering
# (and the later plot axis) is chronological rather than based on text.
final_df = pd.read_csv('data_prep2.csv', parse_dates=['Date_y'])

# Tweet dates with no trading day on or before them come out of the left merge
# without Date_y / Close. Such rows would sort last (into the test set) and put
# NaN into the target, so remove them before splitting.
final_df = final_df.dropna(subset=['Date_y', 'Close'])

# Stable sort: rows sharing the same Date_y keep their original relative order.
final_df = final_df.sort_values('Date_y', kind='mergesort')

# Ablation (model without tweet features): drop these columns before splitting.
# tweet_feature_columns = ['Prev_Average Positive Score', 'Prev_Average Negative Score',
#                          'Prev_automotive', 'Prev_nonsense', 'Prev_energy', 'Prev_lifestyle',
#                          'Prev_environment', 'Prev_travel', 'Prev_finance', 'Prev_science',
#                          'Prev_politics']
# final_df = final_df.drop(columns=tweet_feature_columns)

# Chronological split: the first 80% of rows train the model, the rest evaluate it.
split_ratio = 0.8
split_point = int(len(final_df) * split_ratio)

train = final_df.iloc[:split_point]
test = final_df.iloc[split_point:]

# Features are every column except the date key and the target.
X_train = train.drop(['Date_y', 'Close'], axis=1)
y_train = train['Close']
X_test = test.drop(['Date_y', 'Close'], axis=1)
y_test = test['Close']


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
# Imported here, next to its only use, alongside the other sklearn imports of this cell.
from sklearn.model_selection import TimeSeriesSplit

# Hyper-parameter grid: 4 values for each of 5 parameters = 1024 combinations.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300, 400],
    'subsample': [0.6, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 0.9, 1.0]
}

xgb_model = xgb.XGBRegressor(objective='reg:squarederror')

# The training rows are in chronological order. An integer cv (plain, unshuffled
# K-fold) would validate on rows that precede part of the training data, letting
# the model see the future. TimeSeriesSplit always validates on rows that come
# after the rows it trains on.
time_series_cv = TimeSeriesSplit(n_splits=3)

# Exhaustive search scored by negative MSE (higher is better).
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid,
                           scoring='neg_mean_squared_error', cv=time_series_cv,
                           verbose=1, n_jobs=-1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# best_score_ is a negative MSE; negate and take the square root to report it as RMSE.
print("Best parameters found: ", grid_search.best_params_)
print("Lowest RMSE found: ", (-grid_search.best_score_) ** 0.5)

# GridSearchCV has already refitted the best configuration on the whole training
# set (refit defaults to True), so best_estimator_ is ready for prediction.
best_model = grid_search.best_estimator_

# Predict the held-out period.
predictions = best_model.predict(X_test)

# Evaluate the model on the held-out period.
mse = mean_squared_error(y_test, predictions)
print(f'Mean Squared Error: {mse}')

In [None]:
# Actual vs. predicted close price over the held-out period, on one set of axes.
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(test['Date_y'], y_test, label='Actual', color='blue')
ax.plot(test['Date_y'], predictions, label='Predicted', color='red')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price')
ax.set_title('Actual vs Predicted Close Prices')
ax.legend()
plt.show()