In [205]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [206]:
# Load dataset
df = pd.read_csv("stock_data.csv") 
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,neutral-count-finbert,positive-count-finbert,negative-count-finbert,average-confidence-finbert,average-neutral-score-gemini,average-positive-score-gemini,average-negative-score-gemini,prediction-label,ticker,number-employees,date_object,sector
0,2024-11-08,299.160004,328.700012,297.660004,321.220001,321.220001,201778989,3,1,4,0.995374,6.875,4.375,3.500,1.0,TSLA,140473.0,,Automotive
1,2024-11-08,227.139999,228.660004,226.408401,226.960007,226.960007,37608324,2,2,4,0.893756,6.875,5.375,3.500,0.0,AAPL,164000.0,,Technology
2,2024-11-08,2.320000,2.360000,2.100000,2.210000,2.210000,102274174,5,1,2,0.912988,7.125,5.250,2.375,0.0,LCID,6500.0,,Automotive
3,2024-11-08,27.110001,27.150000,26.718100,26.719999,26.719999,53549746,3,3,2,0.926920,6.500,5.750,2.500,0.0,PFE,88000.0,,Healthcare
4,2024-11-08,40.740002,40.849998,40.419998,40.480000,40.480000,21278511,1,1,6,0.982125,8.250,4.125,1.750,0.0,VZ,101200.0,,Telecommunications
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,2024-12-02,175.779999,176.899994,173.770004,176.809998,176.809998,1868440,3,0,5,0.987520,5.000,6.750,1.750,1.0,ZTS,14100.0,,Healthcare
700,2024-12-02,133.529999,133.770004,132.380005,132.889999,132.889999,2372502,3,0,5,0.998836,5.750,6.375,1.250,0.0,MMM,85000.0,,Industrials
701,2024-12-02,39.480000,39.525002,38.294998,38.369999,38.369999,5134739,3,1,4,0.943777,6.250,4.500,3.750,0.0,EXC,19962.0,,Energy
702,2024-12-02,85.519997,85.635002,84.120003,85.129997,85.129997,2536014,5,1,2,0.994591,3.000,6.875,1.500,0.0,FIS,60000.0,,Finance


In [207]:
# Ensure the data is sorted by date
df['date_object'] = pd.to_datetime(df['date_object'])
df = df.sort_values('date_object')

# Debugging: Check the dataset size before and after creating lag features
print("Initial dataset size:", df.shape)

Initial dataset size: (704, 19)


In [208]:
# Drop specific columns
df = df.drop(columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'date_object'])
# df = df[df['ticker'] == 'TSLA']

In [209]:
# Create lag features (autoregressive)
df['prediction-1-day-ago'] = df['prediction-label'].shift(1)
df['prediction-2-days-ago'] = df['prediction-label'].shift(2)

# Fill NaN values in the lagged columns with a constant (e.g., 0)
df['prediction-1-day-ago'] = df['prediction-1-day-ago'].fillna(0)
df['prediction-2-days-ago'] = df['prediction-2-days-ago'].fillna(0)

# Debugging: Check the dataset size after filling NaN values
print("Dataset size after filling NaN values:", df.shape)
df

Dataset size after filling NaN values: (704, 13)


Unnamed: 0,neutral-count-finbert,positive-count-finbert,negative-count-finbert,average-confidence-finbert,average-neutral-score-gemini,average-positive-score-gemini,average-negative-score-gemini,prediction-label,ticker,number-employees,sector,prediction-1-day-ago,prediction-2-days-ago
0,3,1,4,0.995374,6.875,4.375,3.500,1.0,TSLA,140473.0,Automotive,0.0,0.0
1,2,2,4,0.893756,6.875,5.375,3.500,0.0,AAPL,164000.0,Technology,1.0,0.0
2,5,1,2,0.912988,7.125,5.250,2.375,0.0,LCID,6500.0,Automotive,0.0,1.0
3,3,3,2,0.926920,6.500,5.750,2.500,0.0,PFE,88000.0,Healthcare,0.0,0.0
4,1,1,6,0.982125,8.250,4.125,1.750,0.0,VZ,101200.0,Telecommunications,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
699,3,0,5,0.987520,5.000,6.750,1.750,1.0,ZTS,14100.0,Healthcare,0.0,1.0
700,3,0,5,0.998836,5.750,6.375,1.250,0.0,MMM,85000.0,Industrials,1.0,0.0
701,3,1,4,0.943777,6.250,4.500,3.750,0.0,EXC,19962.0,Energy,0.0,1.0
702,5,1,2,0.994591,3.000,6.875,1.500,0.0,FIS,60000.0,Finance,0.0,0.0


In [210]:
# Feature set and target
features = ['prediction-1-day-ago', 'prediction-2-days-ago', 'neutral-count-finbert', 
            'positive-count-finbert', 'negative-count-finbert', 
            'average-confidence-finbert']
X = df[features]
y = df['prediction-label'] 

# Debugging: Ensure X and y are not empty
print("Feature set size:", X.shape)
print("Target size:", y.shape)

Feature set size: (704, 6)
Target size: (704,)


In [211]:
# Split into training and testing sets
if len(X) < 2:
    raise ValueError("Not enough samples to split. Increase dataset size or adjust feature selection.")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [212]:
# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [213]:
# Define and train a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

In [214]:
# Predictions
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.67      0.46      0.54        74
         1.0       0.56      0.75      0.64        67

    accuracy                           0.60       141
   macro avg       0.61      0.60      0.59       141
weighted avg       0.61      0.60      0.59       141

