In [1]:
import glob

files = glob.glob("model_data/*_merged_sentiment.csv")
print("Found files:", files)


Found files: ['model_data/AAPL_merged_sentiment.csv', 'model_data/JPM_merged_sentiment.csv', 'model_data/TSLA_merged_sentiment.csv', 'model_data/MSFT_merged_sentiment.csv', 'model_data/NVDA_merged_sentiment.csv']


In [2]:
import pandas as pd

if files:
    df_list = [pd.read_csv(f) for f in files]
    df = pd.concat(df_list, ignore_index=True)
    df["timestamp"] = pd.to_datetime(df["Datetime"])
else:
    print("❌ No merged sentiment CSV files found. Please check your file paths.")


In [3]:
df.head()

Unnamed: 0,Datetime,Close,High,Low,Open,Volume,Ticker,ticker,sentiment_score,timestamp
0,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00
1,2025-06-13 13:35:00+00:00,198.270004,199.240005,197.809998,198.889999,1210436,AAPL,AAPL,0.0,2025-06-13 13:35:00+00:00
2,2025-06-13 13:40:00+00:00,197.5,198.289993,197.270004,198.269897,1258742,AAPL,AAPL,0.0,2025-06-13 13:40:00+00:00
3,2025-06-13 13:45:00+00:00,197.384995,197.600006,196.899994,197.529999,1163972,AAPL,AAPL,0.0,2025-06-13 13:45:00+00:00
4,2025-06-13 13:50:00+00:00,196.940399,197.660004,196.899994,197.380005,862739,AAPL,AAPL,0.0,2025-06-13 13:50:00+00:00


In [4]:
# Sort to align future close prices
df = df.sort_values(["ticker", "timestamp"])

# Target = 1 if price will go up in next interval, else 0
df["target"] = (df.groupby("ticker")["Close"].shift(-1) > df["Close"]).astype(int)

# Drop last row for each ticker where target is NaN
df.dropna(subset=["target"], inplace=True)


In [5]:
df.head()

Unnamed: 0,Datetime,Close,High,Low,Open,Volume,Ticker,ticker,sentiment_score,timestamp,target
0,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0
418,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0
836,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0
1254,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0
1672,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0


In [6]:
df["price_change"] = df["Close"] - df["Open"]
df["volatility"] = df["High"] - df["Low"]
df["hour"] = df["timestamp"].dt.hour
df["minute"] = df["timestamp"].dt.minute

features = ["Open", "High", "Low", "Close", "Volume", "sentiment_score",
            "price_change", "volatility", "hour", "minute"]


In [7]:
df.head()

Unnamed: 0,Datetime,Close,High,Low,Open,Volume,Ticker,ticker,sentiment_score,timestamp,target,price_change,volatility,hour,minute
0,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0,-0.610001,1.724991,13,30
418,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0,-0.610001,1.724991,13,30
836,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0,-0.610001,1.724991,13,30
1254,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0,-0.610001,1.724991,13,30
1672,2025-06-13 13:30:00+00:00,198.899994,200.369995,198.645004,199.509995,4848028,AAPL,AAPL,0.0,2025-06-13 13:30:00+00:00,0,-0.610001,1.724991,13,30


In [8]:

X = df[features]
y = df["target"]


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
#from xgboost import XGBRegressor

In [28]:
from src.Sentiment_Driven_stock_price_movement_predictor.utils import metrics_model,evaluate_models

In [27]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    #"XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
params={
                "Decision Tree": {
                    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'splitter':['best','random'],
                    # 'max_features':['sqrt','log2'],
                },
                "Lasso": {"alpha": [0.1, 1, 10]},
                "Random Forest":{
                    # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                 
                    # 'max_features':['sqrt','log2',None],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Gradient Boosting":{
                    # 'loss':['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate':[.1,.01,.05,.001],
                    'subsample':[0.6,0.7,0.75,0.8,0.85,0.9],
                    # 'criterion':['squared_error', 'friedman_mse'],
                    # 'max_features':['auto','sqrt','log2'],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Linear Regression":{},
                #"XGBRegressor":{
                    #'learning_rate':[.1,.01,.05,.001],
                    #'n_estimators': [8,16,32,64,128,256]
                #},
                "CatBoosting Regressor":{
                    'depth': [6,8,10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor":{
                    'learning_rate':[.1,.01,0.5,.001],
                    # 'loss':['linear','square','exponential'],
                    'n_estimators': [8,16,32,64,128,256]
                },
                "Ridge": {"alpha": [0.1, 1, 10]},
                "K-Neighbors Regressor": {"n_neighbors": [3, 5, 7]},
                "Random Forest Regressor": {"n_estimators": [50, 100]},
                
            }
model_list = []
r2_list =[]
model_report:dict=evaluate_models(X_train,y_train,X_test,y_test,models,params)

            ## To get best model score from dict
best_model_score = max(sorted(model_report.values()))

             ## To get best model name from dict

best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
best_model = models[best_model_name]

print("This is the best model:")
print(best_model_name)

model_names = list(params.keys())

actual_model=""

for model in model_names:
    if best_model_name == model:
        actual_model = actual_model + model

best_params = params[actual_model]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = metrics_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = metrics_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T
  return linalg.solve(A, Xy, assume_a="pos", overwrite_a=True).T


This is the best model:
AdaBoost Regressor
Linear Regression
Model performance for Training set
- Root Mean Squared Error: 0.2897
- Mean Absolute Error: 0.1678
- R2 Score: 0.0023
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2817
- Mean Absolute Error: 0.1443
- R2 Score: -0.0080


Lasso
Model performance for Training set
- Root Mean Squared Error: 0.2900
- Mean Absolute Error: 0.1682
- R2 Score: 0.0003
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2814
- Mean Absolute Error: 0.1649
- R2 Score: -0.0060


Ridge
Model performance for Training set
- Root Mean Squared Error: 0.2897
- Mean Absolute Error: 0.1678
- R2 Score: 0.0023
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.2815
- Mean Absolute Error: 0.1444
- R2 Score: -0.0070


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 0.2788
- Mean Absolute Error: 0.1

Dumping the Data to pickle file to use it in other files

In [29]:
import pandas as pd

# Save to CSV or pickle
X.to_csv("model_data/X.csv", index=False)
y.to_csv("model_data/y.csv", index=False)  # if y is a Series

# Or use pickle for faster I/O and type safety
import joblib
joblib.dump(X, "model_data/X.pkl")
joblib.dump(y, "model_data/y.pkl")


['model_data/y.pkl']