In [3]:
import pandas as pd
from transformers import pipeline

# Load dataset
df = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA(train).csv')

# Combine the top 25 news articles for each date into a single string
df['combined_news'] = df.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

# Load pre-trained model for sentiment analysis with continuous scores
sentiment_model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", return_all_scores=True,framework="pt")

# Function to get continuous sentiment score (probability of positive sentiment) with truncation
def get_continuous_sentiment_score(news):
    # Truncate text to 512 tokens if it's too long
    truncated_news = news[:512]
    # Get sentiment scores for both positive and negative classes
    sentiment_scores = sentiment_model(truncated_news)[0]
    # Extract the positive sentiment score
    positive_score = sentiment_scores[1]['score']  # 1 corresponds to the positive sentiment
    return positive_score

# Apply sentiment analysis to get the continuous sentiment score for each row
df['sentiment_score'] = df['combined_news'].apply(get_continuous_sentiment_score)

# Save the updated dataset with sentiment scores
df.to_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA_Sentiment_Continuous.csv', index=False)

print("Sentiment scores have been successfully calculated and saved.")


Sentiment scores have been successfully calculated and saved.


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load stock price data
stock_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/DJIA_table(train).csv')

# Load sentiment score data
sentiment_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA_Sentiment_Continuous.csv')

# Ensure 'Date' columns are in datetime format for both datasets
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'], errors='coerce')

# Merge both datasets on the 'Date' column
merged_data = pd.merge(stock_data, sentiment_data[['Date', 'sentiment_score']], on='Date', how='inner')

# Display the merged data to ensure everything is correct
print(merged_data.head())

# Feature selection: Use open, high, low, volume, and sentiment_score as features
X = merged_data[['Open', 'High', 'Low', 'Volume', 'sentiment_score']]

# Target variable: Close price
y = merged_data['Close']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features for better performance in linear regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a simple Linear Regression model
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model using Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Root Mean Squared Error: {rmse}')

# Save the model for future use (optional)
import joblib
joblib.dump(model, 'stock_price_prediction_model_baseline.pkl')


        Date         Open         High          Low        Close    Volume  \
0 2015-12-31  17590.66016  17590.66016  17421.16016  17425.02930  93690000   
1 2015-12-30  17711.93945  17714.13086  17588.86914  17603.86914  59760000   
2 2015-12-29  17547.36914  17750.01953  17547.36914  17720.98047  69860000   
3 2015-12-28  17535.66016  17536.90039  17437.33984  17528.26953  59770000   
4 2015-12-24  17593.25977  17606.33984  17543.94922  17552.16992  40350000   

     Adj Close  sentiment_score  
0  17425.02930         0.005254  
1  17603.86914         0.005044  
2  17720.98047         0.002138  
3  17528.26953         0.002888  
4  17552.16992         0.998259  
Root Mean Squared Error: 50.50451829186233


['stock_price_prediction_model_baseline.pkl']

In [15]:
#Random forest Approach

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load stock price data
stock_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/DJIA_table(train).csv')

# Load sentiment score data
sentiment_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA_Sentiment_Continuous.csv')

# Ensure 'Date' columns are in datetime format for both datasets
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'], errors='coerce')

# Merge both datasets on the 'Date' column
merged_data = pd.merge(stock_data, sentiment_data[['Date', 'sentiment_score']], on='Date', how='inner')

# Check if there are any missing dates or empty rows after the merge
print(f"Number of rows in merged data: {merged_data.shape[0]}")
print(merged_data.head())

# Feature selection: Use Open, High, Low, Volume, and sentiment_score as features
X = merged_data[['Open', 'High', 'Low', 'Volume', 'sentiment_score']]

# Target variable: Close price
y = merged_data['Close']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features (optional for Random Forest, but it can help)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_model.predict(X_test_scaled)

# Evaluate the model using Root Mean Squared Error (RMSE)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Random Forest RMSE: {rmse}')

# Save the model for future use (optional)
import joblib
joblib.dump(rf_model, 'stock_price_prediction_rf_model.pkl')



Number of rows in merged data: 1639
        Date         Open         High          Low        Close    Volume  \
0 2015-12-31  17590.66016  17590.66016  17421.16016  17425.02930  93690000   
1 2015-12-30  17711.93945  17714.13086  17588.86914  17603.86914  59760000   
2 2015-12-29  17547.36914  17750.01953  17547.36914  17720.98047  69860000   
3 2015-12-28  17535.66016  17536.90039  17437.33984  17528.26953  59770000   
4 2015-12-24  17593.25977  17606.33984  17543.94922  17552.16992  40350000   

     Adj Close  sentiment_score  
0  17425.02930         0.005254  
1  17603.86914         0.005044  
2  17720.98047         0.002138  
3  17528.26953         0.002888  
4  17552.16992         0.998259  
Random Forest RMSE: 87.56834300513538


['stock_price_prediction_rf_model.pkl']

In [16]:
#Hyperparamter Tuning of Random Forest
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load stock price data
stock_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/DJIA_table(train).csv')

# Load sentiment score data
sentiment_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA_Sentiment_Continuous.csv')

# Ensure 'Date' columns are in datetime format for both datasets
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'], errors='coerce')

# Merge both datasets on the 'Date' column
merged_data = pd.merge(stock_data, sentiment_data[['Date', 'sentiment_score']], on='Date', how='inner')

# Check if there are any missing dates or empty rows after the merge
print(f"Number of rows in merged data: {merged_data.shape[0]}")
print(merged_data.head())

# Feature selection: Use Open, High, Low, Volume, and sentiment_score as features
X = merged_data[['Open', 'High', 'Low', 'Volume', 'sentiment_score']]

# Target variable: Close price
y = merged_data['Close']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Set up the hyperparameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [3, 5, 10, None],   # Maximum depth of the trees
    'min_samples_split': [2, 5, 10], # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]    # Minimum number of samples required at each leaf node
}

# Initialize the RandomForestRegressor
rf = RandomForestRegressor(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the grid search to the data
grid_search.fit(X_train_scaled, y_train)

# Get the best hyperparameters
best_params = grid_search.best_params_
print(f"Best Hyperparameters: {best_params}")

# Train the model with the best hyperparameters
best_rf = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_rf.predict(X_test_scaled)

# Evaluate the model using RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
print(f'Tuned Random Forest RMSE: {rmse}')

# Save the tuned model for future use (optional)
import joblib
joblib.dump(best_rf, 'stock_price_prediction_rf_tuned_model.pkl')


Number of rows in merged data: 1639
        Date         Open         High          Low        Close    Volume  \
0 2015-12-31  17590.66016  17590.66016  17421.16016  17425.02930  93690000   
1 2015-12-30  17711.93945  17714.13086  17588.86914  17603.86914  59760000   
2 2015-12-29  17547.36914  17750.01953  17547.36914  17720.98047  69860000   
3 2015-12-28  17535.66016  17536.90039  17437.33984  17528.26953  59770000   
4 2015-12-24  17593.25977  17606.33984  17543.94922  17552.16992  40350000   

     Adj Close  sentiment_score  
0  17425.02930         0.005254  
1  17603.86914         0.005044  
2  17720.98047         0.002138  
3  17528.26953         0.002888  
4  17552.16992         0.998259  
Fitting 3 folds for each of 108 candidates, totalling 324 fits


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=   0.1s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=5, n_estimators=50; total time=   0.1s
[CV] EN

['stock_price_prediction_rf_tuned_model.pkl']

In [17]:
#Ensemble Methods

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load stock price data
stock_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/DJIA_table(train).csv')

# Load sentiment score data
sentiment_data = pd.read_csv('/home/ai23mtech14008/Stock Price Prediction new/Dataset/Combined_News_DJIA_Sentiment_Continuous.csv')

# Ensure 'Date' columns are in datetime format for both datasets
stock_data['Date'] = pd.to_datetime(stock_data['Date'], errors='coerce')
sentiment_data['Date'] = pd.to_datetime(sentiment_data['Date'], errors='coerce')

# Merge both datasets on the 'Date' column
merged_data = pd.merge(stock_data, sentiment_data[['Date', 'sentiment_score']], on='Date', how='inner')

# Feature selection: Use Open, High, Low, Volume, and sentiment_score as features
X = merged_data[['Open', 'High', 'Low', 'Volume', 'sentiment_score']]

# Target variable: Close price
y = merged_data['Close']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features for better performance
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize individual models
rf = RandomForestRegressor(random_state=42, n_estimators=50)
gbr = GradientBoostingRegressor(random_state=42, n_estimators=50)
etr = ExtraTreesRegressor(random_state=42, n_estimators=50)

# 1. Voting Regressor (Averaging the predictions from all models)
voting_regressor = VotingRegressor([('rf', rf), ('gbr', gbr), ('etr', etr)])
voting_regressor.fit(X_train_scaled, y_train)

# Predict with Voting Regressor
y_pred_voting = voting_regressor.predict(X_test_scaled)
rmse_voting = mean_squared_error(y_test, y_pred_voting, squared=False)
print(f'Voting Regressor RMSE: {rmse_voting}')

# 2. Stacking Regressor (Combining models with a meta-model)
stacking_regressor = StackingRegressor(
    estimators=[('rf', rf), ('gbr', gbr), ('etr', etr)],
    final_estimator=LinearRegression(),  # Using Linear Regression as the meta-model
    cv=3
)
stacking_regressor.fit(X_train_scaled, y_train)

# Predict with Stacking Regressor
y_pred_stacking = stacking_regressor.predict(X_test_scaled)
rmse_stacking = mean_squared_error(y_test, y_pred_stacking, squared=False)
print(f'Stacking Regressor RMSE: {rmse_stacking}')


Voting Regressor RMSE: 84.2521467259597
Stacking Regressor RMSE: 81.20682286883941


In [18]:
#Feature Engineering

import pandas as pd
import numpy as np

# Load the merged data again
merged_data = pd.merge(stock_data, sentiment_data[['Date', 'sentiment_score']], on='Date', how='inner')

# Create new features

# 1. Moving Averages
merged_data['MA_5'] = merged_data['Close'].rolling(window=5).mean()  # 5-day moving average
merged_data['MA_10'] = merged_data['Close'].rolling(window=10).mean()  # 10-day moving average

# 2. Exponential Moving Averages (EMA)
merged_data['EMA_5'] = merged_data['Close'].ewm(span=5, adjust=False).mean()
merged_data['EMA_10'] = merged_data['Close'].ewm(span=10, adjust=False).mean()

# 3. Bollinger Bands
merged_data['stddev_20'] = merged_data['Close'].rolling(window=20).std()
merged_data['upper_band'] = merged_data['MA_10'] + (merged_data['stddev_20'] * 2)
merged_data['lower_band'] = merged_data['MA_10'] - (merged_data['stddev_20'] * 2)

# 4. Rolling Sentiment Scores (Sentiment-driven Features)
merged_data['Sentiment_MA_5'] = merged_data['sentiment_score'].rolling(window=5).mean()
merged_data['Sentiment_MA_10'] = merged_data['sentiment_score'].rolling(window=10).mean()

# 5. Lag Features (Previous day’s values)
merged_data['Prev_Close'] = merged_data['Close'].shift(1)
merged_data['Prev_Volume'] = merged_data['Volume'].shift(1)

# Drop rows with NaN values resulting from rolling calculations
merged_data.dropna(inplace=True)

# Features for the model
X = merged_data[['Open', 'High', 'Low', 'Volume', 'sentiment_score', 'MA_5', 'MA_10', 'EMA_5', 'EMA_10', 
                 'upper_band', 'lower_band', 'Sentiment_MA_5', 'Sentiment_MA_10', 'Prev_Close', 'Prev_Volume']]

# Target variable
y = merged_data['Close']

# Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Stacking Regressor with additional features
stacking_regressor.fit(X_train_scaled, y_train)

# Predict with Stacking Regressor
y_pred_stacking = stacking_regressor.predict(X_test_scaled)
rmse_stacking = mean_squared_error(y_test, y_pred_stacking, squared=False)
print(f'Stacking Regressor RMSE after feature engineering: {rmse_stacking}')


Stacking Regressor RMSE after feature engineering: 77.17983824534542


In [20]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

# Fit a Random Forest to get feature importance
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train)

# Get feature importance
importances = rf.feature_importances_

# Create a threshold based on feature importance (optional: you can manually select a threshold)
threshold = np.mean(importances)

# Select features above the threshold
selector = SelectFromModel(rf, threshold=threshold, prefit=True)
X_train_selected = selector.transform(X_train_scaled)
X_test_selected = selector.transform(X_test_scaled)

# Refit the Stacking Regressor with selected features
stacking_regressor.fit(X_train_selected, y_train)
y_pred_selected = stacking_regressor.predict(X_test_selected)
rmse_selected = mean_squared_error(y_test, y_pred_selected, squared=False)
print(f'Stacking Regressor RMSE after feature selection: {rmse_selected}')


Stacking Regressor RMSE after feature selection: 75.05597093403256
