# IMPORT LIBRARIES

In [None]:
import yfinance as yf
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tabulate import tabulate
import nltk
from math import sqrt

# Downloading, merging and preprocessing the non-financial dataset.

In [None]:
def download_data(ticker, start_date, end_date):
    """Download the closing prices of one ticker from Yahoo Finance.

    Args:
        ticker: Yahoo Finance symbol, e.g. ``'RELIANCE.NS'``.
        start_date: Passed to ``yf.download`` as ``start``.
        end_date: Passed to ``yf.download`` as ``end``.

    Returns:
        A ``pd.Series`` of closing prices named ``ticker``. If anything in
        the download or the column lookup raises, the error is printed and
        an empty float Series with the same name is returned so that the
        caller can still concatenate the results.
    """
    try:
        df = yf.download(ticker, start=start_date, end=end_date)
        close = df['Close']
        # With MultiIndex columns (the layout of newer yfinance releases)
        # ``df['Close']`` is a one-column DataFrame, and DataFrame.rename
        # does not accept a plain string; reduce it to a Series first.
        if isinstance(close, pd.DataFrame):
            close = close.iloc[:, 0]
        return close.rename(ticker)
    except Exception as e:
        print(f"Error downloading data for {ticker}: {e}")
        # Explicit dtype: an empty Series without one triggers a pandas
        # warning on older versions and defaults to object dtype on newer.
        return pd.Series(name=ticker, dtype='float64')

# Download window: today and 20 * 365 days before today, as YYYY-MM-DD strings.
end_date = datetime.datetime.now().strftime('%Y-%m-%d')
start_date = (
    datetime.datetime.now() - datetime.timedelta(days=20 * 365)
).strftime('%Y-%m-%d')

# Yahoo Finance symbols, in the same order as the column names assigned below.
tickers = ['RELIANCE.NS', 'GOLDBEES.NS', 'IOC.NS', 'INR=X']

# One closing-price Series per ticker.
dataframes = []
for ticker in tickers:
    dataframes.append(download_data(ticker, start_date, end_date))

# Align the series side by side and keep only rows where all four have a value.
merged_data = pd.concat(dataframes, axis=1)
merged_data = merged_data.dropna()
merged_data.columns = ['RELIANCE', 'GOLD', 'PETROL', 'CURRENCY']

merged_data.to_csv('merged_data.csv')

# Report whether any missing value is left in the merged frame.
print(
    "Warning: NaN values present in the merged data."
    if merged_data.isna().any().any()
    else "Data downloaded and merged successfully."
)

In [None]:
# Reload the merged prices from the CSV written above; `df` is a second name
# bound to the same DataFrame object (not a copy).
data = pd.read_csv('merged_data.csv')
df = data

In [None]:
# Display the loaded DataFrame as the cell output.
data

In [None]:
# Display the dtype of every column.
data.dtypes

In [None]:
def str_to_datetime(s):
    """Convert a ``'YYYY-MM-DD'`` string into a ``datetime.datetime``.

    The string is split on ``'-'`` and the first three pieces are read as
    year, month and day; the time part of the result is midnight.
    """
    fields = s.split('-')
    return datetime.datetime(
        year=int(fields[0]),
        month=int(fields[1]),
        day=int(fields[2]),
    )


# Quick check of the parser on a known date (shown as the cell output).
datetime_object = str_to_datetime('2020-01-01')
datetime_object

In [None]:
# Parse every 'Date' string with str_to_datetime, then display the converted column.
data['Date'] = data['Date'].apply(str_to_datetime)
data['Date']

In [None]:
# Remove 'Date' from the columns and use it as the index, then display the frame.
data.index = data.pop('Date')
data

In [None]:
# (rows, columns) before the missing-value cleanup below.
data.shape

In [None]:
# Summary statistics of the columns.
data.describe()

In [None]:
# Number of missing values per column.
print(data.isnull().sum())

In [None]:
# Drop every row that contains a missing value, modifying `data` in place.
data.dropna(inplace=True)

In [None]:
# Re-check the per-column missing-value counts after the dropna above.
print(data.isnull().sum())

In [None]:
# (rows, columns) after the missing-value cleanup.
data.shape

In [None]:
# Histogram of each column.
data.hist()
plt.show()

In [None]:
# Pairwise correlation matrix of the columns.
print(data.corr())

In [None]:
# One scatter panel per explanatory series, each plotted against RELIANCE.
fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for col, panel in zip(['GOLD', 'PETROL', 'CURRENCY'], axs):
    panel.scatter(data['RELIANCE'], data[col])
    panel.set_xlabel('RELIANCE')
    panel.set_ylabel(col)

plt.tight_layout()
plt.show()


# Linear Regression

In [None]:
# Multiple linear regression of RELIANCE on the three explanatory series,
# evaluated on a random 20% hold-out of `data`.
feature_columns = ['GOLD', 'PETROL', 'CURRENCY']
X = data[feature_columns]
y = data['RELIANCE']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
# RMSE is taken as the square root of the MSE: the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

# Display actual and predicted values in a table
results_table = pd.DataFrame({
    'Actual RELIANCE': y_test,
    'Predicted RELIANCE': y_pred
})

print("Actual vs. Predicted Values:")
print(results_table)

# Display evaluation metrics
print(f'\nMean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')

# One panel per feature: actual (blue) and predicted (red) RELIANCE on the
# test rows, plotted against that feature.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 5))

for ax, feature in zip(axes, feature_columns):
    ax.scatter(X_test[feature], y_test, color='blue', label='Actual RELIANCE')
    ax.scatter(X_test[feature], y_pred, color='red', label='Predicted RELIANCE')
    ax.set_xlabel(feature)
    ax.set_ylabel('RELIANCE')
    ax.set_title(f'{feature} vs. RELIANCE')
    ax.legend()

plt.tight_layout()
plt.show()


In [None]:
def calculate_metrics(y_test, y_pred):
    """Return ``(mse, rmse, r2)`` for predictions ``y_pred`` against ``y_test``.

    RMSE is the square root of the MSE. It is not requested through
    ``mean_squared_error(..., squared=False)`` because that keyword is
    deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Single-feature linear regressions: RELIANCE is fitted on each explanatory
# series alone and MSE / RMSE / R-squared are stored per feature.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

metrics_dict = {name: {} for name in ('GOLD', 'PETROL', 'CURRENCY')}

for feature in X.columns:
    current_feature = X[[feature]]

    # Fixed random_state: every feature is split with the same settings.
    X_train, X_test, y_train, y_test = train_test_split(
        current_feature, y, test_size=0.2, random_state=42
    )

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse, rmse, r2 = calculate_metrics(y_test, y_pred)
    metrics_dict[feature].update({'MSE': mse, 'RMSE': rmse, 'R-squared': r2})

# Plain-text report, one stanza per feature.
for feature, metrics in metrics_dict.items():
    print(
        f'Feature: {feature}',
        f'Mean Squared Error: {metrics["MSE"]}',
        f'Root Mean Squared Error: {metrics["RMSE"]}',
        f'R-squared: {metrics["R-squared"]}',
        '-' * 30,
        sep='\n',
    )


In [None]:
def calculate_metrics(y_test, y_pred):
    """Compute MSE, RMSE and R-squared of ``y_pred`` against ``y_test``.

    Returns:
        The tuple ``(mse, rmse, r2)``. RMSE is derived as ``sqrt(mse)``; the
        ``squared=False`` keyword of ``mean_squared_error`` is avoided
        because it is deprecated in scikit-learn 1.4 and removed in 1.6.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Single-feature linear regressions again, this time collected column-wise so
# the scores can be printed as one DataFrame.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

metrics_dict = {column: [] for column in ('Feature', 'MSE', 'RMSE', 'R-squared')}

for feature in X.columns:
    current_feature = X[[feature]]

    X_train, X_test, y_train, y_test = train_test_split(
        current_feature, y, test_size=0.2, random_state=42
    )

    model = LinearRegression().fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse, rmse, r2 = calculate_metrics(y_test, y_pred)

    # The dict keeps insertion order, so the values line up with the columns.
    for column, value in zip(metrics_dict, (feature, mse, rmse, r2)):
        metrics_dict[column].append(value)

results_df = pd.DataFrame(metrics_dict)
print(results_df)


# Random Forest

In [None]:
def calculate_metrics(y_test, y_pred):
    """Return ``(mse, rmse, r2)`` comparing ``y_pred`` with ``y_test``.

    The RMSE is computed as ``sqrt(mse)`` instead of passing
    ``squared=False`` to ``mean_squared_error``; that keyword is deprecated
    in scikit-learn 1.4 and removed in 1.6.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Single-feature random forests: one seeded forest per explanatory series,
# scored on a random 20% hold-out.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

metrics_dict_rf = {column: [] for column in ('Feature', 'MSE', 'RMSE', 'R-squared')}

for feature in X.columns:
    current_feature = X[[feature]]

    X_train, X_test, y_train, y_test = train_test_split(
        current_feature, y, test_size=0.2, random_state=42
    )

    model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)
    y_pred_rf = model_rf.predict(X_test)

    mse_rf, rmse_rf, r2_rf = calculate_metrics(y_test, y_pred_rf)

    # The dict keeps insertion order, so the values line up with the columns.
    for column, value in zip(metrics_dict_rf, (feature, mse_rf, rmse_rf, r2_rf)):
        metrics_dict_rf[column].append(value)

results_df_rf = pd.DataFrame(metrics_dict_rf)
print(results_df_rf)


# KNN

In [None]:
def calculate_metrics(y_test, y_pred):
    """Score predictions: return the tuple ``(mse, rmse, r2)``.

    ``rmse`` is ``sqrt(mse)``. ``mean_squared_error(..., squared=False)`` is
    not used because the ``squared`` keyword is deprecated in scikit-learn
    1.4 and removed in 1.6.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Single-feature k-nearest-neighbours regressions (k = 5), one per
# explanatory series, scored on a random 20% hold-out.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

metrics_dict_knn = {column: [] for column in ('Feature', 'MSE', 'RMSE', 'R-squared')}

for feature in X.columns:
    current_feature = X[[feature]]

    X_train, X_test, y_train, y_test = train_test_split(
        current_feature, y, test_size=0.2, random_state=42
    )

    model_knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
    y_pred_knn = model_knn.predict(X_test)

    mse_knn, rmse_knn, r2_knn = calculate_metrics(y_test, y_pred_knn)

    # The dict keeps insertion order, so the values line up with the columns.
    for column, value in zip(metrics_dict_knn, (feature, mse_knn, rmse_knn, r2_knn)):
        metrics_dict_knn[column].append(value)

results_df_knn = pd.DataFrame(metrics_dict_knn)
print(results_df_knn)


# ANN

In [None]:
def calculate_metrics(y_test, y_pred):
    """Return ``(mse, rmse, r2)`` for ``y_pred`` measured against ``y_test``.

    RMSE is obtained as the square root of the MSE; the ``squared=False``
    keyword of ``mean_squared_error`` is deprecated in scikit-learn 1.4 and
    removed in 1.6, so it is not relied upon.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    return mse, rmse, r2

# Feed-forward network on all three explanatory series, random 20% hold-out.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise with statistics of the training rows only, then apply the same
# transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 32 -> 16 -> 1 dense stack with a linear output for regression.
model_ann = Sequential([
    Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(16, activation='relu'),
    Dense(1, activation='linear'),
])
model_ann.compile(optimizer='adam', loss='mean_squared_error')
model_ann.fit(X_train_scaled, y_train, epochs=50, batch_size=32, verbose=0)

# predict() returns one column; flatten it to a 1-D array.
y_pred_ann = model_ann.predict(X_test_scaled).flatten()

mse_ann, rmse_ann, r2_ann = calculate_metrics(y_test, y_pred_ann)

# Actual and predicted prices side by side, indexed like y_test.
results_ann = pd.DataFrame(
    {'Actual Prices': y_test, 'Predicted Prices (ANN)': y_pred_ann}
)

print("Actual and Predicted Prices (ANN):")
print(results_ann.head())

print("\nPerformance Metrics (ANN):")
print(f'Mean Squared Error (ANN): {mse_ann}')
print(f'Root Mean Squared Error (ANN): {rmse_ann}')
print(f'R-squared (ANN): {r2_ann}')


# LSTM

In [None]:
# Univariate setup: only the RELIANCE column is used, reshaped to one column.
target_variable = 'RELIANCE'
y = data[target_variable].values.reshape(-1, 1)

# Scale the target into [0, 1].
# NOTE(review): the scaler is fitted on the whole series before the
# train/validation/test split further down, so its min/max include later rows.
scaler = MinMaxScaler(feature_range=(0, 1))
y_scaled = scaler.fit_transform(y)

def create_sequences(data, seq_length):
    """Cut ``data`` into overlapping windows paired with their next value.

    Window ``i`` is ``data[i:i + seq_length]`` and its target is
    ``data[i + seq_length]``, giving ``len(data) - seq_length`` pairs.

    Args:
        data: Indexable, sliceable sequence (list or NumPy array).
        seq_length: Number of consecutive items in each window.

    Returns:
        ``(sequences, targets)`` as two NumPy arrays; both are empty when
        ``data`` has no more than ``seq_length`` items.
    """
    window_starts = range(len(data) - seq_length)
    sequences = [data[start:start + seq_length] for start in window_starts]
    targets = [data[start + seq_length] for start in window_starts]
    return np.array(sequences), np.array(targets)

# Train a univariate LSTM on windows of the scaled RELIANCE series, evaluate
# it on the last 15% of the windows and roll it forward for a 7-day forecast.
sequence_length = 10

X_seq, y_seq = create_sequences(y_scaled, sequence_length)

# Split in original row order (no shuffling): 70% / 15% / remainder.
train_size = int(len(X_seq) * 0.7)
val_size = int(len(X_seq) * 0.15)
test_size = len(X_seq) - train_size - val_size

X_train, y_train = X_seq[:train_size], y_seq[:train_size]
X_val, y_val = X_seq[train_size:train_size+val_size], y_seq[train_size:train_size+val_size]
X_test, y_test = X_seq[train_size+val_size:], y_seq[train_size+val_size:]

model_lstm = Sequential()
model_lstm.add(LSTM(50, input_shape=(X_train.shape[1], X_train.shape[2])))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')

model_lstm.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_val, y_val), verbose=1)

y_pred_lstm = model_lstm.predict(X_test)

# Undo the [0, 1] scaling so that errors are expressed in price units.
y_pred_lstm_inv = scaler.inverse_transform(y_pred_lstm)
y_test_inv = scaler.inverse_transform(y_test)

rmse_lstm = np.sqrt(mean_squared_error(y_test_inv, y_pred_lstm_inv))

# Target i produced by create_sequences is observation i + sequence_length,
# so the dates of the test targets start sequence_length rows after the
# split position in window space.
test_start = sequence_length + train_size + val_size
test_dates = data.index[test_start:test_start + len(y_test_inv)]

plt.figure(figsize=(12, 6))
plt.plot(test_dates, y_test_inv, label='Actual')
plt.plot(test_dates, y_pred_lstm_inv, label='LSTM Prediction')
plt.title(f'LSTM Prediction vs Actual (RMSE: {rmse_lstm:.2f})')
plt.xlabel('Date')
plt.ylabel(target_variable)
plt.legend()
plt.show()

error_percentage = (rmse_lstm / np.mean(y_test_inv)) * 100

print(f'Root Mean Squared Error (RMSE): {rmse_lstm:.2f}')
print(f'Error Percentage: {error_percentage:.2f}%')

# Recursive multi-step forecast: start from the most recent window of the
# series, predict one step, append the prediction and drop the oldest value.
# (Predicting on the first test windows would only re-predict known history.)
forecast_horizon = 7
window = y_scaled[-sequence_length:].copy()
forecasted_values_scaled = []
for _ in range(forecast_horizon):
    next_scaled = model_lstm.predict(window[np.newaxis, ...], verbose=0)[0][0]
    forecasted_values_scaled.append(next_scaled)
    window = np.vstack([window[1:], [[next_scaled]]])

forecasted_values = scaler.inverse_transform(np.array(forecasted_values_scaled).reshape(-1, 1))
# One date per forecast, starting the day after the last observation.
# NOTE(review): these are calendar days, so non-trading days are included.
forecasted_dates = pd.date_range(
    data.index[-1] + pd.Timedelta(days=1), periods=forecast_horizon, freq='D'
)

print("Forecasted stock prices for the upcoming 7 days:")
for date, price in zip(forecasted_dates, forecasted_values):
    print(f"{date.strftime('%Y-%m-%d')}: {price[0]}")




# All models together

In [None]:
def calculate_metrics(y_test, y_pred):
    """Return ``(mse, rmse, mae)`` for predictions ``y_pred`` against ``y_test``.

    Unlike the earlier versions of this helper in the notebook, the third
    value is the mean absolute error, not R-squared. RMSE is computed as
    ``sqrt(mse)`` because the ``squared=False`` keyword of
    ``mean_squared_error`` is deprecated in scikit-learn 1.4 and removed
    in 1.6.
    """
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_test, y_pred)
    return mse, rmse, mae

# Train every model and collect the errors in one comparison table.
X = data[['GOLD', 'PETROL', 'CURRENCY']]
y = data['RELIANCE']

# Random 70/15/15 split used by the tabular models (LR, RF, KNN, ANN).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=40)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Standardised features (statistics from the training rows only).
scaler_ann = StandardScaler()
X_train_scaled_ann = scaler_ann.fit_transform(X_train)
X_test_scaled_ann = scaler_ann.transform(X_test)
X_val_scaled_ann = scaler_ann.transform(X_val)

# The LSTM learns from windows of consecutive observations, so it cannot use
# the shuffled subsets above: windows cut from randomly ordered rows are not
# a time series. It gets its own chronological 70/15/15 split of the target,
# with the scaler fitted on the training part only.
y_ordered = y.sort_index().values.reshape(-1, 1)
lstm_train_end = int(len(y_ordered) * 0.7)
lstm_val_end = int(len(y_ordered) * 0.85)
scaler_lstm = MinMaxScaler(feature_range=(0, 1))
y_train_scaled_lstm = scaler_lstm.fit_transform(y_ordered[:lstm_train_end])
y_val_scaled_lstm = scaler_lstm.transform(y_ordered[lstm_train_end:lstm_val_end])
y_test_scaled_lstm = scaler_lstm.transform(y_ordered[lstm_val_end:])

# --- Linear regression -----------------------------------------------------
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)
mse_lr, rmse_lr, mae_lr = calculate_metrics(y_test, y_pred_lr)
error_percentage_lr = (mae_lr / y_test.mean()) * 100

# --- Random forest ---------------------------------------------------------
model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
mse_rf, rmse_rf, mae_rf = calculate_metrics(y_test, y_pred_rf)
error_percentage_rf = (mae_rf / y_test.mean()) * 100

# --- k-nearest neighbours --------------------------------------------------
# Per-feature diagnostics: one single-feature model per column.
metrics_dict_knn = {'Feature': [], 'MSE': [], 'RMSE': [], 'MAE': []}
for feature in X.columns:
    model_knn_single = KNeighborsRegressor(n_neighbors=5)
    model_knn_single.fit(X_train[[feature]], y_train)
    y_pred_knn_single = model_knn_single.predict(X_test[[feature]])
    mse_single, rmse_single, mae_single = calculate_metrics(y_test, y_pred_knn_single)
    metrics_dict_knn['Feature'].append(feature)
    metrics_dict_knn['MSE'].append(mse_single)
    metrics_dict_knn['RMSE'].append(rmse_single)
    metrics_dict_knn['MAE'].append(mae_single)

metrics_df_knn = pd.DataFrame(metrics_dict_knn)

# Model used in the comparison: all three features, like LR/RF/ANN, so that
# its predictions and its row in the error table describe the same model.
# The standardised features are used because KNN is distance based.
model_knn = KNeighborsRegressor(n_neighbors=5)
model_knn.fit(X_train_scaled_ann, y_train)
y_pred_knn = model_knn.predict(X_test_scaled_ann)
mse_knn, rmse_knn, mae_knn = calculate_metrics(y_test, y_pred_knn)
error_percentage_knn = (mae_knn / y_test.mean()) * 100

# --- Feed-forward network --------------------------------------------------
model_ann = Sequential()
model_ann.add(Dense(32, activation='relu', input_shape=(X_train.shape[1],)))
model_ann.add(Dense(16, activation='relu'))
model_ann.add(Dense(1, activation='linear'))
model_ann.compile(optimizer='adam', loss='mean_squared_error')
model_ann.fit(X_train_scaled_ann, y_train, epochs=50, batch_size=32, verbose=0)
y_pred_ann = model_ann.predict(X_test_scaled_ann).flatten()
mse_ann, rmse_ann, mae_ann = calculate_metrics(y_test, y_pred_ann)
error_percentage_ann = (mae_ann / y_test.mean()) * 100

# --- LSTM ------------------------------------------------------------------
sequence_length_lstm = 10
X_seq_lstm, y_seq_lstm = create_sequences(y_train_scaled_lstm, sequence_length_lstm)
X_seq_val_lstm, y_seq_val_lstm = create_sequences(y_val_scaled_lstm, sequence_length_lstm)
X_seq_test_lstm, y_seq_test_lstm = create_sequences(y_test_scaled_lstm, sequence_length_lstm)

model_lstm = Sequential()
model_lstm.add(LSTM(50, input_shape=(X_seq_lstm.shape[1], X_seq_lstm.shape[2])))
model_lstm.add(Dense(1))
model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_seq_lstm, y_seq_lstm, epochs=50, batch_size=32, validation_data=(X_seq_val_lstm, y_seq_val_lstm), verbose=0)
y_pred_lstm = model_lstm.predict(X_seq_test_lstm)

# Back to price units before scoring.
y_pred_lstm_inv = scaler_lstm.inverse_transform(y_pred_lstm.reshape(-1, 1)).flatten()
y_test_inv_lstm = scaler_lstm.inverse_transform(y_seq_test_lstm.reshape(-1, 1)).flatten()
mse_lstm = mean_squared_error(y_test_inv_lstm, y_pred_lstm_inv)
rmse_lstm = np.sqrt(mse_lstm)
mae_lstm = mean_absolute_error(y_test_inv_lstm, y_pred_lstm_inv)
error_percentage_lstm = (mae_lstm / y_test_inv_lstm.mean()) * 100

# --- Results ---------------------------------------------------------------
results_lr = pd.DataFrame({'Actual': y_test, 'Predicted_LR': y_pred_lr})
results_rf = pd.DataFrame({'Actual': y_test, 'Predicted_RF': y_pred_rf})
results_knn = pd.DataFrame({'Actual': y_test, 'Predicted_KNN': y_pred_knn})
results_ann = pd.DataFrame({'Actual': y_test, 'Predicted_ANN': y_pred_ann})
results_lstm = pd.DataFrame({'Actual': y_test_inv_lstm, 'Predicted_LSTM': y_pred_lstm_inv})

# "Error Percentage" is MAE relative to the mean actual value of the test set.
# NOTE: the LSTM is scored on its chronological test windows, the other
# models on the random hold-out, so its row uses different test rows.
error_df = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'k-Nearest Neighbors', 'Artificial Neural Network', 'LSTM'],
    'MSE': [mse_lr, mse_rf, mse_knn, mse_ann, mse_lstm],
    'RMSE': [rmse_lr, rmse_rf, rmse_knn, rmse_ann, rmse_lstm],
    'MAE': [mae_lr, mae_rf, mae_knn, mae_ann, mae_lstm],
    'Error Percentage': [error_percentage_lr, error_percentage_rf, error_percentage_knn, error_percentage_ann, error_percentage_lstm]
})

print("Results for Linear Regression:")
print(results_lr.head())
print("\nResults for Random Forest:")
print(results_rf.head())
print("\nResults for k-Nearest Neighbors:")
print(results_knn.head())
print("\nResults for Artificial Neural Network:")
print(results_ann.head())
print("\nResults for LSTM:")
print(results_lstm.head())

print("\nError Table:")
print(error_df)

error_df.to_csv('regression_results.csv', index=False)


# SENTIMENT ANALYSIS

In [None]:
# Score news headlines/content with VADER and regress the closing price on
# the two compound sentiment scores with a random forest.
nltk.download('vader_lexicon')

df = pd.read_csv('sentiment.csv')
# Assign the filled columns back. `df[col].fillna('', inplace=True)` is a
# chained in-place update: pandas 2.x warns about it and under copy-on-write
# it leaves `df` unchanged.
df['Headline'] = df['Headline'].fillna('')
df['Content'] = df['Content'].fillna('')

# VADER compound score for each headline and each article body.
sid = SentimentIntensityAnalyzer()
df['Headline_Sentiment'] = df['Headline'].apply(lambda x: sid.polarity_scores(str(x))['compound'])
df['Content_Sentiment'] = df['Content'].apply(lambda x: sid.polarity_scores(str(x))['compound'])

df.to_csv('sentiment_with_scores.csv', index=False)

X = df[['Headline_Sentiment', 'Content_Sentiment']]
y = df['Close']

# Random 70/15/15 split; the validation part is created but not used below.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Seeded like the other random forests in this notebook so that the reported
# errors are reproducible between runs.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

test_preds = model.predict(X_test)

# Give actual and predicted values the same 0..n-1 index for plotting.
y_test = y_test.reset_index(drop=True)
test_preds = pd.Series(test_preds, name='Predicted')

mse_test = mean_squared_error(y_test, test_preds)
rmse_test = sqrt(mse_test)
mae_test = mean_absolute_error(y_test, test_preds)

# Errors relative to the observed range of the closing price. MSE is in
# squared price units, so it is normalised by the squared range; dividing it
# by the plain range would mix units.
range_close = y.max() - y.min()
mse_percentage = (mse_test / range_close ** 2) * 100
rmse_percentage = (rmse_test / range_close) * 100
mae_percentage = (mae_test / range_close) * 100

table = [['Metric', 'Error', 'Error Percentage'],
         ['MSE', mse_test, mse_percentage],
         ['RMSE', rmse_test, rmse_percentage],
         ['MAE', mae_test, mae_percentage]]

print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

plt.figure(figsize=(12, 6))
plt.plot(y_test.index, y_test, label='Actual Close Prices', color='blue')
plt.plot(y_test.index, test_preds, label='Predicted Close Prices', color='orange')
plt.xlabel('Index')
plt.ylabel('Close Prices')
plt.title('Actual vs. Predicted Close Prices')
plt.legend()

plt.show()
