# Stock Market Strategy Prediction

## Import Dataset

In [None]:
import numpy as np
import pandas as pd
# Load both CSV files; the first column of each file becomes the row index.
train = pd.read_csv(filepath_or_buffer='/train.csv', index_col=0)
test = pd.read_csv(filepath_or_buffer='/test.csv', index_col=0)

## Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Independent (deep) copy of the test frame used for the exploration below.
df = test.copy(deep=True)

In [None]:
# Number of missing values in each column of the test copy.
print(df.isna().sum())

In [None]:
# Independent (deep) copy of the training frame used for the exploration below.
df2 = train.copy(deep=True)

In [None]:
# Number of missing values in each column of the training copy.
print(df2.isna().sum())

In [None]:
# Column dtypes of the test copy, followed by those of the training copy.
print(df.dtypes, df2.dtypes, sep='\n')

In [None]:
# Histogram of the 'Open' column of the test copy.
plt.hist(x=df.loc[:, 'Open'])
plt.show()

In [None]:
# Histogram of the 'Volume' column of the test copy.
plt.hist(x=df.loc[:, 'Volume'])
plt.show()

In [None]:
# Histogram of the 'Open' column of the training copy.
plt.hist(x=df2.loc[:, 'Open'])
plt.show()

In [None]:
# Histogram of the 'Volume' column of the training copy.
plt.hist(x=df2.loc[:, 'Volume'])
plt.show()

In [None]:
# Summary statistics of the training copy, one column after another:
# Open, then Volume, then Close.
print(
    *(df2[column_name].describe() for column_name in ('Open', 'Volume', 'Close')),
    sep='\n',
)

In [None]:
# Pairwise correlations between the three numeric columns of the training
# copy, printed in this order: Open/Close, Open/Volume, Close/Volume.
# (The last pair previously repeated Open/Volume, so Close/Volume was never
# reported.)
print(df2['Open'].corr(df2['Close']))
print(df2['Open'].corr(df2['Volume']))
print(df2['Close'].corr(df2['Volume']))

In [None]:
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

## Autocorrelation Plot

In [None]:
from pandas.plotting import autocorrelation_plot
# Overlay the autocorrelation curves of Close, Open and Volume (training
# copy) on the same axes. Each curve gets a label so the legend tells them
# apart; without labels the three lines are indistinguishable.
autocorrelation_plot(df2['Close'], label='Close')
autocorrelation_plot(df2['Open'], label='Open')
autocorrelation_plot(df2['Volume'], label='Volume')
plt.legend()
plt.show()

## Scatter Matrix

In [None]:
from pandas.plotting import scatter_matrix
# Scatter matrix of the Open, Close and Volume columns of the training copy.
scatter_matrix(
    frame=df2.loc[:, ['Open', 'Close', 'Volume']],
    alpha=0.2,
    figsize=(6, 6),
)
plt.show()

# Shape of the Close distribution: skewness first, kurtosis second.
print(df2['Close'].skew(), df2['Close'].kurt(), sep='\n')

In [None]:
# Shape of the Open distribution: skewness first, kurtosis second.
print(df2['Open'].skew(), df2['Open'].kurt(), sep='\n')

In [None]:
# Shape of the Volume distribution: skewness first, kurtosis second.
print(df2['Volume'].skew(), df2['Volume'].kurt(), sep='\n')

## ACF and PACF Plots

In [None]:
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Load the training data used for the correlograms.
data = pd.read_csv('/kaggle/input/dataset/train.csv')

# Parse the 'Date' column and use it as the index.
data['Date'] = pd.to_datetime(data['Date'])
data = data.set_index('Date')

# Series whose autocorrelation structure is plotted.
numerical_column = data['Close']

# plot_acf / plot_pacf open a figure of their own unless an Axes is passed in.
# Calling plt.figure(figsize=...) beforehand therefore only produced an empty
# figure and the requested size was ignored. Create the sized Axes first and
# hand it to the plotting function instead.

# Plot ACF (Autocorrelation Function)
fig, ax = plt.subplots(figsize=(12, 10))
plot_acf(numerical_column, lags=40, ax=ax)  # Adjust 'lags' as needed
ax.set_title('Autocorrelation Function (ACF)')
plt.show()

# Plot PACF (Partial Autocorrelation Function)
fig, ax = plt.subplots(figsize=(12, 6))
plot_pacf(numerical_column, lags=20, ax=ax)  # Adjust 'lags' as needed
ax.set_title('Partial Autocorrelation Function (PACF)')
plt.show()


In [None]:
# Series whose autocorrelation structure is plotted (reuses the `data` frame
# loaded by the previous cell).
numerical_column = data['Open']

# plot_acf / plot_pacf open a figure of their own unless an Axes is passed in.
# Calling plt.figure(figsize=...) beforehand therefore only produced an empty
# figure and the requested size was ignored. Create the sized Axes first and
# hand it to the plotting function instead.

# Plot ACF (Autocorrelation Function)
fig, ax = plt.subplots(figsize=(12, 6))
plot_acf(numerical_column, lags=40, ax=ax)  # Adjust 'lags' as needed
ax.set_title('Autocorrelation Function (ACF)')
plt.show()

# Plot PACF (Partial Autocorrelation Function)
fig, ax = plt.subplots(figsize=(12, 6))
plot_pacf(numerical_column, lags=40, ax=ax)  # Adjust 'lags' as needed
ax.set_title('Partial Autocorrelation Function (PACF)')
plt.show()

## ADF Test

In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

# Reload the training data, parse 'Date' and make it the index.
data = pd.read_csv('/kaggle/input/dataset/train.csv')
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# Column under test.
numerical_column = data['Close']

# Augmented Dickey-Fuller test: the first two entries of the returned tuple
# are the test statistic and its p-value.
result = adfuller(numerical_column)
adf_statistic, p_value = result[0], result[1]

# Report the raw numbers.
print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')

# Verdict at the 0.05 significance level.
print(
    'Null hypothesis (non-stationary) rejected. Data is likely stationary.'
    if p_value <= 0.05
    else 'Null hypothesis not rejected. Data may be non-stationary.'
)


In [None]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller

# Reload the training data, parse 'Date' and make it the index.
data = pd.read_csv('/kaggle/input/dataset/train.csv')
data = data.assign(Date=pd.to_datetime(data['Date'])).set_index('Date')

# Column under test.
numerical_column = data['Volume']

# Augmented Dickey-Fuller test: the first two entries of the returned tuple
# are the test statistic and its p-value.
result = adfuller(numerical_column)
adf_statistic, p_value = result[0], result[1]

# Report the raw numbers.
print(f'ADF Statistic: {adf_statistic}')
print(f'p-value: {p_value}')

# Verdict at the 0.05 significance level.
print(
    'Null hypothesis (non-stationary) rejected. Data is likely stationary.'
    if p_value <= 0.05
    else 'Null hypothesis not rejected. Data may be non-stationary.'
)


## Model Building

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ---------------------------------------------------------------------------
# Build the ARIMAX model for 'Close'.
#   1. load the training CSV and derive calendar, lag, rolling and interaction
#      features from 'Open' and 'Volume';
#   2. score a model fitted on the first 80% of rows against the last 20%;
#   3. refit on every row and leave the result in `arimax_model_fit`, which
#      the prediction cell uses.
# ---------------------------------------------------------------------------
train_data = pd.read_csv('train.csv')
train_data['Date'] = pd.to_datetime(train_data['Date'])
train_data['Month'] = train_data['Date'].dt.month
train_data['Day'] = train_data['Date'].dt.day
train_data['Day_of_Week'] = train_data['Date'].dt.dayofweek
train_data['Year'] = train_data['Date'].dt.year
from sklearn.preprocessing import MinMaxScaler

# Min-max scale Open and Volume into extra '*_normalized' columns; the raw
# columns are left as they are. The fitted scaler stays available in `scaler`.
columns_to_normalize = ['Open', 'Volume']
scaler = MinMaxScaler()
train_set_normalized = train_data.copy()  # Create a copy of the dataset to avoid altering the original
normalized_values = scaler.fit_transform(train_set_normalized[columns_to_normalize])
normalized_columns = [col + '_normalized' for col in columns_to_normalize]
train_data[normalized_columns] = normalized_values

# Lag features. shift(i) leaves the first i rows empty; those gaps are filled
# with the mean of the SAME lag column (Volume lags used to be filled with the
# Open lag mean). The filled Series is assigned back instead of calling
# fillna(inplace=True) on a column selection, which is chained assignment and
# is not guaranteed to write through to the frame under pandas Copy-on-Write.
num_lags = 3
for i in range(1, num_lags + 1):
    for source_col in ('Open', 'Volume'):
        lag_col = f'{source_col}_lag_{i}'
        lagged = train_data[source_col].shift(i)
        train_data[lag_col] = lagged.fillna(lagged.mean())

# Rolling mean / standard deviation over a 5-row window of the raw columns.
# The first window_size - 1 rows have no full window and are filled with the
# column mean.
window_size = 5
rolling_cols = ['Open', 'Volume']
for col in rolling_cols:
    windowed = train_data[col].rolling(window=window_size)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()
    train_data[f'{col}_rolling_mean'] = rolling_mean.fillna(rolling_mean.mean())
    train_data[f'{col}_rolling_std'] = rolling_std.fillna(rolling_std.mean())

train_data['Open_Volume_interaction'] = train_data['Open'] * train_data['Volume']

# Chronological 80/20 split (no shuffling): first part fits, last part scores.
total_samples = len(train_data)
train_size = int(total_samples * 0.8)
train_set = train_data.iloc[:train_size]
test_set = train_data.iloc[train_size:]

# ARIMA order (p, d, q)
p = 0
d = 1
q = 1

exog_vars = ['Open', 'Volume', 'Open_Volume_interaction', 'Open_rolling_mean', 'Open_rolling_std', 'Volume_rolling_mean', 'Volume_rolling_std', 'Open_lag_1', 'Volume_lag_1', 'Open_lag_2', 'Volume_lag_2', 'Open_lag_3', 'Volume_lag_3']  # List of exogenous variables

# Hold-out evaluation: fit on train_set only, so the forecast that follows it
# lines up with test_set and the score is not computed on rows the model has
# already seen.
holdout_model = SARIMAX(train_set['Close'], order=(p, d, q), exog=train_set[exog_vars])
holdout_model_fit = holdout_model.fit()
forecast = holdout_model_fit.forecast(steps=len(test_set), exog=test_set[exog_vars]).tolist()

# Symmetric mean absolute percentage error, in the variant without the
# factor 2, i.e. mean(|F - A| / (|F| + |A|)), which lies in [0, 1].
test_actual = test_set['Close']
predicted_values = np.asarray(forecast)
actual_values = test_actual.to_numpy()
smape = np.mean(np.abs(predicted_values - actual_values) / (np.abs(predicted_values) + np.abs(actual_values)))
print(smape)

# Final model: refit on all training rows so that forecasts made from
# `arimax_model_fit` start right after the last training observation.
arimax_model = SARIMAX(train_data['Close'], order=(p, d, q), exog=train_data[exog_vars])
arimax_model_fit = arimax_model.fit()

## Model Prediction for Closing Price

In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.preprocessing import MinMaxScaler

# Forecast 'Close' for the rows of test.csv with the model fitted in the
# model-building cell (`arimax_model_fit`), after rebuilding the same
# engineered features on the test data.

# Read the test data
test_data = pd.read_csv('test.csv')

# Calendar features derived from 'Date', as for the training data
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data['Month'] = test_data['Date'].dt.month
test_data['Day'] = test_data['Date'].dt.day
test_data['Day_of_Week'] = test_data['Date'].dt.dayofweek
test_data['Year'] = test_data['Date'].dt.year

# Normalize columns with the scaler that was fitted on the TRAINING data in
# the model-building cell. Fitting a fresh scaler on the test rows would put
# train and test values on different scales.
normalized_values = scaler.transform(test_data[['Open', 'Volume']])
normalized_columns = ['Open_normalized', 'Volume_normalized']
test_data[normalized_columns] = normalized_values

# Lag features. shift(i) leaves the first i rows empty; those gaps are filled
# with the mean of the SAME lag column (Volume lags used to be filled with the
# Open lag mean). The filled Series is assigned back instead of calling
# fillna(inplace=True) on a column selection, which is chained assignment and
# is not guaranteed to write through to the frame under pandas Copy-on-Write.
num_lags = 3
for i in range(1, num_lags + 1):
    for source_col in ('Open', 'Volume'):
        lag_col = f'{source_col}_lag_{i}'
        lagged = test_data[source_col].shift(i)
        test_data[lag_col] = lagged.fillna(lagged.mean())

# Rolling statistics over a 5-row window; rows without a full window are
# filled with the column mean.
window_size = 5
for source_col in ('Open', 'Volume'):
    windowed = test_data[source_col].rolling(window=window_size)
    rolling_mean = windowed.mean()
    rolling_std = windowed.std()
    test_data[f'{source_col}_rolling_mean'] = rolling_mean.fillna(rolling_mean.mean())
    test_data[f'{source_col}_rolling_std'] = rolling_std.fillna(rolling_std.mean())

# Interaction feature
test_data['Open_Volume_interaction'] = test_data['Open'] * test_data['Volume']

# Features used for prediction (same list and order as used to fit the model)
exog_vars = ['Open', 'Volume', 'Open_Volume_interaction', 'Open_rolling_mean', 'Open_rolling_std', 'Volume_rolling_mean', 'Volume_rolling_std', 'Open_lag_1', 'Volume_lag_1', 'Open_lag_2', 'Volume_lag_2', 'Open_lag_3', 'Volume_lag_3']

# One forecast step per test row, continuing from the end of the data the
# model was fitted on.
# NOTE(review): this assumes the test rows directly follow the training rows
# in time - confirm against the dataset.
forecast = arimax_model_fit.forecast(steps=len(test_data), exog=test_data[exog_vars])
close_values = forecast.tolist()

# The forecast is converted to a plain list so it is matched to the test rows
# by position rather than by the forecast's own index.
results = pd.DataFrame({
    'id': test_data['id'],
    'date': test_data['Date'],
    'Close': close_values
})


# Save the updated 'test.csv' with predicted 'Close' values to a new file