### Import libraries

In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings; warnings.filterwarnings("ignore")
from math import sqrt
from nsepy import get_history as gh
import datetime as dt

from sklearn.model_selection import train_test_split
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
#import optuna.integration.lightgbm as lgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
import xgboost as xgb

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from IPython.display import display
import gc

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, confusion_matrix, mean_absolute_error

### Load data

In [3]:
supplemental_prices = pd.read_csv('supplemental_files/stock_prices.csv')
supplemental_sprices = pd.read_csv('supplemental_files/secondary_stock_prices.csv')
prices = pd.read_csv('train_files/stock_prices.csv')
stock_list = pd.read_csv('stock_list.csv')
sprices = pd.read_csv('train_files/secondary_stock_prices.csv')

In [None]:
display(supplemental_prices.info())
display(supplemental_prices['Date'].unique())
display(prices.info())
display(stock_list.info())
display(stock_list.head(2))

In [4]:
prices=prices.append(sprices,ignore_index=True)
prices=prices.append(supplemental_prices,ignore_index=True)
prices=prices.append(supplemental_sprices,ignore_index=True)
prices=prices.drop(['RowId','ExpectedDividend'],axis=1)
prices=prices.dropna()

In [None]:
plt.scatter(prices['Date'],prices['Target'])

In [None]:
prices

In [5]:
prices['Date'] = prices['Date'].str.replace('-','')
train_prices = prices[prices['Date']<'20220201']
train_prices = train_prices.drop(['Date'],axis=1)
test_prices = prices[prices['Date']>='20220201']
test_prices = test_prices.drop(['Date'],axis=1)

In [None]:
gc.collect()

In [6]:
y_train = train_prices.pop('Target')
y_test = test_prices.pop('Target')
X_train = train_prices
X_test = test_prices


In [None]:
def featuring(train):
    dfa=pd.DataFrame()
    for code in train['SecuritiesCode'].unique():
        df=train[train['SecuritiesCode']==code]
        df=df.sort_values(by=['Date'], ascending=True)
        df['RA_20'] = df.Close.rolling(5, min_periods=1).mean()
        df['RA_40'] = df.Close.rolling(10, min_periods=1).mean()
        df['RA_60'] = df.Close.rolling(15, min_periods=1).mean()
        df['RA_80'] = df.Close.rolling(20, min_periods=1).mean()
        dfa=dfa.append(df)
    dfa['Quarter'] = dfa['Date'].apply(lambda time: pd.Timestamp(time).quarter)
    dfa['year']=pd.to_numeric(dfa['Date'].str[0:4]).astype(float)
    dfa['month']=pd.to_numeric(dfa['Date'].str[5:7]).astype(float)
    dfa['day']=pd.to_numeric(dfa['Date'].str[8:10]).astype(float)
    dfa['delta']=pd.to_numeric(dfa['High']-dfa['Low']).astype(float)
    dfa['change']=pd.to_numeric(dfa['Close']-dfa['Open']).astype(float)
    dfa=dfa[['Date','SecuritiesCode','delta','change','RA_20','RA_40','RA_60','year','month','day','Quarter']]
    train=train.merge(dfa,how='left',on=['Date','SecuritiesCode'],suffixes=('', 'b')).set_axis(train.index)
    train=train.drop(['Date'],axis=1)
    #train=train.merge(stock_list, how='inner',on='SecuritiesCode',suffixes=('', 'b')).set_axis(train.index)
    #train=train.drop(['EffectiveDate','Name','33SectorName','17SectorName','NewIndexSeriesSize','TradeDate','Closeb'],axis=1)
    #dfa=dfa.join(stock_list,how='left',on='SecuritiesCode',rsuffix='b')
    #dfa=dfa.drop(['SecuritiesCodeb','Name', 'NewMarketSegment','33SectorCode','33SectorName','17SectorCode','17SectorName','NewIndexSeriesSizeCode', 'NewIndexSeriesSize',
    #   'TradeDate','Closeb','Universe0'],axis=1)
    #dfa['Section']=label_encoder.fit_transform(dfa['Section/Products'])
    #dfa=dfa.drop(['Section/Products'],axis=1)
    #dfa.sort_index(inplace=True)
    return train

In [None]:
X_train=featuring(X_train)
X_test=featuring(X_test)


## Preprocessing

In [7]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Linear Regression

In [8]:
## Train the model
model_linear = LinearRegression()
model_linear.fit(X_train, y_train)

## Predict in test data
y_pred_linear = model_linear.predict(X_test)

In [9]:
##Evaluate the model
print("Linear Regression")

#Mean Squared Error (MSE)
mse_linear = mean_squared_error(y_test, y_pred_linear)
#Mean Absolute Error (MAE)
mae_linear = mean_absolute_error(y_test, y_pred_linear)
#Mean Absolute Percentage Error (MAPE)
mape_linear = mean_absolute_percentage_error(y_test, y_pred_linear)
#Root Mean Squared Error (RMSE)
rmse_linear = mean_squared_error(y_test, y_pred_linear, squared = False)

print("Mean Squared Error: ", mse_linear)
print("Mean Absolute Error: ", mae_linear)
print("Mean Absolute Percentage Error: ", mape_linear)
print("Root Mean Squared Error: ", rmse_linear)

Linear Regression
Mean Squared Error:  0.000695751848338965
Mean Absolute Error:  0.016623766542252127
Mean Absolute Percentage Error:  89837688114.40349
Root Mean Squared Error:  0.02637710841504362


## k - Nearest Neighbor

In [10]:
## Train the model
model_knn = KNeighborsRegressor(n_neighbors = 50)
model_knn.fit(X_train, y_train)

## Predict in test data
y_pred_knn = model_knn.predict(X_test)

In [11]:
##Evaluate the model
print("k-Nearest Neighbor")

#Mean Squared Error (MSE)
mse_knn = mean_squared_error(y_test, y_pred_knn)
#Mean Absolute Error (MAE)
mae_knn = mean_absolute_error(y_test, y_pred_knn)
#Mean Absolute Percentage Error (MAPE)
mape_knn = mean_absolute_percentage_error(y_test, y_pred_knn)
#Root Mean Squared Error (RMSE)
rmse_knn = mean_squared_error(y_test, y_pred_knn, squared = False)

print("Mean Squared Error: ", mse_knn)
print("Mean Absolute Error: ", mae_knn)
print("Mean Absolute Percentage Error: ", mape_knn)
print("Root Mean Squared Error: ", rmse_knn)

k-Nearest Neighbor
Mean Squared Error:  0.0007095019648995752
Mean Absolute Error:  0.01695586332460318
Mean Absolute Percentage Error:  727315081942.3413
Root Mean Squared Error:  0.026636478087381885


## Support Vector Regressor

In [None]:
model_svr = SVR(kernel='linear') # Linear Kernel
#Train the model using the training sets
model_svr = model_svr.fit(X_train, y_train)
#Predict the response for test dataset
y_pred_svr = model_svr.predict(X_test)

In [None]:
##Evaluate the model
print("Support Vector Regressor")

#Mean Squared Error (MSE)
mse_svr = mean_squared_error(y_test, y_pred_svr)
#Mean Absolute Error (MAE)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
#Mean Absolute Percentage Error (MAPE)
mape_svr = mean_absolute_percentage_error(y_test, y_pred_svr)
#Root Mean Squared Error (RMSE)
rmse_svr = mean_squared_error(y_test, y_pred_svr, squared = False)

print("Mean Squared Error: ", mse_svr)
print("Mean Absolute Error: ", mae_svr)
print("Mean Absolute Percentage Error: ", mape_svr)
print("Root Mean Squared Error: ", rmse_svr)

## AdaBoost

In [None]:
## Train the model
model_adaboost = AdaBoostRegressor(n_estimators = 100, learning_rate = 3, random_state = 0)
model_adaboost = model_adaboost.fit(X_train, y_train)

## Predict in test data
y_pred_adaboost = model_adaboost.predict(X_test)

In [None]:
##Evaluate the model

#Mean Squared Error (MSE)
mse_adaboost = mean_squared_error(y_test, y_pred_adaboost)
#Mean Absolute Error (MAE)
mae_adaboost = mean_absolute_error(y_test, y_pred_adaboost)
#Mean Absolute Percentage Error (MAPE)
mape_adaboost = mean_absolute_percentage_error(y_test, y_pred_adaboost)
#Root Mean Squared Error (RMSE)
rmse_adaboost = mean_squared_error(y_test, y_pred_adaboost, squared = False)

print("Mean Squared Error: ", mse_adaboost)
print("Mean Absolute Error: ", mae_adaboost)
print("Mean Absolute Percentage Error: ", mape_adaboost)
print("Root Mean Squared Error: ", rmse_adaboost)

## Random Forest

In [None]:
## Train the model
model_random_forest = RandomForestRegressor(n_estimators = 1000, random_state = 42)
model_random_forest = model_random_forest.fit(X_train, y_train);

## Predict in test data
y_pred_forest = model_random_forest.predict(X_test)

In [None]:
##Evaluate the model

#Mean Squared Error (MSE)
mse_random_forest = mean_squared_error(y_test, y_pred_forest)
#Mean Absolute Error (MAE)
mae_random_forest = mean_absolute_error(y_test, y_pred_forest)
#Mean Absolute Percentage Error (MAPE)
mape_random_forest = mean_absolute_percentage_error(y_test, y_pred_forest)
#Root Mean Squared Error (RMSE)
rmse_random_forest = mean_squared_error(y_test, y_pred_forest, squared = False)

print("Mean Squared Error: ", mse_random_forest)
print("Mean Absolute Error: ", mae_random_forest)
print("Mean Absolute Percentage Error: ", mape_random_forest)
print("Root Mean Squared Error: ", rmse_random_forest)

## XGBoost

In [None]:
## Train the model
model_xgboost = xgb.XGBRegressor(n_estimators=800,
                                 max_depth=16,
                                 learning_rate=0.01,
                                 subsample=0.5,
                                 colsample_bytree=0.75,
                                 missing=-999,
                                 random_state=2020,
                                 tree_method='gpu_hist')

model_xgboost.fit(X_train, y_train, early_stopping_rounds=20, eval_set=[(X_test, y_test)], verbose=1)

## Predict in test data
y_pred_xgboost = model_xgboost.predict(X_test)

In [None]:
##Evaluate the model

#Mean Squared Error (MSE)
mse_xgboost = mean_squared_error(y_test, y_pred_xgboost)
#Mean Absolute Error (MAE)
mae_xgboost = mean_absolute_error(y_test, y_pred_xgboost)
#Mean Absolute Percentage Error (MAPE)
mape_xgboost = mean_absolute_percentage_error(y_test, y_pred_xgboost)
#Root Mean Squared Error (RMSE)
rmse_xgboost = mean_squared_error(y_test, y_pred_xgboost, squared = False)

print("Mean Squared Error: ", mse_xgboost)
print("Mean Absolute Error: ", mae_xgboost)
print("Mean Absolute Percentage Error: ", mape_xgboost)
print("Root Mean Squared Error: ", rmse_xgboost)

In [1]:


import numpy as np
import pandas as pd


def calc_spread_return_sharpe(df: pd.DataFrame, portfolio_size: int = 200, toprank_weight_ratio: float = 2) -> float:
    """
    Args:
        df (pd.DataFrame): predicted results
        portfolio_size (int): # of equities to buy/sell
        toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
    Returns:
        (float): sharpe ratio
    """
    def _calc_spread_return_per_day(df, portfolio_size, toprank_weight_ratio):
        """
        Args:
            df (pd.DataFrame): predicted results
            portfolio_size (int): # of equities to buy/sell
            toprank_weight_ratio (float): the relative weight of the most highly ranked stock compared to the least.
        Returns:
            (float): spread return
        """
        assert df['Rank'].min() == 0
        assert df['Rank'].max() == len(df['Rank']) - 1
        weights = np.linspace(start=toprank_weight_ratio, stop=1, num=portfolio_size)
        purchase = (df.sort_values(by='Rank')['Target'][:portfolio_size] * weights).sum() / weights.mean()
        short = (df.sort_values(by='Rank', ascending=False)['Target'][:portfolio_size] * weights).sum() / weights.mean()
        return purchase - short

    buf = df.groupby('Date').apply(_calc_spread_return_per_day, portfolio_size, toprank_weight_ratio)
    sharpe_ratio = buf.mean() / buf.std()
    return sharpe_ratio



## Long Short-Term Memory (LSTM)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, Bidirectional

In [None]:
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape = (train_seq.shape[1], train_seq.shape[2])))

model.add(Dropout(0.1)) 
model.add(LSTM(units=50))

model.add(Dense(2))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

model.summary()


In [None]:
model.fit(X_train, y_train, epochs=80,validation_data=(X_test, y_test), verbose=1)
y_pred_lstm = model.predict(X_test)
test_inverse_predicted = MMS.inverse_transform(y_pred_lstm)

In [None]:
price = pd.read_csv('supplemental_files/stock_prices.csv', parse_dates=["Date"])
prices.drop(['RowId','ExpectedDividend'], axis=1,inplace = True)
df = df[df['Open'].notna()]
df = df[df['Target'].notna()]
df2 = df.copy()
df2['Date'] = df['Date'].dt.date
df2['date_delta'] = (df2['Date'] - df2['Date'].min())  / np.timedelta64(1,'D')
df2['weekday'] = df2['Date'].apply(lambda time: time.weekday())
df2['Quarter'] = df2['Date'].apply(lambda time: pd.Timestamp(time).quarter)
df2.drop(['Date'],axis=1,inplace = True)

In [None]:
train_set = data2.iloc[:, 1:2].values
sc = MinMaxScaler(feature_range = (0, 1))
training_set_scaled = sc.fit_transform(train_set)
X_train = []
y_train = []
for i in range(60, 1482):
    X_train.append(training_set_scaled[i-60:i, 0])
    y_train.append(training_set_scaled[i, 0]) 
X_train, y_train = np.array(X_train), np.array(y_train)
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
regressor = Sequential()
regressor.add(LSTM(units = 50, return_sequences = True, input_shape = (X_train.shape[1], 1)))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50, return_sequences = True))
regressor.add(Dropout(0.2))
regressor.add(LSTM(units = 50))
regressor.add(Dropout(0.2))
regressor.add(Dense(units = 1))
regressor.compile(optimizer = 'adam', loss = 'mean_squared_error')
regressor.fit(X_train, y_train, epochs = 15, batch_size = 32)

In [None]:
train_set = data2.iloc[:, 1:2].values

In [None]:
train_set

In [None]:
data2

In [None]:
start = dt.datetime(2013,1,1)
end = dt.datetime(2018,12,31)
stk_data = gh(symbol='SBIN',start=start,end=end)
stk_data['Date'] = stk_data.index
data2 = pd.DataFrame(columns = ['Date', 'Open', 'High', 'Low', 'Close'])
data2['Date'] = stk_data['Date']
data2['Open'] = stk_data['Open']
data2['High'] = stk_data['High']
data2['Low'] = stk_data['Low']
data2['Close'] = stk_data['Close']

In [None]:
plt.figure(figsize=(14,14))
plt.plot(stk_data['Close'])
plt.title('Historical Stock Value')
plt.xlabel('Date')
plt.ylabel('Stock Price')
plt.show()

In [None]:
stk_data

## Convolutional Neural Networks (CNN)