In [2]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Machine learning libraries
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix, recall_score, precision_score, classification_report

# Technical indicator library
import talib as ta

# Data import library
import yfinance as yf

#Data visualisation
import plotly.io as pio
pio.renderers.default = 'iframe'# 'iframe' # or 'notebook' or 'colab' or 'jupyterlab'
import plotly.graph_objs as go

import warnings
pd.set_option('display.max_columns',100)
warnings.filterwarnings("ignore")

### Loading Data from yahoo

In [3]:
# df = yf.download('TSLA', start="2021-03-24", end="2021-03-25", period = '1d', interval = '1m')
df = yf.download('TSLA', start='2022-03-29', end='2022-04-01', period = '1d', interval = '1m')
df = df[df.index<'2022-03-30 09:30:00-04:00']
# Drop the rows with zero volume traded
df = df.drop(df[df['Volume'] == 0].index)
df

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-03-29 09:30:00-04:00,1109.260010,1109.260010,1109.260010,1109.260010,1109.260010,2156753
2022-03-29 09:31:00-04:00,1109.260010,1113.839966,1107.969971,1112.270020,1112.270020,276621
2022-03-29 09:32:00-04:00,1112.814941,1114.770020,1110.030029,1111.949951,1111.949951,192940
2022-03-29 09:33:00-04:00,1111.763306,1113.000000,1107.239990,1108.880005,1108.880005,226432
2022-03-29 09:34:00-04:00,1108.000000,1111.079956,1107.760132,1108.400024,1108.400024,126640
...,...,...,...,...,...,...
2022-03-29 15:55:00-04:00,1095.765015,1096.800049,1094.790039,1095.500000,1095.500000,77988
2022-03-29 15:56:00-04:00,1095.469971,1096.579956,1095.050049,1096.579956,1096.579956,64633
2022-03-29 15:57:00-04:00,1096.479980,1098.380005,1096.000000,1098.284546,1098.284546,107027
2022-03-29 15:58:00-04:00,1098.319946,1098.799927,1098.000000,1098.290039,1098.290039,107709


### RSI (Relative Strength Index) is a well-known trading indicator based on momentum strategy.

In [4]:
# Create a variable n with a value of 10
n = 10
# Create a column by name, RSI and assign the calculation of RSI to it
df['RSI'] = ta.RSI(np.array(df['Close'].shift(1)), timeperiod=n)

In [6]:
from plotly.subplots import make_subplots

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Line(x=df.index, y=df.Open, name = 'Price'), row =1, col=1)
fig.add_trace(go.Line(x=df.index, y=df.RSI, name = 'RSI'), row =2, col=1)
fig.update_layout(title='Relative Strength Index and Stock Price for TESLA', yaxis_title='Stock Price (USD per Shares)')
fig.show()

### Add trending indicators to help the machine find relative Patterns
    Before starting indexing our values in percentage of change, we will add three different trading indicators to give more information about market trends and market volatility.
    The indicators that we will add are moving averages called MACD by traders, parabolic SAR to determine trend direction and potential reversals in price and the average directional index (ADX) to determine the strength of our trends.

In [7]:
# Create a column by name, SMA and assign the SMA calculation to it
df['SMA'] = df['Close'].shift(1).rolling(window=n).mean()
# Create a column by name, Corr and assign the calculation of correlation to it
df['Corr'] = df['Close'].shift(1).rolling(window=n).corr(df['SMA'].shift(1))
# Create a column by name, SAR and assign the SAR calculation to it
df['SAR'] = ta.SAR(np.array(df['High'].shift(1)), np.array(df['Low'].shift(1)), 0.2, 0.2)
# Create a column by name, ADX and assign the ADX calculation to it
df['ADX'] = ta.ADX(np.array(df['High'].shift(1)), np.array(df['Low'].shift(1)),
                   np.array(df['Open']), timeperiod=n)
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,Corr,SAR,ADX
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2022-03-29 09:30:00-04:00,1109.260010,1109.260010,1109.260010,1109.260010,1109.260010,2156753,,,,,
2022-03-29 09:31:00-04:00,1109.260010,1113.839966,1107.969971,1112.270020,1112.270020,276621,,,,,
2022-03-29 09:32:00-04:00,1112.814941,1114.770020,1110.030029,1111.949951,1111.949951,192940,,,,1113.839966,
2022-03-29 09:33:00-04:00,1111.763306,1113.000000,1107.239990,1108.880005,1108.880005,226432,,,,1107.969971,
2022-03-29 09:34:00-04:00,1108.000000,1111.079956,1107.760132,1108.400024,1108.400024,126640,,,,1114.770020,
...,...,...,...,...,...,...,...,...,...,...,...
2022-03-29 15:55:00-04:00,1095.765015,1096.800049,1094.790039,1095.500000,1095.500000,77988,30.917000,1098.523682,0.895191,1098.744009,29.587511
2022-03-29 15:56:00-04:00,1095.469971,1096.579956,1095.050049,1096.579956,1096.579956,64633,29.151724,1098.041687,0.889366,1097.853215,32.531470
2022-03-29 15:57:00-04:00,1096.479980,1098.380005,1096.000000,1098.284546,1098.284546,107027,38.698467,1097.620679,0.773317,1097.140580,35.181034
2022-03-29 15:58:00-04:00,1098.319946,1098.799927,1098.000000,1098.290039,1098.290039,107709,50.416057,1097.449121,0.439280,1094.290039,33.769663


### Adding Previous minutes values

In [8]:
# Create columns high, low and close with previous minute's OHLC data
df['Prev_High'] = df['High'].shift(1)
df['Prev_Low'] = df['Low'].shift(1)
df['Prev_Close'] = df['Close'].shift(1)

In [9]:
# Create columns 'OO' with the difference between the current minute's open and last minute's open
df['OO'] = df['Open']-df['Open'].shift(1)
# Create columns 'OC' with the difference between the current minute's open and last minute's close
df['OC'] = df['Open']-df['Prev_Close']

    Adding the percentage of change that will help us detect a bear to a bull period. 
    Indeed a percentage of change will be more beneficial for the algorithm to judge the movement. 
    And it will permit us to segregate our values between high profitable period vs range period vs loss period.

### Calculate the return
    calculate the returns (mathematically speaking, the percentage of change evocated above) for every data point(rows). We also save returns of past n minutes in n columns named as return1, return2 and so on. This will help the algorithm to understand the trend of the returns in the last n periods.

In [10]:
# Create a column 'Ret' with the calculation of returns
df['Ret'] = (df['Open'].shift(-1)-df['Open'])/df['Open']

# Create n columns and assign
for i in range(1, n):
    df['return%i' % i] = df['Ret'].shift(i)

In [11]:
# Additional Cleaning 
# Change the value of 'Corr' to -1 if it is less than -1
df.loc[df['Corr'] < -1, 'Corr'] = -1
# Change the value of 'Corr' to 1 if it is greater than 1
df.loc[df['Corr'] > 1, 'Corr'] = 1
# Drop the NaN values
df = df.dropna()
df.isna().sum().sum()

0

### Spliting Data for Training and Testing Sets 80/20(%)

In [12]:
# Create a variable split that stores 80% of the length of the dataframe
t = .8
split = int(t*len(df))
split

295

### Define output signal
    The output signal will be based on the percentage return and split into three categories:
    -Bear period: Output signal = -1
    -Range period: Output signal = 0
    -Bull period: Output signal = 1

In [13]:
# Create a column by name, 'Signal' and initialize with 0
df['Signal'] = 0
# Assign a value of 1 to 'Signal' column for the quantile with the highest returns
df.loc[df['Ret'] > df['Ret'][:split].quantile(q=0.66), 'Signal'] = 1
# Assign a value of -1 to 'Signal' column for the quantile with the lowest returns
df.loc[df['Ret'] < df['Ret'][:split].quantile(q=0.34), 'Signal'] = -1

In [14]:
df.shape

(369, 27)

In [20]:
df.to_csv('data/data_tesla_modeling_3_29.csv')

In [18]:
df

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,RSI,SMA,Corr,SAR,ADX,Prev_High,Prev_Low,Prev_Close,OO,OC,Ret,return1,return2,return3,return4,return5,return6,return7,return8,return9,Signal
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
2022-03-29 09:50:00-04:00,1088.000000,1089.689941,1087.719971,1089.339966,1089.339966,146662,25.544758,1096.660742,0.093291,1100.199951,20.764101,1088.689209,1088.430054,1088.430054,-0.689209,-0.430054,0.001379,-0.000633,-0.006897,-0.003409,0.002735,0.000000,0.000912,-0.001594,0.000684,0.000912,1
2022-03-29 09:51:00-04:00,1089.500000,1092.659302,1089.229980,1090.550049,1090.550049,126722,28.132768,1096.172742,0.486579,1097.845972,21.807815,1089.689941,1087.719971,1089.339966,1.500000,0.160034,-0.000459,0.001379,-0.000633,-0.006897,-0.003409,0.002735,0.000000,0.000912,-0.001594,0.000684,-1
2022-03-29 09:52:00-04:00,1089.000000,1089.939941,1088.510010,1089.375000,1089.375000,93541,31.643700,1095.560449,0.673921,1095.820771,21.116084,1092.659302,1089.229980,1090.550049,-0.500000,-1.550049,0.000459,-0.000459,0.001379,-0.000633,-0.006897,-0.003409,0.002735,0.000000,0.000912,-0.001594,0
2022-03-29 09:53:00-04:00,1089.500000,1091.598389,1089.500000,1091.149658,1091.149658,89891,30.059287,1094.546509,0.716484,1094.200611,20.769340,1089.939941,1088.510010,1089.375000,0.500000,0.125000,0.000459,0.000459,-0.000459,0.001379,-0.000633,-0.006897,-0.003409,0.002735,0.000000,0.000912,0
2022-03-29 09:54:00-04:00,1090.000000,1091.469971,1088.000000,1091.145020,1091.145020,93901,35.480407,1093.891479,0.719799,1092.904483,19.556190,1091.598389,1089.500000,1091.149658,0.500000,-1.149658,0.001376,0.000459,0.000459,-0.000459,0.001379,-0.000633,-0.006897,-0.003409,0.002735,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-29 15:54:00-04:00,1096.119995,1097.000000,1094.290039,1095.939941,1095.939941,86422,31.542320,1098.865686,0.794745,1099.600024,26.316444,1097.457764,1095.319946,1096.099976,-1.219971,0.020020,-0.000324,-0.001112,0.000666,-0.002277,-0.000343,-0.000300,-0.000164,-0.000527,0.000432,0.000869,0
2022-03-29 15:55:00-04:00,1095.765015,1096.800049,1094.790039,1095.500000,1095.500000,77988,30.917000,1098.523682,0.895191,1098.744009,29.587511,1097.000000,1094.290039,1095.939941,-0.354980,-0.174927,-0.000269,-0.000324,-0.001112,0.000666,-0.002277,-0.000343,-0.000300,-0.000164,-0.000527,0.000432,0
2022-03-29 15:56:00-04:00,1095.469971,1096.579956,1095.050049,1096.579956,1096.579956,64633,29.151724,1098.041687,0.889366,1097.853215,32.531470,1096.800049,1094.790039,1095.500000,-0.295044,-0.030029,0.000922,-0.000269,-0.000324,-0.001112,0.000666,-0.002277,-0.000343,-0.000300,-0.000164,-0.000527,1
2022-03-29 15:57:00-04:00,1096.479980,1098.380005,1096.000000,1098.284546,1098.284546,107027,38.698467,1097.620679,0.773317,1097.140580,35.181034,1096.579956,1095.050049,1096.579956,1.010010,-0.099976,0.001678,0.000922,-0.000269,-0.000324,-0.001112,0.000666,-0.002277,-0.000343,-0.000300,-0.000164,1
