In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

import datetime
import yfinance as yf
import time
import stockstats
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot, init_notebook_mode
from collections import OrderedDict
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix

In [9]:
from IPython.display import Markdown, display

In [10]:
def printmd(string):
    display(Markdown(string))
printmd('**bold**')

**bold**

In [7]:
#grouping by frequency 
def FrameChanging(df, x):
    t = df.groupby(pd.Grouper(freq=x)).agg({"open": "first", 
                                             "close": "last", 
                                             "low": "min", 
                                             "high": "max",
                                             "volume": "sum"})
    return t

In [5]:
#calculate Volume-Weighted Average Price
def vwap(df):
    q = df.volume.values
    p = df.close.values
    return df.assign(vwap=(p * q).cumsum() / q.cumsum())

In [6]:
df = pd.read_csv("Data/RELIANCE__EQ__NSE__NSE__MINUTE.csv")

In [7]:
#timestamp converted to datetime format
#datetime set as index of dataset
df['DateIndex'] = pd.to_datetime(df['timestamp'])
df = df.set_index('DateIndex')
df = df.drop(['timestamp'], axis=1)

In [8]:
df_1D = FrameChanging(df, '1D')

In [12]:
#finding lag k autocorrelation values where 0<=k<=10
#calculating k-moving average where k = 50,100,200
#calculating number of green and red candles 
#calculating pivot point and first, second and third degrees of support and resistance
#converting days of the week to categories
def PertimePreprocess(df):
    
    df1 = df.copy()
    df1['L1'] = df1['close'].shift(1)
    df1['RateofReturn'] = (df1['close']/df1['L1'])  -1
    df1['Return'] = (df1['close']-df1['L1'])
    
    df1 = df1.dropna()
    
    df1['MA100'] = df.rolling(window=100,min_periods=1 ).mean()['close']
    df1['MA200'] = df.rolling(window=200,min_periods=1 ).mean()['close']
    df1['MA50'] = df.rolling(window=50,min_periods=1 ).mean()['close']
    
    print("df1:shape1", df1.shape)
    
    df1['L2'] = df1['close'].shift(2)
    df1['L3'] = df1['close'].shift(3)
    df1['L4'] = df1['close'].shift(4)
    df1['L5'] = df1['close'].shift(5)
    df1['L6'] = df1['close'].shift(6)
    df1['L7'] = df1['close'].shift(7)
    df1['L8'] = df1['close'].shift(8)
    df1['L9'] = df1['close'].shift(9)
    df1['L10'] = df1['close'].shift(10)
    
    df1.loc[df1['Return']>=0, 'Result'] = '1'
    
    df1.loc[df1['Return']<0, 'Result'] = '0'
    
    df1['P_Result'] = df1['Result'].shift(-1)
    
    df1['P_Result']  = df1['P_Result'].astype('category')
    
#     print("df1:shape2", df1.shape)
    df1 = df1.dropna()
    df1 = df1.drop('Result', 1)
    
    
#     print("df1:shape4", df1.shape)
    
    pivot_point = (df1['high'] + df1['low'] + df1['close'])/3
    R1 = pd.Series(2 * pivot_point - df1['low'])
    S1 = pd.Series(2 * pivot_point - df1['high'])  
    R2 = pd.Series(pivot_point + df1['high'] - df1['low'])  
    S2 = pd.Series(pivot_point - df1['high'] + df1['low'])  
    R3 = pd.Series(df1['high'] + 2 * (pivot_point - df1['low']))  
    S3 = pd.Series(df1['low'] - 2 * (df1['high'] - pivot_point))  
    
#     print("df1:shape5", df1.shape)

    df1['PP'] = pivot_point
    df1['R1'] = R1
    df1['S1'] = S1
    df1['R2'] = R2
    df1['S2'] = S2
    df1['R3'] = R3
    df1['S3'] = S3
    
#     print("df1:shape6", df1.shape)
    
    df1['Day_of_week'] = df1.index.dayofweek
    df1['Day_of_week']  = df1['Day_of_week'].values.astype(str) 
    df1['Day_of_week']  = df1['Day_of_week'].astype('category')
    
#     print("df1:shape", df1.shape)
    
    return df1
   
    

In [10]:
df1 =  PertimePreprocess(df)

df1:shape1 (321214, 11)



In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



In [11]:
df1

Unnamed: 0_level_0,open,high,low,close,volume,L1,RateofReturn,Return,MA100,MA200,...,L10,P_Result,PP,R1,S1,R2,S2,R3,S3,Day_of_week
DateIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-02 09:26:00+05:30,539.58,539.58,539.10,539.38,4538.0,539.50,-0.000222,-0.12,539.677500,539.677500,...,541.50,0,539.353333,539.606667,539.126667,539.833333,538.873333,540.086667,538.646667,0
2017-01-02 09:27:00+05:30,539.38,539.38,539.00,539.00,5036.0,539.38,-0.000705,-0.38,539.625385,539.625385,...,537.23,0,539.126667,539.253333,538.873333,539.506667,538.746667,539.633333,538.493333,0
2017-01-02 09:28:00+05:30,539.00,539.00,538.50,538.85,6842.0,539.00,-0.000278,-0.15,539.570000,539.570000,...,539.93,0,538.783333,539.066667,538.566667,539.283333,538.283333,539.566667,538.066667,0
2017-01-02 09:29:00+05:30,538.80,538.80,538.50,538.68,4688.0,538.85,-0.000315,-0.17,539.510667,539.510667,...,539.85,1,538.660000,538.820000,538.520000,538.960000,538.360000,539.120000,538.220000,0
2017-01-02 09:30:00+05:30,538.68,539.00,538.68,538.83,3924.0,538.68,0.000278,0.15,539.468125,539.468125,...,539.98,0,538.836667,538.993333,538.673333,539.156667,538.516667,539.313333,538.353333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-26 15:24:00+05:30,1748.65,1749.00,1747.60,1747.85,43576.0,1748.45,-0.000343,-0.60,1730.666500,1726.545000,...,1741.45,1,1748.150000,1748.700000,1747.300000,1749.550000,1746.750000,1750.100000,1745.900000,4
2020-06-26 15:25:00+05:30,1748.00,1749.70,1747.35,1749.15,57832.0,1747.85,0.000744,1.30,1730.941500,1726.670500,...,1741.85,1,1748.733333,1750.116667,1747.766667,1751.083333,1746.383333,1752.466667,1745.416667,4
2020-06-26 15:26:00+05:30,1749.50,1750.10,1748.70,1750.00,116771.0,1749.15,0.000486,0.85,1731.219000,1726.802000,...,1743.55,0,1749.600000,1750.500000,1749.100000,1751.000000,1748.200000,1751.900000,1747.700000,4
2020-06-26 15:27:00+05:30,1749.85,1750.60,1749.20,1749.95,65124.0,1750.00,-0.000029,-0.05,1731.489000,1726.930500,...,1743.75,1,1749.916667,1750.633333,1749.233333,1751.316667,1748.516667,1752.033333,1747.833333,4


In [12]:
train_df, test_df = train_test_split(df1, test_size=0.9, random_state=0)

In [13]:
train_label = train_df['P_Result']
test_label = test_df['P_Result']

In [14]:
train_df = train_df.drop('P_Result', 1)
test_df = test_df.drop('P_Result', 1)


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



In [15]:
train_df

Unnamed: 0_level_0,open,high,low,close,volume,L1,RateofReturn,Return,MA100,MA200,...,L9,L10,PP,R1,S1,R2,S2,R3,S3,Day_of_week
DateIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-07-16 12:32:00+05:30,1079.60,1080.50,1079.60,1080.30,27510.0,1079.60,0.000648,0.70,1079.3655,1082.27650,...,1078.45,1078.10,1080.133333,1080.666667,1079.766667,1081.033333,1079.233333,1081.566667,1078.866667,0
2017-04-27 12:52:00+05:30,703.33,703.80,703.25,703.68,6330.0,703.38,0.000427,0.30,703.0107,703.44425,...,704.25,704.05,703.576667,703.903333,703.353333,704.126667,703.026667,704.453333,702.803333,3
2019-04-09 10:53:00+05:30,1328.00,1328.30,1327.50,1327.95,10912.0,1328.40,-0.000339,-0.45,1333.0110,1330.82825,...,1328.65,1329.70,1327.916667,1328.333333,1327.533333,1328.716667,1327.116667,1329.133333,1326.733333,1
2018-12-12 11:59:00+05:30,1098.00,1098.00,1097.90,1098.00,5588.0,1098.00,0.000000,0.00,1098.7065,1097.52750,...,1096.90,1097.60,1097.966667,1098.033333,1097.933333,1098.066667,1097.866667,1098.133333,1097.833333,2
2019-07-12 12:18:00+05:30,1292.35,1292.35,1291.65,1291.90,7511.0,1292.25,-0.000271,-0.35,1294.9925,1293.43125,...,1292.65,1292.95,1291.966667,1292.283333,1291.583333,1292.666667,1291.266667,1292.983333,1290.883333,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-04-30 10:55:00+05:30,967.50,967.50,966.50,967.15,14894.0,967.50,-0.000362,-0.35,971.0690,985.17975,...,967.45,967.75,967.050000,967.600000,966.600000,968.050000,966.050000,968.600000,965.600000,0
2020-04-22 12:19:00+05:30,1334.20,1337.00,1333.50,1337.00,188940.0,1334.25,0.002061,2.75,1328.8710,1316.00775,...,1331.30,1331.95,1335.833333,1338.166667,1334.666667,1339.333333,1332.333333,1341.666667,1331.166667,2
2018-08-21 12:46:00+05:30,1249.60,1250.00,1249.60,1249.95,9505.0,1249.90,0.000040,0.05,1247.2360,1242.50950,...,1249.05,1249.80,1249.850000,1250.100000,1249.700000,1250.250000,1249.450000,1250.500000,1249.300000,1
2018-04-11 15:03:00+05:30,929.85,930.05,929.75,929.95,36588.0,929.85,0.000108,0.10,930.0720,929.61350,...,930.05,930.00,929.916667,930.083333,929.783333,930.216667,929.616667,930.383333,929.483333,2


In [16]:
train_df = train_df.drop('volume', 1)
test_df = test_df.drop('volume', 1)
list(train_df)


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



['open',
 'high',
 'low',
 'close',
 'L1',
 'RateofReturn',
 'Return',
 'MA100',
 'MA200',
 'MA50',
 'L2',
 'L3',
 'L4',
 'L5',
 'L6',
 'L7',
 'L8',
 'L9',
 'L10',
 'PP',
 'R1',
 'S1',
 'R2',
 'S2',
 'R3',
 'S3',
 'Day_of_week']

In [17]:
#fit to the data you provide, returning a "best fit" hyperplane that divides, or categorizes, your data.
classifier = SVC(kernel ='linear')
classifier.fit(train_df, train_label)

SVC(kernel='linear')

In [18]:
#using model to predict labels in test data
y_predict = classifier.predict(test_df)

In [14]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [20]:
#finding metrics accuracy, precision, recall, f1-score for the model
print(accuracy_score(test_label, y_predict))
print(precision_score(test_label, y_predict, pos_label="0"))
print(recall_score(test_label, y_predict, pos_label="0"))  # fraction of positives events that we predicted correctly
print(f1_score(test_label, y_predict, pos_label="0"))

0.5332240221666442
0.528683711265697
0.12829394484149603
0.20648166116825153


In [21]:
list(df1)

['open',
 'high',
 'low',
 'close',
 'volume',
 'L1',
 'RateofReturn',
 'Return',
 'MA100',
 'MA200',
 'MA50',
 'L2',
 'L3',
 'L4',
 'L5',
 'L6',
 'L7',
 'L8',
 'L9',
 'L10',
 'P_Result',
 'PP',
 'R1',
 'S1',
 'R2',
 'S2',
 'R3',
 'S3',
 'Day_of_week']

In [22]:
pivot_point = (df1['high'] + df1['low'] + df1['close'])/3
R1 = pd.Series(2 * pivot_point - df1['low'])
S1 = pd.Series(2 * pivot_point - df1['high'])  
R2 = pd.Series(pivot_point + df1['high'] - df1['low'])  
S2 = pd.Series(pivot_point - df1['high'] + df1['low'])  
R3 = pd.Series(df1['high'] + 2 * (pivot_point - df1['low']))  
S3 = pd.Series(df1['low'] - 2 * (df1['high'] - pivot_point))  

df1['PP'] = pivot_point
df1['R1'] = R1
df1['S1'] = S1
df1['R2'] = R2
df1['S2'] = S2
df1['R3'] = R3
df1['S3'] = S3

In [23]:
df1['Day_of_week']  = df1['Day_of_week'].astype('category')

In [24]:
type(df1['Day_of_week'])

pandas.core.series.Series

In [25]:
df1

Unnamed: 0_level_0,open,high,low,close,volume,L1,RateofReturn,Return,MA100,MA200,...,L10,P_Result,PP,R1,S1,R2,S2,R3,S3,Day_of_week
DateIndex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-02 09:26:00+05:30,539.58,539.58,539.10,539.38,4538.0,539.50,-0.000222,-0.12,539.677500,539.677500,...,541.50,0,539.353333,539.606667,539.126667,539.833333,538.873333,540.086667,538.646667,0
2017-01-02 09:27:00+05:30,539.38,539.38,539.00,539.00,5036.0,539.38,-0.000705,-0.38,539.625385,539.625385,...,537.23,0,539.126667,539.253333,538.873333,539.506667,538.746667,539.633333,538.493333,0
2017-01-02 09:28:00+05:30,539.00,539.00,538.50,538.85,6842.0,539.00,-0.000278,-0.15,539.570000,539.570000,...,539.93,0,538.783333,539.066667,538.566667,539.283333,538.283333,539.566667,538.066667,0
2017-01-02 09:29:00+05:30,538.80,538.80,538.50,538.68,4688.0,538.85,-0.000315,-0.17,539.510667,539.510667,...,539.85,1,538.660000,538.820000,538.520000,538.960000,538.360000,539.120000,538.220000,0
2017-01-02 09:30:00+05:30,538.68,539.00,538.68,538.83,3924.0,538.68,0.000278,0.15,539.468125,539.468125,...,539.98,0,538.836667,538.993333,538.673333,539.156667,538.516667,539.313333,538.353333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-26 15:24:00+05:30,1748.65,1749.00,1747.60,1747.85,43576.0,1748.45,-0.000343,-0.60,1730.666500,1726.545000,...,1741.45,1,1748.150000,1748.700000,1747.300000,1749.550000,1746.750000,1750.100000,1745.900000,4
2020-06-26 15:25:00+05:30,1748.00,1749.70,1747.35,1749.15,57832.0,1747.85,0.000744,1.30,1730.941500,1726.670500,...,1741.85,1,1748.733333,1750.116667,1747.766667,1751.083333,1746.383333,1752.466667,1745.416667,4
2020-06-26 15:26:00+05:30,1749.50,1750.10,1748.70,1750.00,116771.0,1749.15,0.000486,0.85,1731.219000,1726.802000,...,1743.55,0,1749.600000,1750.500000,1749.100000,1751.000000,1748.200000,1751.900000,1747.700000,4
2020-06-26 15:27:00+05:30,1749.85,1750.60,1749.20,1749.95,65124.0,1750.00,-0.000029,-0.05,1731.489000,1726.930500,...,1743.75,1,1749.916667,1750.633333,1749.233333,1751.316667,1748.516667,1752.033333,1747.833333,4


In [5]:
def Analyse(Name):
    df = pd.read_csv(Name)
    df['DateIndex'] = pd.to_datetime(df['timestamp'])
    df = df.set_index('DateIndex')
    df = df.drop(['timestamp'], axis=1)
    
#     print(df.shape)

#     df_60Min = FrameChanging(df, '60Min')
#     df_60Min['Date'] = df_60Min.index
    
    df_1D = FrameChanging(df, '1D')
#     df_1D['Date'] = df_1D.index
    
#     print(df_1D.shape)
    
# #     printmd('**1Hr Data Analysis**')
# #     PerTimeData(df_60Min)
    
    printmd('**1 Day Data Analysis**')
    df_1DM = PertimePreprocess(df_1D)
    
#     print(df_1DM.shape)
    
#     print(df_1DM.head())
    
#     print(list(df_1DM))
    
#     df_1DM = df_1DM.drop('Result', 1)
    
#     print(list(df_1DM))
    
    
    
#     print(df_1DM)

    df_1DM = df_1DM.dropna()
    
    train_df1, test_df1 = train_test_split(df_1DM, test_size=0.25, random_state=0)
    train_label1 = train_df1['P_Result']
    test_label1 = test_df1['P_Result']

    train_df1 = train_df1.drop('P_Result', 1)
    test_df1 = test_df1.drop('P_Result', 1)

    train_df1 = train_df1.drop('volume', 1)
    test_df1 = test_df1.drop('volume', 1)


    classifier1 = SVC(kernel ='linear')


    classifier1.fit(train_df1, train_label1)


    y_predict1 = classifier1.predict(test_df1)

    print("Accuracy", accuracy_score(test_label1, y_predict1))
    print("Precision", precision_score(test_label1, y_predict1, pos_label="0"))
    print("Recall",recall_score(test_label1, y_predict1, pos_label="0"))  # fraction of positives events that we predicted correctly
    print("F1-score",f1_score(test_label1, y_predict1, pos_label="0"))
    

In [15]:
Analyse("Data/HDFC__EQ__NSE__NSE__MINUTE.csv")

**1 Day Data Analysis**

df1:shape1 (651, 11)



In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.



Accuracy 0.49375
Precision 0.5217391304347826
Recall 0.42857142857142855
F1-score 0.47058823529411764
