### The goal of this script is to find the best combination of parameters to increase the accuracy of the model
#### To do so, we compare different sets of input features passed to the multinomial naive Bayes classifier from sklearn
#### https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [2]:
#Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score

In [3]:
# Load the dataset from "cleaned_data.csv" (path is relative to the working directory)
data = pd.read_csv("cleaned_data.csv")

In [4]:
# Final cleanup: keep the rows from position 33 up to (but excluding) the last two,
# then discard the "Unnamed: 0" column.
# NOTE(review): presumably the first 33 rows lack warmed-up indicator values and
# "Unnamed: 0" is a leftover saved index -- confirm against the cleaning script.
data = data.iloc[33:-2].drop(columns="Unnamed: 0")
data

Unnamed: 0,Date,Close,Var. (%),Open,Low,High,Volume,Support,Resistance,Hammer,WilliamR,Williams%R,12EMA,26 EMA,MACD LINE,SIGNAL LINE,HISTOGRAM,ZeroCross,SignalCross,Decision
33,2016-03-15,6.61,-10.68,6.61,6.56,6.96,698485643,0,0,0,-0.470000,0,6.848171,6.000103,0.848068,0.782849,0.065219,1.0,1.0,1
34,2016-03-16,7.23,9.38,6.51,6.41,7.23,584312229,1,0,0,-0.290000,0,6.906914,6.091207,0.815708,0.789421,0.026287,1.0,1.0,0
35,2016-03-17,8.10,12.03,8.15,7.71,8.19,943356446,0,0,0,-0.040000,1,7.090466,6.240006,0.850460,0.801629,0.048831,1.0,1.0,2
36,2016-03-18,8.12,0.25,8.09,7.72,8.17,608847049,0,0,0,-0.030000,1,7.248856,6.379265,0.869591,0.815221,0.054370,1.0,1.0,0
37,2016-03-21,8.06,-0.74,8.08,7.93,8.27,526247185,0,0,0,-0.070000,1,7.373647,6.503764,0.869883,0.826154,0.043730,1.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035,2020-04-01,14.30,2.22%,13.45,13.32,14.55,"113,48M",0,0,0,-0.241758,0,14.235086,17.109340,-2.874254,-3.433078,0.558825,0.0,1.0,0
1036,2020-04-02,15.51,8.46%,15.40,15.01,16.55,"233,81M",0,1,0,-0.182456,1,14.431227,16.990870,-2.559643,-3.258391,0.698748,0.0,1.0,0
1037,2020-04-03,15.34,-1.10%,16.30,14.93,16.36,"142,01M",0,0,0,-0.212281,0,14.571038,16.868583,-2.297545,-3.066222,0.768677,0.0,1.0,0
1038,2020-04-06,15.77,2.80%,16.05,15.18,16.10,"122,54M",0,0,0,-0.136842,1,14.755494,16.787207,-2.031713,-2.859320,0.827607,0.0,1.0,0


In [5]:
class Tester:
    """Score a model on every fixed-size combination of candidate input columns.

    Attributes:
        data: DataFrame holding both the input columns and the output column.
        inputs: list of candidate input column names.
        output: name of the target column.
        efficiency: dict mapping each tested combination (a tuple of column
            names) to ``[accuracy, balanced_accuracy]``; filled by ``test``.
        model: estimator exposing ``fit``, ``predict`` and ``score``.
    """

    def __init__(self, data, inputs, output, model):
        """
        data = data used
        inputs = inputs we want to test
        output = desired output
        model = model to be used
        """
        self.data = data
        self.inputs = inputs
        self.output = output
        self.efficiency = {}
        self.model = model

    def split_data(self, data, ratio, shuffle=True, random_state=40):
        """Split ``data`` into train/test parts, treating the LAST column as the target.

        data = data to be split (target must be the last column)
        ratio = share of the rows assigned to the test set (passed as ``test_size``)
        shuffle = indicator on whether or not the data should be shuffled
        random_state = seed that makes the shuffling reproducible

        Returns the list ``[X_train, Y_train, X_test, Y_test]``.
        """
        train, test = train_test_split(
            data, test_size=ratio, random_state=random_state, shuffle=shuffle
        )
        return [
            train.iloc[:, :-1],
            train.iloc[:, -1],
            test.iloc[:, :-1],
            test.iloc[:, -1],
        ]

    def partition_inputs(self, num):
        """Return every combination of ``num`` input names as a list of tuples.

        num = number of elements per combination
        """
        return list(itertools.combinations(self.inputs, num))

    def test(self, ratio, num=2, shuffle=True, random_state=40):
        """Calculate the efficiency of the model for every combination of ``num`` inputs.

        ratio = share of the rows assigned to the test set
        num = number of elements per combination
        shuffle = indicator on whether or not the data should be shuffled
        random_state = seed that makes the shuffling reproducible

        Returns ``self.efficiency``: a dict keyed by the tuple of input names,
        whose values are ``[accuracy, balanced_accuracy]`` on the test split.
        """
        # Use the frame given to the constructor (not a module-level global) and
        # put the target last, as split_data expects.
        frame = self.data[list(self.inputs) + [self.output]]
        x_train, y_train, x_test, y_test = self.split_data(
            frame, ratio, shuffle, random_state
        )
        y_train = y_train.values
        y_test = y_test.values

        self.efficiency = {}
        # Every combination is scored on the same split so results are comparable.
        for combination in self.partition_inputs(num):
            columns = list(combination)
            # Plain arrays on both sides: fitting and scoring see the same input type.
            train_features = x_train[columns].values
            test_features = x_test[columns].values
            self.model.fit(train_features, y_train)
            predictions = self.model.predict(test_features)
            self.efficiency[combination] = [
                self.model.score(test_features, y_test),
                balanced_accuracy_score(y_test, predictions),
            ]

        return self.efficiency
    
        

In [6]:
# Score the candidate indicator columns against the "Decision" label
# using a multinomial naive Bayes model.
mnb = MultinomialNB()
tester = Tester(
    data,
    inputs=[
        "Support",
        "Resistance",
        "Hammer",
        "Williams%R",
        "ZeroCross",
        "SignalCross",
    ],
    output="Decision",
    model=mnb,
)

In [7]:
# Baseline: score each input column on its own, with 20% of the rows held out for testing.
tester.test(0.2,num=1)

{('Support',): [0.6534653465346535, 0.3333333333333333],
 ('Resistance',): [0.6534653465346535, 0.3333333333333333],
 ('Hammer',): [0.6534653465346535, 0.3333333333333333],
 ('Williams%R',): [0.6534653465346535, 0.3333333333333333],
 ('ZeroCross',): [0.6534653465346535, 0.3333333333333333],
 ('SignalCross',): [0.6534653465346535, 0.3333333333333333]}

In [8]:
# Evaluate every combination of 1 to 5 input columns with a 20% test share.
# NOTE(review): range(1,6) stops before the full six-column set, so that
# combination is never scored -- confirm this is intended.
for i in range(1,6):
    print(tester.test(0.2,num=i))

{('Support',): [0.6534653465346535, 0.3333333333333333], ('Resistance',): [0.6534653465346535, 0.3333333333333333], ('Hammer',): [0.6534653465346535, 0.3333333333333333], ('Williams%R',): [0.6534653465346535, 0.3333333333333333], ('ZeroCross',): [0.6534653465346535, 0.3333333333333333], ('SignalCross',): [0.6534653465346535, 0.3333333333333333]}
{('Support', 'Resistance'): [0.6534653465346535, 0.3333333333333333], ('Support', 'Hammer'): [0.6485148514851485, 0.3380867498514557], ('Support', 'Williams%R'): [0.6534653465346535, 0.3333333333333333], ('Support', 'ZeroCross'): [0.6534653465346535, 0.3333333333333333], ('Support', 'SignalCross'): [0.6534653465346535, 0.3333333333333333], ('Resistance', 'Hammer'): [0.6534653465346535, 0.3333333333333333], ('Resistance', 'Williams%R'): [0.6534653465346535, 0.3333333333333333], ('Resistance', 'ZeroCross'): [0.6534653465346535, 0.3333333333333333], ('Resistance', 'SignalCross'): [0.6534653465346535, 0.3333333333333333], ('Hammer', 'Williams%R'): 

### Scrap work

In [17]:
# Row labels grouped by the value of "Decision" (1 -> buys, 0 -> holds, 2 -> sells).
buys = data.index[data["Decision"] == 1]
holds = data.index[data["Decision"] == 0]
sells = data.index[data["Decision"] == 2]

In [22]:
# Share of rows in each class, printed one per line in the order buys, holds, sells.
print(
    len(buys) / len(data),
    len(holds) / len(data),
    len(sells) / len(data),
    sep="\n",
)

0.1727904667328699
0.6554121151936445
0.1717974180734856


In [68]:
# Training split, built per class in row order: the first 9/10 of the buy rows,
# the first 9/10 of the sell rows, and only the first 9/15 of the hold rows.
# NOTE(review): the smaller hold share presumably tempers the majority class -- confirm.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
train = pd.concat(
    [
        data.loc[buys].iloc[: int(9 * len(buys) / 10)],
        data.loc[sells].iloc[: int(9 * len(sells) / 10)],
        data.loc[holds].iloc[: int(9 * len(holds) / 15)],
    ]
)
train["Decision"].value_counts()

0    396
1    156
2    155
Name: Decision, dtype: int64

In [67]:
# Test split: whatever remains of each class after the training cut-off
# (the last 1/10 of the buy and sell rows, the last 6/15 of the hold rows).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test = pd.concat(
    [
        data.loc[buys].iloc[int(9 * len(buys) / 10):],
        data.loc[sells].iloc[int(9 * len(sells) / 10):],
        data.loc[holds].iloc[int(9 * len(holds) / 15):],
    ]
)
test["Decision"].value_counts()

0    264
1     18
2     18
Name: Decision, dtype: int64

In [75]:
# Keep only the six input columns followed by the "Decision" label, in both splits.
train, test = (
    split[['Support', 'Resistance', 'Hammer', 'Williams%R', 'ZeroCross', 'SignalCross', 'Decision']]
    for split in (train, test)
)

In [76]:
# Fit a fresh multinomial naive Bayes model: every column except "Decision"
# is an input, "Decision" is the target.
clf = MultinomialNB()
clf.fit(train.drop(columns="Decision"), train["Decision"])

MultinomialNB()

In [77]:
# Mean accuracy on the test split. Columns are chosen by name rather than position,
# so re-running this cell after a "Prediction" column has been added to `test`
# still scores the input columns against "Decision".
clf.score(test.drop(columns=["Decision", "Prediction"], errors="ignore"), test["Decision"])

0.8733333333333333

In [79]:
# Predict a label for every test row. Label columns are dropped by name, so a
# "Prediction" column already present in `test` is never fed back as an input.
pred = clf.predict(test.drop(columns=["Decision", "Prediction"], errors="ignore"))

In [82]:
# Balanced accuracy (mean per-class recall) of the predictions. The true labels
# are read by column name, because the last column of `test` stops being
# "Decision" once a "Prediction" column is added.
print(balanced_accuracy_score(test["Decision"], pred))

0.348063973063973


In [85]:
# Store the predictions next to the true labels for side-by-side inspection.
# NOTE(review): after this line the last column of `test` is "Prediction", so
# positional slices such as test.iloc[:, -1] no longer refer to "Decision".
test["Prediction"] = pred

In [86]:
# Display the test rows together with their predicted labels.
test

Unnamed: 0,Support,Resistance,Hammer,Williams%R,ZeroCross,SignalCross,Decision,Prediction
940,0,0,0,0,1.0,1.0,1,0
945,0,0,0,0,1.0,0.0,1,0
950,0,0,0,0,1.0,0.0,1,0
954,0,0,0,2,1.0,0.0,1,0
964,1,0,0,0,1.0,0.0,1,0
...,...,...,...,...,...,...,...,...
1035,0,0,0,0,0.0,1.0,0,0
1036,0,1,0,1,0.0,1.0,0,0
1037,0,0,0,0,0.0,1.0,0,0
1038,0,0,0,1,0.0,1.0,0,0
