### The goal of this script is to find the best combination of input parameters to increase the accuracy of the model
#### To do so, we will compare different sets of inputs passed to the multinomial naive Bayes classifier from sklearn
#### https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [1]:
#Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Load cleaned_data.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv("cleaned_data.csv")

In [4]:
# Last cleanup: drop the first 33 rows and the last 2 rows (positional slice).
# NOTE(review): the reason for these exact cut-offs is not shown in this
# notebook; re-running this cell trims the frame again.
data = data.iloc[33:-2]
data

Unnamed: 0,Date,Close,Var. (%),Open,Low,High,Volume,Support,Resistance,Hammer,WilliamR,Williams%R,12EMA,26 EMA,MACD LINE,SIGNAL LINE,HISTOGRAM,ZeroCross,SignalCross,Decision
66,2016-05-03,9.78,-3.83,10.01,9.71,10.09,458544376,0,0,0.0,-0.460000,0,9.715492,9.068431,0.647062,0.665274,-0.018213,1.0,0.0,1
67,2016-05-04,9.92,1.43,9.85,9.85,10.15,428761814,0,0,0.0,-0.380000,0,9.746955,9.131510,0.615445,0.655309,-0.039863,1.0,0.0,1
68,2016-05-05,9.81,-1.11,10.28,9.60,10.30,647812356,0,1,0.0,-0.440000,0,9.756654,9.181768,0.574886,0.639224,-0.064338,1.0,0.0,0
69,2016-05-06,10.08,2.75,9.63,9.58,10.13,448185357,0,0,0.0,-0.280000,0,9.806400,9.248304,0.558096,0.622998,-0.064903,1.0,0.0,1
70,2016-05-09,9.48,-5.95,10.01,8.84,10.05,888858988,1,0,0.0,-0.620000,0,9.756184,9.265467,0.490718,0.596542,-0.105825,1.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,2020-03-30,13.38,0.60%,13.15,12.83,13.74,"110,53M",0,0,0.0,-0.597134,0,14.265699,17.601614,-3.335915,-3.688280,0.352365,0.0,1.0,1
1034,2020-03-31,13.99,4.56%,13.60,13.60,14.54,"155,31M",0,0,0.0,-0.309890,0,14.223283,17.334087,-3.110803,-3.572785,0.461981,0.0,1.0,1
1035,2020-04-01,14.30,2.22%,13.45,13.32,14.55,"113,48M",0,0,0.0,-0.241758,0,14.235086,17.109340,-2.874254,-3.433078,0.558825,0.0,1.0,1
1036,2020-04-02,15.51,8.46%,15.40,15.01,16.55,"233,81M",0,1,0.0,-0.182456,1,14.431227,16.990870,-2.559643,-3.258391,0.698748,0.0,1.0,1


In [5]:
class Tester:
    """Score a model on every fixed-size combination of candidate input columns.

    The model is re-fitted once per combination, always on the same
    train/test split, so the scores of different combinations are comparable.
    """

    def __init__(self,data, inputs,output,model):
        """
        data = data used (must support column selection by a list of names)
        inputs = names of the input columns we want to test
        output = name of the desired output column
        efficiency = dictionary grouping the efficiency of each set of inputs
        model = model to be used (needs fit, score and predict)
        """
        self.data = data
        self.inputs = inputs
        self.output = output
        self.efficiency = {}
        self.model = model

    def split_data(self,data,ratio,shuffle=True,random_state= 40):
        """Split data into training and testing parts.

        The LAST column of data is treated as the target; every other column
        is treated as an input.

        data = data to be split
        ratio = passed to train_test_split as test_size
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns the list [X_train, Y_train, X_test, Y_test].
        """
        train, test = train_test_split(
            data, test_size=ratio, random_state=random_state, shuffle=shuffle
        )
        X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]
        X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]
        return [X_train, Y_train, X_test, Y_test]

    def partition_inputs(self, num):
        """Return every combination of num elements of self.inputs.

        num = number of elements per combination

        The result is a list of tuples, in the order produced by
        itertools.combinations; it is empty when num exceeds the number of
        inputs.
        """
        return list(itertools.combinations(self.inputs, num))

    def test(self,ratio, num=2,shuffle=True,random_state=40):
        """Calculate the efficiency of the model for all combinations of the inputs.

        ratio = passed to train_test_split as test_size
        num = number of input columns per combination
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns (and stores in self.efficiency) a new dictionary mapping each
        combination (a tuple of column names) to
        [model.score on the test part, balanced accuracy on the test part].
        """
        # Use the frame this instance was built with, not a notebook global.
        # The output column goes last because split_data treats the last
        # column as the target.
        df = self.data[list(self.inputs) + [self.output]]
        X_train, Y_train, X_test, Y_test = self.split_data(
            df, ratio, shuffle, random_state
        )
        y_train = Y_train.values
        y_test = Y_test.values

        # Compute the combinations once; a fresh dict is created on every call
        # so results returned by earlier calls are not overwritten.
        combinations = self.partition_inputs(num)
        self.efficiency = dict.fromkeys(combinations, 0)

        for combination in combinations:
            columns = list(combination)
            # Fit and evaluate on plain arrays alike, so the model never sees
            # column names at predict time that it did not see at fit time.
            x_train = X_train[columns].values
            x_test = X_test[columns].values
            self.model.fit(x_train, y_train)
            self.efficiency[combination] = [
                self.model.score(x_test, y_test),
                balanced_accuracy_score(y_test, self.model.predict(x_test)),
            ]

        return self.efficiency
    
        

In [6]:
# Multinomial naive Bayes with its default settings.
# NOTE(review): an earlier comment said the prior was changed, but no prior
# argument is passed here.
mnb = MultinomialNB()
tester = Tester(
    data=data,
    inputs=["Support", "Resistance", "Hammer", "Williams%R", "ZeroCross", "SignalCross"],
    output="Decision",
    model=mnb,
)

In [7]:
# Score each input column on its own (test_size 0.2, default shuffled split).
tester.test(ratio=0.2, num=1)

{('Support',): [0.5794871794871795, 0.5],
 ('Resistance',): [0.5794871794871795, 0.5],
 ('Hammer',): [0.5794871794871795, 0.5],
 ('Williams%R',): [0.5794871794871795, 0.5],
 ('ZeroCross',): [0.5794871794871795, 0.5],
 ('SignalCross',): [0.5794871794871795, 0.5]}

In [8]:
# One result dict per combination size (1 to 6 input columns), using an
# unshuffled split with test_size 0.2.
maxx = [tester.test(0.2, num=size, shuffle=False) for size in range(1, 7)]

### Finding max 

In [11]:
# Display the collected results: maxx[k] holds the scores for combinations of k + 1 columns.
maxx

[{('Support',): [0.5076923076923077, 0.5],
  ('Resistance',): [0.5076923076923077, 0.5],
  ('Hammer',): [0.5076923076923077, 0.5],
  ('Williams%R',): [0.5076923076923077, 0.5],
  ('ZeroCross',): [0.5076923076923077, 0.5],
  ('SignalCross',): [0.5076923076923077, 0.5]},
 {('Support', 'Resistance'): [0.5538461538461539, 0.5475063131313131],
  ('Support', 'Hammer'): [0.5230769230769231, 0.515625],
  ('Support', 'Williams%R'): [0.5128205128205128, 0.5080492424242424],
  ('Support', 'ZeroCross'): [0.5076923076923077, 0.5],
  ('Support', 'SignalCross'): [0.5076923076923077, 0.5],
  ('Resistance', 'Hammer'): [0.5076923076923077, 0.5],
  ('Resistance', 'Williams%R'): [0.5538461538461539, 0.5475063131313131],
  ('Resistance', 'ZeroCross'): [0.5538461538461539, 0.5475063131313131],
  ('Resistance', 'SignalCross'): [0.5538461538461539, 0.5475063131313131],
  ('Hammer', 'Williams%R'): [0.5230769230769231, 0.515625],
  ('Hammer', 'ZeroCross'): [0.5230769230769231, 0.515625],
  ('Hammer', 'SignalCro

In [10]:
# Print hand-picked entries: (index into maxx, position of the combination
# within that result dict).
for size_index, combination_index in ((1, 6), (2, 10), (3, 11)):
    scores = list(maxx[size_index].values())
    print(scores[combination_index])

[0.5538461538461539, 0.5475063131313131]
[0.5692307692307692, 0.5631313131313131]
[0.5692307692307692, 0.5631313131313131]
