### The goal of this script is to find the best combination of parameters to increase the accuracy of the model
#### To do so, we will compare different sets of inputs passed to the multinomial naive Bayes classifier from sklearn
#### https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB

In [1]:
#Import relevant libraries
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
import itertools
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.metrics import balanced_accuracy_score

In [2]:
# Load the cleaned dataset from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [3]:
# Final cleanup: keep only the rows whose 'Decision5' value differs from -1
# (presumably -1 marks rows without a usable label -- confirm against the cleaning step)
data = data[data['Decision5'].ne(-1)]
data

Unnamed: 0,Date,Close,Var. (%),Open,Low,High,Volume,Support,Resistance,Hammer,...,MACD LINE,SIGNAL LINE,HISTOGRAM,ZeroCross,SignalCross,Decision14,Decision28,Decision10,Decision50,Decision5
33,2016-03-15,6.61,-10.68,6.61,6.56,6.96,698485643,0,0,0.0,...,0.848068,0.782849,0.065219,1.0,1.0,1,1,1,1,1
34,2016-03-16,7.23,9.38,6.51,6.41,7.23,584312229,1,0,0.0,...,0.815708,0.789421,0.026287,1.0,1.0,1,1,1,1,1
35,2016-03-17,8.10,12.03,8.15,7.71,8.19,943356446,0,0,0.0,...,0.850460,0.801629,0.048831,1.0,1.0,0,1,1,1,0
36,2016-03-18,8.12,0.25,8.09,7.72,8.17,608847049,0,0,0.0,...,0.869591,0.815221,0.054370,1.0,1.0,0,1,1,1,0
37,2016-03-21,8.06,-0.74,8.08,7.93,8.27,526247185,0,0,0.0,...,0.869883,0.826154,0.043730,1.0,1.0,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,2020-03-27,13.30,-7.57%,13.30,13.25,13.69,"100,33M",0,0,0.0,...,-3.512608,-3.776371,0.263763,0.0,1.0,-1,-1,-1,-1,1
1033,2020-03-30,13.38,0.60%,13.15,12.83,13.74,"110,53M",0,0,0.0,...,-3.335915,-3.688280,0.352365,0.0,1.0,-1,-1,-1,-1,1
1034,2020-03-31,13.99,4.56%,13.60,13.60,14.54,"155,31M",0,0,0.0,...,-3.110803,-3.572785,0.461981,0.0,1.0,-1,-1,-1,-1,1
1035,2020-04-01,14.30,2.22%,13.45,13.32,14.55,"113,48M",0,0,0.0,...,-2.874254,-3.433078,0.558825,0.0,1.0,-1,-1,-1,-1,1


In [4]:
class Tester:
    """Score a model on every fixed-size combination of candidate input columns."""

    def __init__(self, data, inputs, output, model):
        """
        data = data used (must contain every input column and the output column)
        inputs = names of the input columns we want to test
        output = name of the desired output column
        efficiency = dictionary grouping the efficiency of each set of inputs
        model = model to be used (needs fit, predict and score methods)
        """
        self.data = data
        self.inputs = inputs
        self.output = output
        self.efficiency = {}
        self.model = model

    def split_data(self, data, ratio, shuffle=True, random_state=40):
        """Split the data into training and testing features and targets.

        The last column of ``data`` is treated as the target; every other
        column is treated as a feature.

        data = data to be split
        ratio = cutting point between training data and testing data
                (passed to train_test_split as test_size)
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns the list [X_train, Y_train, X_test, Y_test].
        """
        train, test = train_test_split(
            data, test_size=ratio, random_state=random_state, shuffle=shuffle
        )
        X_train, Y_train = train.iloc[:, :-1], train.iloc[:, -1]
        X_test, Y_test = test.iloc[:, :-1], test.iloc[:, -1]
        return [X_train, Y_train, X_test, Y_test]

    def partition_inputs(self, num):
        """Return every combination of ``num`` inputs as a list of tuples.

        num = number of elements per partition
        """
        return list(itertools.combinations(self.inputs, num))

    def test(self, ratio, num=2, shuffle=True, random_state=40):
        """Calculate the efficiency of the model on every partition of the inputs.

        ratio = cutting point between training data and testing data
        num = number of elements per partition
        shuffle = indicator on whether or not the data should be shuffled
        random_state = state that determines the shuffling algorithm

        Returns (and stores in self.efficiency) a dictionary that maps each
        tuple of input names to [model.score, balanced accuracy], both measured
        on the testing data. The model is re-fitted for every partition.
        """
        # Read from the frame given to the constructor rather than from a global
        # variable that merely happens to be called `data`. The output column is
        # placed last because split_data treats the last column as the target.
        df = self.data[list(self.inputs) + [self.output]]
        x_train_all, y_train, x_test_all, y_test = self.split_data(
            df, ratio, shuffle, random_state
        )
        y_train = y_train.values
        y_test = y_test.values

        # The partitions are computed once and a fresh dictionary is built on
        # every call, so results returned by earlier calls are left untouched.
        self.efficiency = {}
        for key in self.partition_inputs(num):
            columns = list(key)
            # Plain arrays are used for both fitting and scoring so the model
            # always receives the same kind of input.
            x_train = x_train_all[columns].values
            x_test = x_test_all[columns].values
            self.model.fit(x_train, y_train)
            self.efficiency[key] = [
                self.model.score(x_test, y_test),
                balanced_accuracy_score(y_test, self.model.predict(x_test)),
            ]

        return self.efficiency
    
        

In [5]:
# Complement Naive Bayes model, evaluated on six candidate inputs with "Decision5" as the target
cnb = ComplementNB()
tester = Tester(
    data=data,
    inputs=["Support", "Resistance", "Hammer", "Williams%R", "ZeroCross", "SignalCross"],
    output="Decision5",
    model=cnb,
)

In [6]:
# Run the tester once per combination size, from single inputs up to all of
# them together. The upper bound follows the tester's own input list instead
# of a literal 7, so it stays correct when inputs are added or removed.
# 0.294 is the share of rows held out for testing; shuffle=False splits the
# rows in their existing order.
maxx = [
    tester.test(0.294, num=size, shuffle=False)
    for size in range(1, len(tester.inputs) + 1)
]

### Finding max 

In [7]:
# Display the collected results: one dictionary per combination size, mapping
# each tuple of input names to [model.score, balanced accuracy]
maxx

[{('Support',): [0.44932432432432434, 0.5],
  ('Resistance',): [0.44932432432432434, 0.5],
  ('Hammer',): [0.44932432432432434, 0.5],
  ('Williams%R',): [0.44932432432432434, 0.5],
  ('ZeroCross',): [0.44932432432432434, 0.5],
  ('SignalCross',): [0.44932432432432434, 0.5]},
 {('Support', 'Resistance'): [0.4864864864864865, 0.5323585036210158],
  ('Support', 'Hammer'): [0.4864864864864865, 0.5323585036210158],
  ('Support', 'Williams%R'): [0.4864864864864865, 0.5323585036210158],
  ('Support', 'ZeroCross'): [0.4864864864864865, 0.5323585036210158],
  ('Support', 'SignalCross'): [0.4864864864864865, 0.5323585036210158],
  ('Resistance', 'Hammer'): [0.4222972972972973, 0.4657733290280917],
  ('Resistance', 'Williams%R'): [0.5101351351351351, 0.5220028599105124],
  ('Resistance', 'ZeroCross'): [0.5337837837837838, 0.5282531482079431],
  ('Resistance', 'SignalCross'): [0.5405405405405406, 0.5503021357073665],
  ('Hammer', 'Williams%R'): [0.5033783783783784, 0.5117164075833757],
  ('Hammer'