In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pre-cleaned price/indicator table from the working directory and
# print a per-column summary (dtype and non-null count) to spot missing values.
csv_path = 'cleaned_data.csv'
df = pd.read_csv(csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1042 entries, 0 to 1041
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Date         1042 non-null   object 
 1   Close        1042 non-null   float64
 2   Var. (%)     1042 non-null   object 
 3   Open         1042 non-null   float64
 4   Low          1042 non-null   float64
 5   High         1042 non-null   float64
 6   Volume       1042 non-null   object 
 7   Support      1042 non-null   int64  
 8   Resistance   1042 non-null   int64  
 9   Hammer       1042 non-null   float64
 10  WilliamR     1042 non-null   float64
 11  Williams%R   1042 non-null   int64  
 12  12EMA        1042 non-null   float64
 13  26 EMA       1042 non-null   float64
 14  MACD LINE    1042 non-null   float64
 15  SIGNAL LINE  1042 non-null   float64
 16  HISTOGRAM    1042 non-null   float64
 17  ZeroCross    1009 non-null   float64
 18  SignalCross  1009 non-null   float64
 19  Decisi

In [3]:
# Target columns to evaluate, one naive Bayes run per target.
Y_list = ['Decision14', 'Decision28', 'Decision10', 'Decision50', 'Decision5']

for y in Y_list:
    # Five indicator features followed by the current target; the target must stay
    # the LAST column because the code below slices features/labels by position.
    selected_columns = ['Support', 'Resistance', 'Hammer', 'ZeroCross', 'SignalCross', y]
    # Rows whose decision is -1 are excluded from both training and testing.
    df_temp = df.loc[df[y] != -1, selected_columns]

    def cal_prior(df, Y):
        """Return the empirical prior P(Y = c) for every class c of column ``Y``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data containing the target column.
        Y : str
            Name of the target column.

        Returns
        -------
        list of float
            Relative class frequencies, ordered by ascending class value.
            Empty when ``df`` has no rows.
        """
        target = df[Y]
        total = len(target)
        ordered_classes = sorted(target.unique())
        # Fraction of rows belonging to each class, in sorted class order.
        return [len(df[target == cls]) / total for cls in ordered_classes]

    def cal_likelihood(df, feature_name, feature_value, Y, label, alpha=0.0):
        """Estimate the categorical likelihood P(feature == value | Y == label).

        The estimate is the relative frequency of ``feature_value`` among the
        rows of ``df`` whose target equals ``label``.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data containing both the feature and the target column.
        feature_name : str
            Column holding the (categorical / discrete) feature.
        feature_value : object
            Value of the feature to look up. A NaN value never matches any row
            because the comparison uses ``==``.
        Y : str
            Name of the target column.
        label : object
            Class value to condition on.
        alpha : float, default 0.0
            Additive (Laplace) smoothing strength. With the default ``0.0`` the
            plain relative frequency is returned; with ``alpha > 0`` the result
            is ``(count + alpha) / (n_class + alpha * n_values)``, where
            ``n_values`` is the number of distinct non-null values of the
            feature in ``df``. Smoothing prevents a single unseen value from
            zeroing out the whole product of likelihoods.

        Returns
        -------
        float
            The estimated likelihood. ``0.0`` is returned (instead of raising
            ``ZeroDivisionError``) when no row carries ``label`` and no
            smoothing is applied.
        """
        class_rows = df[df[Y] == label]
        n_class = len(class_rows)
        n_match = len(class_rows[class_rows[feature_name] == feature_value])

        if alpha > 0:
            # Distinct values are counted on the full frame so that every class
            # is smoothed over the same set of categories.
            n_values = df[feature_name].nunique()
            denominator = n_class + alpha * n_values
            return (n_match + alpha) / denominator if denominator else 0.0

        if n_class == 0:
            # Label absent from the data: the frequency is undefined, report 0.
            return 0.0

        return n_match / n_class

    def naive_bayes(df, X, Y):
        """Predict a class label for every row of ``X`` with categorical naive Bayes.

        For each sample the unnormalised posterior
        ``P(label) * prod_j P(x_j | label)`` is computed for every class found
        in ``df[Y]`` and the class with the largest value is predicted. The
        evidence term (denominator) is ignored because it is the same for all
        classes.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data: feature columns plus the target column ``Y``.
        X : iterable of sequences
            Samples to classify. Each sample lists its feature values in the
            same order as the non-target columns of ``df``.
        Y : str
            Name of the target column in ``df``.

        Returns
        -------
        numpy.ndarray
            Predicted class labels (actual values from ``df[Y]``, not positional
            indices), one per sample. Ties resolve to the smallest label.
        """
        # Every column except the target is a feature; this does not require the
        # target to be the last column.
        features = [col for col in df.columns if col != Y]
        # Sorted once, in the same order cal_prior uses for its output.
        labels = sorted(df[Y].unique())
        prior = cal_prior(df, Y)

        Y_pred = []

        for x in X:
            post_prob = []
            for i, label in enumerate(labels):
                likelihood = 1
                for j, feature in enumerate(features):
                    likelihood *= cal_likelihood(df, feature, x[j], Y, label)
                # Posterior up to the shared normalising constant.
                post_prob.append(likelihood * prior[i])

            # Map the winning position back to its class label so the result is
            # correct even when labels are not exactly 0..k-1.
            Y_pred.append(labels[int(np.argmax(post_prob))])

        return np.array(Y_pred)

    # Imports stay local to this cell, as in the original notebook layout.
    from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score
    from sklearn.model_selection import train_test_split

    # Ordered hold-out split: shuffle=False keeps the original row order, so the
    # last ~29.4% of rows form the test set and random_state has no effect.
    train, test = train_test_split(df_temp, shuffle=False, test_size=0.294, random_state=40)

    # The target is the last column of df_temp; everything before it is a feature.
    X_test = test.iloc[:, :-1].values
    Y_test = test.iloc[:, -1].values
    Y_pred = naive_bayes(train, X=X_test, Y=y)

    conf_matrix = confusion_matrix(Y_test, Y_pred)
    # Micro-averaged F1 over single-label predictions equals plain accuracy.
    micro_f1 = f1_score(Y_test, Y_pred, average="micro")
    balanced_acc = balanced_accuracy_score(Y_test, Y_pred)

    print(f'The confusion matrix of {y} is:\n{conf_matrix}')
    print(f'\nThe f1 score of {y} is:\n{micro_f1}')
    print(f'\nThe balanced accuracy score of {y} is:\n{balanced_acc}')
    print('---------------------------------------')

The confusion matrix of Decision14 is:
[[  0 137]
 [  3 153]]

The f1 score of Decision14 is:
0.5221843003412969

The balanced accuracy score of Decision14 is:
0.49038461538461536
---------------------------------------
The confusion matrix of Decision28 is:
[[ 12 117]
 [  5 155]]

The f1 score of Decision28 is:
0.5778546712802768

The balanced accuracy score of Decision28 is:
0.5308866279069767
---------------------------------------
The confusion matrix of Decision10 is:
[[ 17 129]
 [  9 139]]

The f1 score of Decision10 is:
0.5306122448979592

The balanced accuracy score of Decision10 is:
0.5278137726767863
---------------------------------------
The confusion matrix of Decision50 is:
[[ 12  93]
 [  4 173]]

The f1 score of Decision50 is:
0.6560283687943262

The balanced accuracy score of Decision50 is:
0.5458434221146086
---------------------------------------
The confusion matrix of Decision5 is:
[[ 19 114]
 [  8 155]]

The f1 score of Decision5 is:
0.5878378378378378

The balance

In [4]:
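# Decision5 has the highest balanced accuracy score (about 0.547, computed from its
# confusion matrix above), narrowly ahead of Decision50 (about 0.546).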

In [5]:
# Buy/sell ratio (count of 1s over count of 0s) in the training split.
# NOTE: `train` is whatever the last loop iteration left behind; it is the
# Decision5 split only because 'Decision5' is the final entry of Y_list.
train_decision = train.iloc[:, -1]
n_buy = len(train[train_decision == 1])
n_sell = len(train[train_decision == 0])
ratio = n_buy / n_sell
ratio

1.321311475409836

In [6]:
# Buy/sell ratio (count of 1s over count of 0s) in the testing split.
# NOTE: `test` is whatever the last loop iteration left behind; it is the
# Decision5 split only because 'Decision5' is the final entry of Y_list.
test_decision = test.iloc[:, -1]
n_buy = len(test[test_decision == 1])
n_sell = len(test[test_decision == 0])
ratio = n_buy / n_sell
ratio

1.2255639097744362

In [7]:
# The buy/sell ratio differs between the training data (about 1.32) and the testing data
# (about 1.23), and both splits contain more buy than sell decisions. We will therefore try
# a slightly different naive Bayes classifier, complement naive Bayes, which is designed to
# correct for class imbalance.