In [52]:
import pandas as pd
import numpy as np

In [53]:
# Load the cleaned dataset and reduce it to the feature columns plus the
# Decision28 target (kept as the last column, which later cells rely on).
df = (
    pd.read_csv('cleaned_data.csv')
    .loc[:, ['Support', 'Resistance', 'Hammer', 'Williams%R', 'ZeroCross',
             'SignalCross', 'Decision14', 'Decision28']]
    # we exclude the rows with a decision of -1
    .loc[lambda frame: frame["Decision28"] != -1]
    # Decision14 is not used as a feature or target here
    .drop(columns='Decision14')
)

In [54]:
def cal_prior(df, Y):
    """Return the empirical prior P(Y = c) for every class c found in ``df[Y]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the target column.
    Y : str
        Name of the target column.

    Returns
    -------
    list of float
        Relative frequency of each class, ordered by ascending class label.
    """
    target = df[Y]
    n_rows = len(target)

    # Count the rows of each class and normalise by the total number of rows.
    return [int((target == label).sum()) / n_rows for label in sorted(target.unique())]

In [55]:
def cal_likelihood(df, feature_name, feature_value, Y, label):
    """Estimate P(feature_name == feature_value | Y == label) by relative frequency.

    The feature is treated as categorical: the estimate is the share of rows
    of class ``label`` whose ``feature_name`` equals ``feature_value``.
    No smoothing is applied, so a value never seen together with ``label``
    yields 0.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data containing the feature and target columns.
    feature_name : str
        Column holding the feature.
    feature_value : object
        Value of the feature to look up.
    Y : str
        Name of the target column.
    label : object
        Class to condition on.

    Returns
    -------
    float
        The conditional relative frequency, or 0.0 when ``df`` contains no
        row of class ``label``.
    """
    class_rows = df[df[Y] == label]
    n_class = len(class_rows)

    # Without any row of this class the ratio is undefined; report zero
    # support rather than raising ZeroDivisionError.
    if n_class == 0:
        return 0.0

    n_match = int((class_rows[feature_name] == feature_value).sum())
    return n_match / n_class

In [56]:
def naive_bayes(df, X, Y):
    """Classify each row of ``X`` with a categorical naive Bayes model fitted on ``df``.

    For every sample the unnormalised posterior
    ``P(Y = c) * prod_j P(x_j | Y = c)`` is evaluated for each class ``c``
    and the class with the largest value is predicted. The evidence term is
    omitted because it is the same for every class.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data: the feature columns plus the target column ``Y``.
    X : iterable of sequences
        Samples to classify. Position ``j`` of a sample must hold the value
        of the ``j``-th column of ``df`` once ``Y`` is left out.
    Y : str
        Name of the target column in ``df``.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each sample, in the order of ``X``.
    """
    # Select features by name so the target does not have to be the last column.
    features = [column for column in df.columns if column != Y]

    # Both depend only on the training data, so compute them once.
    # cal_prior orders its result by sorted label, matching `labels`.
    labels = sorted(list(df[Y].unique()))
    prior = cal_prior(df, Y)

    Y_pred = []

    for x in X:
        post_prob = []
        for label, p_label in zip(labels, prior):
            likelihood = 1
            for j, feature in enumerate(features):
                likelihood *= cal_likelihood(df, feature, x[j], Y, label)

            # Calculate posterior probability (we ignore denominator)
            post_prob.append(likelihood * p_label)

        # Map the winning position back to its class label; the bare argmax
        # index is only correct when the labels happen to be 0..k-1.
        # On ties argmax keeps the first maximum, i.e. the smallest label.
        Y_pred.append(labels[int(np.argmax(post_prob))])

    return np.array(Y_pred)

In [57]:
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split

# Ordered hold-out split: with shuffle=False the first rows form the training
# set and the trailing 29.4% the test set.
train, test = train_test_split(df, shuffle=False, test_size=0.294, random_state=40)

# The target is the last column; everything before it is a feature.
X_test, Y_test = test.iloc[:, :-1].values, test.iloc[:, -1].values
Y_pred = naive_bayes(train, X=X_test, Y='Decision28')

# Confusion matrix, micro-averaged F1 and balanced accuracy of the
# hand-written classifier on the held-out rows.
print(confusion_matrix(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred, average='micro'))
print(balanced_accuracy_score(Y_test, Y_pred))

[[ 12 117]
 [  5 155]]
0.5778546712802768
0.5308866279069767


## Comparing results with pre-built models
### taken from https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.CategoricalNB.html#sklearn.naive_bayes.CategoricalNB

In [58]:
from sklearn.naive_bayes import CategoricalNB

In [59]:
# Reference model: scikit-learn's CategoricalNB fitted on the same training
# split (every column but the last is a feature, the last one is the target).
classifier = CategoricalNB()
classifier.fit(train.iloc[:,:-1].values, train.iloc[:,-1].values)

CategoricalNB()

In [60]:
# Predictions of the scikit-learn model for the test-split features.
Y_pred2 = classifier.predict(test.iloc[:, :-1].values)

In [61]:
# Same confusion matrix and micro-averaged F1 as computed for the
# hand-written classifier, now for the CategoricalNB predictions.
print(confusion_matrix(Y_test, Y_pred2))
print(f1_score(Y_test, Y_pred2, average='micro'))

[[ 12 117]
 [  5 155]]
0.5778546712802768


In [62]:
# Score of the fitted CategoricalNB on the test split
# (per the scikit-learn docs this is mean accuracy - TODO confirm).
classifier.score(test.iloc[:, :-1].values,Y_test)

0.5778546712802768

In [63]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

In [64]:
# Multinomial naive Bayes fitted on the same training split.
mnb = MultinomialNB()
mnb.fit(train.iloc[:,:-1].values, train.iloc[:,-1].values)

# Evaluate on the held-out rows, reusing the X_test / Y_test arrays built for
# the hand-written model instead of re-slicing `test` for every call.
print(mnb.score(X_test, Y_test))
print(balanced_accuracy_score(Y_test, mnb.predict(X_test)))

0.5813148788927336
0.5347625968992248


In [65]:
# Complement naive Bayes fitted on the same training split.
cnb = ComplementNB()
cnb.fit(train.iloc[:,:-1].values, train.iloc[:,-1].values)

# Evaluate on the held-out rows, reusing the X_test / Y_test arrays built for
# the hand-written model instead of re-slicing `test` for every call.
print(cnb.score(X_test, Y_test))
print(balanced_accuracy_score(Y_test, cnb.predict(X_test)))

0.532871972318339
0.5646075581395349


In [66]:
# Keep the MultinomialNB test-split predictions for inspection in later cells.
pred = mnb.predict(test.iloc[:, :-1].values)

In [67]:
# ComplementNB test-split predictions and their confusion matrix against the
# true test labels.
pred2 = cnb.predict(test.iloc[:, :-1].values)
print(confusion_matrix(Y_test,pred2))

[[111  18]
 [117  43]]


In [68]:
# Confusion matrix of the MultinomialNB predictions against the true test labels.
print(confusion_matrix(Y_test,pred))

[[ 13 116]
 [  5 155]]


In [69]:
# Class labels stored on the fitted MultinomialNB model.
mnb.classes_

array([0, 1], dtype=int64)

In [70]:
# Display the raw MultinomialNB predictions for the test split.
pred

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1], dtype=int64)

In [71]:
# Number of rows per Decision28 value in the training split.
train['Decision28'].value_counts()

1    434
0    258
Name: Decision28, dtype: int64

In [72]:
# Number of rows per Decision28 value in the test split.
test['Decision28'].value_counts()

1    160
0    129
Name: Decision28, dtype: int64

In [73]:
# We can see that the buy/sell ratio differs between the training data (label 1: 434, label 0: 258, i.e. about 63% label 1) and the testing data (label 1: 160, label 0: 129, i.e. about 55% label 1).