In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import LabelEncoder

from sklearn.tree            import DecisionTreeClassifier
from sklearn.naive_bayes     import GaussianNB

# Load the classification dataset and keep only its first 100 rows.
dfClassified = pd.read_csv("classification_datatTemp.csv").head(100)

In [None]:
# Convert string columns into numeric - 'Hotel Name'
# LabelEncoder maps every distinct hotel name to an integer code; `le` keeps
# the fitted name <-> code mapping.
hotelNames = dfClassified["Hotel Name"]
le = LabelEncoder()
numericHotels = list(le.fit_transform(hotelNames.tolist()))
# Assign the codes positionally. Wrapping them in a fresh pd.Series would make
# pandas align on a new 0..n-1 index and silently write NaN into every row
# whose index label is not in that range (e.g. after filtering or sampling).
dfClassified["Hotel Name"] = numericHotels

In [None]:
# Convert Dates columns into numeric - 'Checkin Date', 'Snapshot Date', 'DayDiff', 'WeekDay'
#
# NOTE: the date columns are overwritten in place with float seconds, so this
# cell is meant to run once per load of dfClassified. Re-running it would
# re-parse those floats as datetimes and yield wrong WeekDay values.

# Parse the raw columns once and keep the parsed Series in local names
checkinDates  = pd.to_datetime(dfClassified['Checkin Date'])
snapshotDates = pd.to_datetime(dfClassified['Snapshot Date'])

# Change columns to have numeric values for later ML algorithms usage
# WeekDay must be derived from the parsed dates (Monday=0 ... Sunday=6) before
# 'Checkin Date' is replaced by its numeric form.
dfClassified['WeekDay']       = checkinDates.dt.weekday
# Seconds since the Unix epoch, as floats.
dfClassified['Checkin Date']  = checkinDates.apply(lambda stamp: stamp.timestamp())
dfClassified['Snapshot Date'] = snapshotDates.apply(lambda stamp: stamp.timestamp())
dfClassified['DayDiff']       = pd.to_numeric(dfClassified['DayDiff']).astype(float)

In [None]:
# Feature matrix (numeric columns prepared above) and the label to predict.
featureColumns = ['Snapshot Date','Checkin Date','DayDiff','Hotel Name','WeekDay']
data = pd.DataFrame(dfClassified, columns=featureColumns)
target = pd.DataFrame(dfClassified, columns=['Discount Code'])

# Fix the seed so the train/test partition - and every accuracy, confusion
# matrix and ROC curve derived from it - is identical on each full re-run.
trainData, testData, trainTarget, testTarget = train_test_split(data, target, random_state=42)

In [None]:
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can differ between runs whenever several splits tie.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(trainData, trainTarget)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dt.score(trainData, trainTarget)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'    .format(dt.score(testData, testTarget)))

In [None]:
nb = GaussianNB()
# GaussianNB expects a 1-D target. trainTarget is a single-column frame, so
# flatten it explicitly instead of relying on scikit-learn's implicit
# conversion (which emits a DataConversionWarning).
nb.fit(trainData, np.ravel(trainTarget))

print('Accuracy of Naive Bayes classifier on training set: {:.2f}'.format(nb.score(trainData, trainTarget)))
print('Accuracy of Naive Bayes classifier on test set: {:.2f}'    .format(nb.score(testData, testTarget)))

In [None]:
##### The differences between Algorithms
As we can see, the Decision Tree algorithm clearly outperforms the Naive Bayes algorithm in our case.

General information
-------------------
Naive Bayes is based on Bayes' theorem from statistics.
The algorithm is considered much easier to implement than a decision tree, and generic enough for many different cases.
It works best when the parameters are independent.
In our case there is dependence between parameters (like Checkin Date & WeekDay), which probably explains the results.

The main disadvantage of the decision tree algorithm is that it tends to overfit the data -
that is, the classifier fits the structure of the training data so closely that a new piece of information unknown to it might cause it to give incorrect results.
This can be handled by pruning.

In [None]:
# Label each importance with the feature it belongs to (the columns the tree
# was fitted on) and list them from most to least influential, instead of
# relying on a hand-maintained comment for the column order.
pd.Series(dt.feature_importances_, index=trainData.columns).sort_values(ascending=False)

In [None]:

Parameter influence order (descending):
* Hotel Name
* Snapshot Date
* Checkin Date
* DayDiff
* WeekDay

In [None]:
# Number of distinct discount codes (class labels) present in the loaded rows.
discountCodes = dfClassified["Discount Code"]
numOfClasses = len(discountCodes.unique())

In [None]:
def printFnAndFpIndices(confusion_matrix):
    """Print the per-class false-negative and false-positive rates.

    Parameters
    ----------
    confusion_matrix : array-like of shape (n_classes, n_classes)
        Square matrix in which entry [i, j] counts the samples of actual
        class i that were predicted as class j.

    Output
    ------
    Prints 'False Negative' followed by the list FN / (TP + FN), then
    'False Poisitive' followed by the list FP / (FP + TN), one value per
    class in matrix order. A rate whose denominator is zero is printed as nan.
    """
    counts = np.asarray(confusion_matrix, dtype=float)

    tp = np.diag(counts)
    fp = counts.sum(axis=0) - tp   # predicted as the class, actually another
    fn = counts.sum(axis=1) - tp   # actually the class, predicted as another
    tn = counts.sum() - (tp + fp + fn)

    # Work on whole arrays so the number of classes comes from the matrix
    # itself rather than from a global that may not match its size.
    # A class with no actual (or no negative) samples gives 0/0 -> nan;
    # silence the resulting numpy warning.
    with np.errstate(divide='ignore', invalid='ignore'):
        fnPerClass = (fn / (tp + fn)).tolist()
        fpPerClass = (fp / (fp + tn)).tolist()

    print('False Negative')
    print(fnPerClass)
    print('False Poisitive')
    print(fpPerClass)

In [None]:
from sklearn.metrics import confusion_matrix

# Use the same, complete label set for both models so each matrix has one
# row/column per discount code even when a code is absent from the test split.
classLabels = np.sort(dfClassified["Discount Code"].unique())

dtPredicated = dt.predict(testData)
dt_confusion_matrix = confusion_matrix(testTarget, dtPredicated, labels=classLabels)
print("indices for decision tree")
printFnAndFpIndices(dt_confusion_matrix)

nbPredicated = nb.predict(testData)
# Build the Naive Bayes matrix from the Naive Bayes predictions; it used to be
# built from dtPredicated, so both reports showed the decision tree's numbers.
nb_confusion_matrix = confusion_matrix(testTarget, nbPredicated, labels=classLabels)
print("indices for naive base")
printFnAndFpIndices(nb_confusion_matrix)


In [None]:
# Lets calculate the ROC for each discount code
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
def printRocGraph(expected, predicated, classes=(1, 2, 3, 4)):
    """Plot a one-vs-rest ROC curve for each discount code, one figure per code.

    Parameters
    ----------
    expected : array-like
        True discount codes (Series, single-column DataFrame or array).
    predicated : array-like
        Predicted discount codes, in the same order as `expected`.
    classes : sequence, optional
        Discount codes to plot. Defaults to (1, 2, 3, 4). At least three codes
        are expected: label_binarize returns a single column for two classes.

    Notes
    -----
    Both inputs are class labels rather than scores, so each curve is drawn
    from the binarized predictions for that code.
    """
    classes = list(classes)

    # np.ravel accepts Series, DataFrame and ndarray alike; the former
    # `.as_matrix()` call no longer exists in pandas >= 1.0.
    exp = label_binarize(np.ravel(expected), classes=classes)
    pred = label_binarize(np.ravel(predicated), classes=classes)

    # Iterate over the classes that were actually binarized, so the column
    # index and the code shown in the title can never drift apart.
    for column, code in enumerate(classes):
        fpr, tpr, _ = roc_curve(exp[:, column], pred[:, column])
        roc_auc = auc(fpr, tpr)

        plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], 'k--')   # chance diagonal
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic ' + 'discount code ' + str(code))
        plt.legend(loc="lower right")
        plt.show()
        
print("Roc curve graph for decision tree al")
printRocGraph(testTarget, dtPredicated)
# Announce the second model as well: the figure titles only name the discount
# code, so without this line the two sets of plots cannot be told apart.
print("Roc curve graph for naive bayes algorithm")
printRocGraph(testTarget, nbPredicated)
    
