In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing   import LabelEncoder

from sklearn.tree            import DecisionTreeClassifier
from sklearn.naive_bayes     import GaussianNB

dfClassified = pd.read_csv("classification_datatTemp.csv").head(100)

In [14]:
# Convert string columns into numeric - 'Hotel Name'
hotelNames = dfClassified["Hotel Name"]
le = LabelEncoder()
numericHotels = list(le.fit_transform(hotelNames.tolist()))
dfClassified["Hotel Name"] = pd.Series((numericName for numericName in numericHotels))

In [15]:
# Convert Dates columns into numeric - 'Checkin Date', 'Snapshot Date', 'DayDiff', 'WeekDay'

# Set Date & numeric columns types
dfClassified['Checkin Date']  = pd.to_datetime(dfClassified['Checkin Date'])
dfClassified['Snapshot Date'] = pd.to_datetime(dfClassified['Snapshot Date'])
dfClassified['DayDiff']       = pd.to_numeric(dfClassified['DayDiff'])

# Change columns to have numeric values for later ML algorithms usage
dfClassified['WeekDay']       = dfClassified['Checkin Date'].apply(lambda x: x.weekday())
dfClassified['Checkin Date']  = dfClassified['Checkin Date'].apply(lambda x: x.timestamp())
dfClassified['Snapshot Date'] = dfClassified['Snapshot Date'].apply(lambda x: x.timestamp())
dfClassified['DayDiff']       = dfClassified['DayDiff'].apply(lambda x: float(x))

In [16]:
data = pd.DataFrame(dfClassified, columns=['Snapshot Date','Checkin Date','DayDiff','Hotel Name','WeekDay'])
target = pd.DataFrame(dfClassified, columns=['Discount Code'])

trainData, testData, trainTarget, testTarget = train_test_split(data, target)

In [17]:
dt = DecisionTreeClassifier()
dt.fit(trainData, trainTarget)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(dt.score(trainData, trainTarget)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'    .format(dt.score(testData, testTarget)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.24


In [18]:
nb = GaussianNB()
nb.fit(trainData, trainTarget)

print('Accuracy of Naive Bayes classifier on training set: {:.2f}'.format(nb.score(trainData, trainTarget)))
print('Accuracy of Naive Bayes classifier on test set: {:.2f}'    .format(nb.score(testData, testTarget)))

Accuracy of Naive Bayes classifier on training set: 0.37
Accuracy of Naive Bayes classifier on test set: 0.36


  y = column_or_1d(y, warn=True)


In [19]:
##### The differences between Algorithms
As we see, Decision tree algorithm is highly superior than Naive Bayes algorithm in our case.

General information
-------------------
The Naive Bayes based on the Bayes law in statistics.
The algorithm is considered to be much easier to implement than the decision tree and generic for many different cases.
It works best when parameters are independent,
In our case, There is dependent between parameters [like Checkin Date & WeekDay), and it is probably explains the results.

The main disadvantege of the decision tree algorithm, is that it tends to overfit that data - 
as in the classifier has such a strong structure from the test data, that a new piece of information unknown to it might cause it to give incorrect results.
this can be handled by pruning.

SyntaxError: invalid syntax (<ipython-input-19-f6a8fb32947a>, line 2)

In [20]:
# ['Snapshot Date', 'Checkin Date', 'DayDiff', 'Hotel Name', 'WeekDay']
dt.feature_importances_

array([0.        , 0.11484264, 0.09524099, 0.66381057, 0.1261058 ])

In [21]:

Parameter influence order - order by desc.
Hotel Name
*Snapshot Date
*Checkin Date
*DayDiff
*WeekDay

SyntaxError: invalid syntax (<ipython-input-21-850f9cd22291>, line 2)

In [22]:
numOfClasses = dfClassified["Discount Code"].unique().size

In [23]:
from sklearn.metrics import confusion_matrix
dtPredicated = dt.predict(testData)
confusion_matrix = confusion_matrix(testTarget, dtPredicated)
confusion_matrix

array([[1, 4, 1, 0],
       [2, 3, 4, 1],
       [2, 3, 2, 1],
       [1, 0, 0, 0]], dtype=int64)

In [24]:
dtPredicated

array([1., 3., 2., 1., 2., 1., 4., 2., 2., 3., 3., 1., 4., 3., 2., 2., 2.,
       3., 1., 2., 3., 2., 3., 1., 2.])

In [25]:
testTarget

Unnamed: 0,Discount Code
84,2.0
21,2.0
77,1.0
45,1.0
90,1.0
7,4.0
39,3.0
30,1.0
36,2.0
22,2.0


In [26]:
confusion_matrix.sum(axis=0)

array([ 6, 10,  7,  2], dtype=int64)

In [27]:
confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)

array([5, 7, 6, 1], dtype=int64)

In [30]:
tp = np.diag(confusion_matrix)
fp = confusion_matrix.sum(axis=0) - np.diag(confusion_matrix)  
fn = confusion_matrix.sum(axis=1) - np.diag(confusion_matrix)
tn = confusion_matrix.sum() - (tp + fp + fn)

fnPerClass = []
fpPerClass = []
for i in range(numOfClasses):
    fnPerClass.insert(i, float(fn[i]) / (tp[i]+fn[i]))
    fpPerClass.insert(i, float(fp[i]) / (fp[i]+tn[i]))
    

print('False Negative')
print(fnPerClass)
print('False Poisitive')
print(fpPerClass)

False Negative
[0.8333333333333334, 0.7, 0.75, 1.0]
False Poisitive
[0.2631578947368421, 0.4666666666666667, 0.29411764705882354, 0.08333333333333333]


In [35]:
testTarget.as_matrix()

  """Entry point for launching an IPython kernel.


array([[2.],
       [2.],
       [1.],
       [1.],
       [1.],
       [4.],
       [3.],
       [1.],
       [2.],
       [2.],
       [3.],
       [3.],
       [2.],
       [1.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [3.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.]])

In [29]:
# Lets calculate the ROC for each discount code
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

exp = label_binarize(testTarget.as_matrix(), classes=[1, 2, 3, 4])
pred = label_binarize(dtPredicated, classes=[1, 2, 3, 4])

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(numOfClasses):
    fpr[i], tpr[i], _ = roc_curve(exp[:, i], pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

for i in range(numOfClasses):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic ' + 'discount code ' + str(i+1))
    plt.legend(loc="lower right")
    plt.show()

  


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [39]:
roc_auc

{0: 0.45175438596491235,
 1: 0.4166666666666667,
 2: 0.4779411764705882,
 3: 0.4583333333333333}

In [41]:
fpr

{0: array([0.        , 0.26315789, 1.        ]),
 1: array([0.        , 0.46666667, 1.        ]),
 2: array([0.        , 0.29411765, 1.        ]),
 3: array([0.        , 0.08333333, 1.        ])}

In [40]:
tpr

{0: array([0.        , 0.16666667, 1.        ]),
 1: array([0. , 0.3, 1. ]),
 2: array([0.  , 0.25, 1.  ]),
 3: array([0., 0., 1.])}