# in-vehicle coupon recommendation

We will look at the Naive Bayes classifier.

In [4]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from patsy import dmatrices
%pylab inline
%matplotlib inline

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [5]:
# We will ignore some silly warnings that pop up due to scikit-learn
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Load the coupon dataset and preview its first five rows
df = pd.read_csv('in-vehicle-coupon-recommendation.CSV')
df.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Male,21,Single,...,less1,4~8,4~8,less1,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Male,21,Single,...,less1,4~8,4~8,less1,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Bar,1d,Male,21,Single,...,less1,4~8,4~8,less1,1,0,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Male,21,Single,...,less1,4~8,4~8,less1,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Male,21,Single,...,less1,4~8,4~8,less1,1,0,0,0,1,0


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
count,12079.0,12079.0,12079.0,12079.0,12079.0,12079.0,12079.0,12079.0
mean,63.334713,0.408478,1.0,0.561222,0.119381,0.215167,0.784833,0.569335
std,19.133246,0.491573,0.0,0.496258,0.324249,0.410955,0.410955,0.49519
min,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,55.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,80.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,80.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Create the target

In [9]:
# Copy the label column 'Y' into a column named 'target'
df['target'] = df['Y']

In [10]:
# Class balance: number of rows in each target class
df['target'].value_counts()

1    6877
0    5202
Name: target, dtype: int64

In [11]:
# Total number of rows in the dataset
df.shape[0]

12079

#### Create the design matrices

In [12]:
# List every column name currently in the dataframe
print(df.columns.values)

['destination' 'passanger' 'weather' 'temperature' 'time' 'coupon'
 'expiration' 'gender' 'age' 'maritalStatus' 'has_children' 'education'
 'occupation' 'income' 'Bar' 'CoffeeHouse' 'CarryAway'
 'RestaurantLessThan20' 'Restaurant20To50' 'toCoupon_GEQ5min'
 'toCoupon_GEQ15min' 'toCoupon_GEQ25min' 'direction_same' 'direction_opp'
 'Y' 'target']


For Naive Bayes, we want all dummy variables, but unfortunately standard patsy doesn't necessarily give us that. So, just for the Naive Bayes classifier, we will use a pandas-specific way of dummy encoding.

In [13]:
# Every predictor is treated as categorical (including the numeric-looking
# ones such as temperature and the 0/1 flags); 'Y' and 'target' are excluded.
categorical_columns = [
    'destination',
    'passanger',
    'weather',
    'temperature',
    'time',
    'coupon',
    'expiration',
    'gender',
    'age',
    'maritalStatus',
    'has_children',
    'education',
    'occupation',
    'income',
    'Bar',
    'CoffeeHouse',
    'CarryAway',
    'RestaurantLessThan20',
    'Restaurant20To50',
    'toCoupon_GEQ5min',
    'toCoupon_GEQ15min',
    'toCoupon_GEQ25min',
    'direction_same',
    'direction_opp',
]

In [14]:
# Use pandas to create one 0/1 dummy column per (variable, level) pair.
# dtype=int is pinned explicitly: pandas >= 2.0 returns bool dummies by default,
# and patsy treats bool columns as categorical, which would change the
# columns of the design matrix built from these names.
df_dummies = pd.get_dummies(df[categorical_columns],
                            prefix=categorical_columns,
                            columns=categorical_columns,
                            dtype=int)
# Names of the generated dummy columns, used to build the model formula
dummy_column_names = df_dummies.columns.values
dummy_column_names[:10]

array(['destination_Home', 'destination_No Urgent Place',
       'destination_Work', 'passanger_Alone', 'passanger_Friend(s)',
       'passanger_Kid(s)', 'passanger_Partner', 'weather_Rainy',
       'weather_Snowy', 'weather_Sunny'], dtype=object)

In [15]:
# Append the dummy columns side by side with the original columns
df2 = pd.concat(objs=[df, df_dummies], axis='columns')

In [16]:
# Wrap each dummy column name in Q("...") so patsy accepts names that
# contain spaces and punctuation, then join them into an intercept-free formula.
quoted_terms = ['Q("{}")'.format(name) for name in dummy_column_names]
formula = 'target ~ 0 + {}'.format(' + '.join(quoted_terms))
print(formula)

target ~ 0 + Q("destination_Home") + Q("destination_No Urgent Place") + Q("destination_Work") + Q("passanger_Alone") + Q("passanger_Friend(s)") + Q("passanger_Kid(s)") + Q("passanger_Partner") + Q("weather_Rainy") + Q("weather_Snowy") + Q("weather_Sunny") + Q("temperature_30") + Q("temperature_55") + Q("temperature_80") + Q("time_10AM") + Q("time_10PM") + Q("time_2PM") + Q("time_6PM") + Q("time_7AM") + Q("coupon_Bar") + Q("coupon_Carry out & Take away") + Q("coupon_Coffee House") + Q("coupon_Restaurant(20-50)") + Q("coupon_Restaurant(<20)") + Q("expiration_1d") + Q("expiration_2h") + Q("gender_Female") + Q("gender_Male") + Q("age_21") + Q("age_26") + Q("age_31") + Q("age_36") + Q("age_41") + Q("age_46") + Q("age_50plus") + Q("age_below21") + Q("maritalStatus_Divorced") + Q("maritalStatus_Married partner") + Q("maritalStatus_Single") + Q("maritalStatus_Unmarried partner") + Q("maritalStatus_Widowed") + Q("has_children_0") + Q("has_children_1") + Q("education_Associates degree") + Q("edu

In [17]:
# Build the design matrices from the formula: Y holds the 'target' column,
# X holds one column per dummy term (the formula's `0 +` drops the intercept).
Y, X = dmatrices(formula, df2, return_type='dataframe')
# Pull the response out as a plain 1-D array
y = Y['target'].values

#### Set up the classifier

In [18]:
from sklearn import naive_bayes
# Multinomial Naive Bayes with default hyperparameters
model = naive_bayes.MultinomialNB()

#### Fit the model

In [19]:
# Fit on the full design matrix (no train/test split is made in this notebook)
model.fit(X, y)

#### Test on some training data

In [21]:
# Compare predictions with the true labels on the first ten training rows
first_rows = slice(0, 10)
print('Prediction', model.predict(X[first_rows]), sep='\n')
print('Actual', y[first_rows], sep='\n')

Prediction
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
Actual
[1. 0. 1. 0. 0. 0. 0. 1. 0. 1.]


In [22]:
from sklearn import metrics

# Accuracy on the same rows the model was trained on
prediction_train = model.predict(X)
train_accuracy = metrics.accuracy_score(y_true=y, y_pred=prediction_train)
print(train_accuracy)

0.6679360874244556


#### Class priors

* Class priors are encoded in the `model` variable, after fitting the model to the training data. 
* The priors are available from `model.class_log_prior_`
    * This holds the _logarithm_ of each class's fraction in the training data, not the fraction itself.

In [23]:
# class_log_prior_ stores log-probabilities, so exponentiate to recover the priors.
# Index 0 is the negative class and index 1 the positive class.
negative_prior = np.exp(model.class_log_prior_[0])
positive_prior = np.exp(model.class_log_prior_[1])
# Pass label and value to a single print() so they appear on one line
# (the old `print('...',)` trailing comma is a Python 2 idiom with no effect in Python 3).
print('Prior probability for the negative class is', negative_prior)
print('Prior probability for the positive class is', positive_prior)

Prior probability for the negative class is
0.4306647901316337
Prior probability for the positive class is
0.5693352098683663


#### What are the likelihoods?

There is a special attribute called `model.feature_log_prob_` that is available after fitting the training data.

In [24]:
# Log-likelihoods of the first five features under class 1 (the positive class)
model.feature_log_prob_[1, :5]

array([-4.65650045, -3.77562887, -4.68494451, -3.80565793, -4.35081694])

In [25]:
# Convert the per-class log-likelihoods back to probabilities
positive_likelihood = np.exp(model.feature_log_prob_[1])
negative_likelihood = np.exp(model.feature_log_prob_[0])

# One row per feature, labelled with the design-matrix column names
feature_stats = DataFrame(
    {
        'Positive class': positive_likelihood,
        'Negative class': negative_likelihood,
    },
    index=X.columns.values,
)
feature_stats.head()

Unnamed: 0,Positive class,Negative class
"Q(""destination_Home"")",0.0095,0.012147
"Q(""destination_No Urgent Place"")",0.022923,0.017493
"Q(""destination_Work"")",0.009233,0.012011
"Q(""passanger_Alone"")",0.022245,0.026384
"Q(""passanger_Friend(s)"")",0.012896,0.008162


#### Which features are most important?

A feature is important (i.e. very **discriminative**) if it is very _likely_ in one class but very _unlikely_ in the other.

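Look at the ratio of the feature's likelihood in the positive class to its likelihood in the negative class:

$$\mbox{Ratio} = \frac{P(\mbox{feature} \mid \mbox{positive class})}{P(\mbox{feature} \mid \mbox{negative class})}$$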

$$\mbox{If Ratio } \gg 1 \Rightarrow \mbox{presence of feature implies positive class}$$

$$\mbox{If Ratio } \ll 1 \Rightarrow \mbox{presence of feature implies negative class}$$

In [26]:
# Likelihood ratio: how much more likely a feature is under the positive class
feature_stats['Positive/Negative Ratio'] = (
    feature_stats['Positive class'].div(feature_stats['Negative class'])
)
feature_stats.head(3)

Unnamed: 0,Positive class,Negative class,Positive/Negative Ratio
"Q(""destination_Home"")",0.0095,0.012147,0.782025
"Q(""destination_No Urgent Place"")",0.022923,0.017493,1.310388
"Q(""destination_Work"")",0.009233,0.012011,0.768703


How can we measure whether the ratio is far from 1?

$$\Rightarrow \log\left(Ratio\right) \mbox{ is very large in magnitude}$$

In [27]:
# |log(ratio)| is large whenever the ratio is far from 1 in either direction
log_ratio = np.log(feature_stats['Positive/Negative Ratio'])
feature_stats['Importance'] = log_ratio.abs()
feature_stats.head(3)

Unnamed: 0,Positive class,Negative class,Positive/Negative Ratio,Importance
"Q(""destination_Home"")",0.0095,0.012147,0.782025,0.245868
"Q(""destination_No Urgent Place"")",0.022923,0.017493,1.310388,0.270323
"Q(""destination_Work"")",0.009233,0.012011,0.768703,0.26305


How can we find the top features?

In [28]:
# Ten most discriminative features, highest importance first
feature_stats.sort_values('Importance', ascending=False).head(10)

Unnamed: 0,Positive class,Negative class,Positive/Negative Ratio,Importance
"Q(""coupon_Carry out & Take away"")",0.01019,0.004793,2.125822,0.754158
"Q(""occupation_Healthcare Practitioners & Technical"")",0.000969,0.000512,1.891514,0.637378
"Q(""coupon_Bar"")",0.004777,0.009011,0.530161,0.634574
"Q(""education_Some High School"")",0.000387,0.000208,1.862414,0.621873
"Q(""coupon_Restaurant(<20)"")",0.011395,0.006186,1.842085,0.610898
"Q(""occupation_Production Occupations"")",0.000381,0.000216,1.765413,0.568385
"Q(""occupation_Healthcare Support"")",0.001029,0.000592,1.738148,0.55282
"Q(""toCoupon_GEQ25min_1"")",0.003796,0.006538,0.580651,0.543606
"Q(""occupation_Construction & Extraction"")",0.000648,0.000392,1.652179,0.502095
"Q(""coupon_Restaurant(20-50)"")",0.003833,0.00629,0.609327,0.4954


$$\mbox{If Ratio } \gg 1 \Rightarrow \mbox{presence of feature implies positive class}$$

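* So "coupon_Carry out & Take away" and "coupon_Restaurant(<20)" (ratio $\approx 2$) are markers of the positive class (Y = 1, i.e., the coupon was accepted)
* Conversely, "coupon_Bar" and "toCoupon_GEQ25min_1" (ratio $\approx 0.5$ to $0.6$) are markers of the negative class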