# In-Vehicle coupon recommendation

We are going to look at Decision Trees.

In [2]:
pip install patsy





[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
from pandas import Series, DataFrame
import pandas as pd
from patsy import dmatrices
import warnings
%pylab inline
warnings.filterwarnings('ignore')

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


In [4]:
# Read the coupon dataset (path is relative to the working directory)
# and show the names of its columns.
coupon_csv_path = 'in-vehicle-coupon-recommendation.CSV'
df = pd.read_csv(coupon_csv_path)
df.columns.values

array(['destination', 'passanger', 'weather', 'temperature', 'time',
       'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20',
       'Restaurant20To50', 'toCoupon_GEQ5min', 'toCoupon_GEQ15min',
       'toCoupon_GEQ25min', 'direction_same', 'direction_opp', 'Y'],
      dtype=object)

How many rows?

In [5]:
# Number of rows in the loaded frame (first entry of the shape tuple).
df.shape[0]

12079

Show me one.

In [6]:
# Display the first record, with every column, as a Series.
df.iloc[0, :]

destination                        No Urgent Place
passanger                                    Alone
weather                                      Sunny
temperature                                     55
time                                           2PM
coupon                             Restaurant(<20)
expiration                                      1d
gender                                        Male
age                                             21
maritalStatus                               Single
has_children                                     0
education                         Bachelors degree
occupation              Architecture & Engineering
income                             $62500 - $74999
Bar                                          never
CoffeeHouse                                  less1
CarryAway                                      4~8
RestaurantLessThan20                           4~8
Restaurant20To50                             less1
toCoupon_GEQ5min               

In [7]:
# Class balance of the target column 'Y'.
df.loc[:, 'Y'].value_counts()

Y
1    6877
0    5202
Name: count, dtype: int64

### Create the design matrices

In [8]:
# Patsy formula: 'Y' is the response and every other column is a predictor.
# The leading '0 +' requests a design matrix without an intercept column.
# Adjacent string literals are concatenated into one formula string.
design_formula = (
    'Y ~ 0 + destination + passanger + weather + temperature + time + coupon + '
    ' expiration + gender + age + maritalStatus + has_children + education + '
    ' occupation + income + Bar + CoffeeHouse + CarryAway + '
    ' RestaurantLessThan20 + Restaurant20To50 + toCoupon_GEQ5min + '
    ' toCoupon_GEQ15min + toCoupon_GEQ25min + direction_same + direction_opp'
)
Y, X = dmatrices(design_formula, df, return_type='dataframe')

# Pull the response out of its one-column frame as a plain array.
y = Y['Y'].values

In [9]:
# Peek at the first five rows of the design matrix.
X.iloc[:5]

Unnamed: 0,destination[Home],destination[No Urgent Place],destination[Work],passanger[T.Friend(s)],passanger[T.Kid(s)],passanger[T.Partner],weather[T.Snowy],weather[T.Sunny],time[T.10PM],time[T.2PM],...,Restaurant20To50[T.gt8],Restaurant20To50[T.less1],Restaurant20To50[T.never],temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,55.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,80.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,80.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,80.0,0.0,1.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,80.0,0.0,1.0,0.0,0.0,0.0,1.0


### Split data into train and test

In [10]:
from sklearn.model_selection import train_test_split

# 70/30 train/test partition; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, random_state=1, test_size=0.3)
X_train, X_test, y_train, y_test = split_parts

### Set up classifier

In [11]:
from sklearn import tree

# Unrestricted decision tree using entropy as the split criterion.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so an unseeded tree (and its accuracy) can differ between runs.
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=1)

### Fit the data

In [12]:
# Grow the tree on the training split; the return value of fit() is kept in `result`.
result = model.fit(X=X_train, y=y_train)

In [13]:
from sklearn import metrics

# Score the unrestricted tree on the same rows it was trained on.
prediction_train = model.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, prediction_train)
print(train_accuracy)

0.9994086339444116


### Accuracy on test set

In [14]:
# Score the unrestricted tree on the held-out test rows.
prediction = model.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, prediction)
print(test_accuracy)

0.6633554083885209


This is far worse than the 99.9% accuracy we measured on the training set. Clearly, we are **overfitting**.

One way to avoid overfitting is to ensure that trees never become too deep, via the *max\_depth* argument.

In [15]:
# Same entropy-based tree, but capped at depth 10 to limit overfitting.
# random_state is fixed because scikit-learn randomly permutes the features at
# every split, so an unseeded tree (and its accuracy) can differ between runs.
model2 = tree.DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=1)
result = model2.fit(X_train, y_train)

### Accuracy

In [16]:
# Score the depth-limited tree on the training rows.
prediction_train = model2.predict(X_train)
train_accuracy = metrics.accuracy_score(y_train, prediction_train)
print(train_accuracy)

0.7891188645771733


In [17]:
# Score the depth-limited tree on the held-out test rows.
prediction = model2.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, prediction)
print(test_accuracy)

0.6964679911699779
