In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
df = pd.read_csv("data_test.csv")

In [3]:
df.shape
df.tail(5)
# df.isnull().values.any()

Unnamed: 0,age,gender,budget,price,cuisine_type,rating
15588,35_49,male,30_to_50,under_20,Deli/Sandwiches/Fast Food,satisfactory
15589,35_49,female,under_20,20_to_30,Deli/Sandwiches/Fast Food,dislike
15590,35_49,male,under_20,20_to_30,Continental,very good
15591,65_and_over,male,under_20,under_20,American,excellent
15592,35_49,male,30_to_50,under_20,Asian,excellent


In [4]:
age_map = {"35_49":3 , "under_19": 1, "20_34":2 , "50_64":4 ,"65_and_over":5 }
df["age"] = df["age"].map(age_map)
gender_map = {"female":0, "male":1}
df["gender"] = df["gender"].map(gender_map)
budget_map = {'under_20':1, '20_to_30':2, 'over_50':3, '30_to_50':4}
df["budget"] = df["budget"].map(budget_map)
price_map = {'under_20':1, '20_to_30':2, 'over_50':3, '30_to_50':4}
df["price"] = df["price"].map(price_map)
cuisine_map = {'Latin American/Mexican':1, 'American':2, 'Asian':3, 'Bars/Pubs':4,'Deli/Sandwiches/Fast Food':5, 'Continental':6, 'African':7, 'Breakfast/Brunch':8, 'Seafood':9, 'Mediterranean':10, 'Vegetarian/vegan':11,'Cafe':12}
df["cuisine_type"] = df["cuisine_type"].map(cuisine_map)
df.head(5)

Unnamed: 0,age,gender,budget,price,cuisine_type,rating
0,3,0,1,2,1,very good
1,3,1,1,4,2,very good
2,4,1,1,4,3,dislike
3,2,0,1,2,4,very good
4,4,1,1,1,5,satisfactory


In [5]:
# check rating ratio
rating_1 = len(df.loc[df['rating'] == 'dislike'])
print("Number of dislike: {0} ({1:2.2f}%)".format(rating_1, (rating_1 / 15592) * 100))
rating_2 = len(df.loc[df['rating'] == 'satisfactory'])
print("Number of satisfactory: {0} ({1:2.2f}%)".format(rating_2, (rating_2 / 15592) * 100))
rating_3 = len(df.loc[df['rating'] == 'very good'])
print("Number of very good: {0} ({1:2.2f}%)".format(rating_3, (rating_3 / 15592) * 100))
rating_4 = len(df.loc[df['rating'] == 'excellent'])
print("Number of excellent: {0} ({1:2.2f}%)".format(rating_4, (rating_4 / 15592) * 100))

Number of dislike: 1306 (8.38%)
Number of satisfactory: 3040 (19.50%)
Number of very good: 4234 (27.15%)
Number of excellent: 7013 (44.98%)


In [6]:
X = df[['age', 'gender', 'budget', 'price', 'cuisine_type']].values
X[0:5]

array([[3, 0, 1, 2, 1],
       [3, 1, 1, 4, 2],
       [4, 1, 1, 4, 3],
       [2, 0, 1, 2, 4],
       [4, 1, 1, 1, 5]], dtype=int64)

In [7]:
from sklearn import preprocessing
y = df["rating"]
y[0:5]

0       very good
1       very good
2         dislike
3       very good
4    satisfactory
Name: rating, dtype: object

In [8]:
#### Setting up the Decision Tree
from sklearn.model_selection import train_test_split
X_trainset, X_testset, y_trainset, y_testset = train_test_split(X, y, test_size=0.3, random_state=3)
X_trainset.shape
# y_trainset.shape

(10915, 5)

In [9]:
#### Modeling
ratingTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
ratingTree

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [10]:
ratingTree.fit(X_trainset,y_trainset)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [11]:
#### Prediction
predTree = ratingTree.predict(X_testset)
print (predTree [0:5])
print (y_testset [0:5])

['excellent' 'satisfactory' 'very good' 'satisfactory' 'excellent']
5798        excellent
13375    satisfactory
12511    satisfactory
2281     satisfactory
15145       excellent
Name: rating, dtype: object


In [12]:
#### Evaluation
from sklearn import metrics
import matplotlib.pyplot as plt
print("DecisionTrees's Accuracy: ", metrics.accuracy_score(y_testset, predTree))

DecisionTrees's Accuracy:  0.7428388200085506


In [13]:
#### Visualization
from sklearn.externals.six import StringIO
import pydotplus
import matplotlib.image as mpimg
from sklearn import tree
%matplotlib inline 



In [14]:
dot_data = StringIO()
filename = "ratingTree.png"
featureNames = df.columns[0:5]
targetNames = df["rating"].unique().tolist()
out=tree.export_graphviz(ratingTree,feature_names=featureNames, out_file=dot_data, class_names= np.unique(y_trainset), filled=True,  special_characters=True,rotate=False)  
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_png(filename)
img = mpimg.imread(filename)
plt.figure(figsize=(100, 200))
plt.imshow(img,interpolation='nearest')

InvocationException: GraphViz's executables not found