In [22]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


df = pd.read_csv('/home/ubuntu/Documents/TripAdvisor.csv',sep=',',header=0)

cols = ['User country',  'Period of stay', 'Traveler type', 'Pool', 'Gym',
       'Tennis court', 'Spa', 'Casino', 'Free internet', 'Hotel name',
       'User continent',
       'Review month', 'Review weekday', 'Hotel stars']

labels = df['Score'].values
df.drop(['Score'], axis=1, inplace=True)

X_temp, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.33, random_state=42)
 

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin

class ToNumbers(BaseEstimator, TransformerMixin):
    def __init__(self, cols):
        self.cols = cols
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        for c in cols:
            encoded, categories = X[c].factorize()
            X[c] = encoded
        return X.values

In [24]:
pipeline = Pipeline([ 
    ('toNumbers', ToNumbers(cols)),
    ('scaler', StandardScaler())
])

X_train = pipeline.fit_transform(X_temp)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


In [25]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [26]:
predictions = classifier.predict(X_train)

In [27]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_train,predictions)
msen = np.sqrt(mse)
print(msen)

0.0


In [28]:
classifier.feature_importances_

array([0.09508109, 0.10709887, 0.05500134, 0.17332583, 0.03446109,
       0.03496408, 0.        , 0.        , 0.01538334, 0.00576372,
       0.00823388, 0.00332522, 0.0663271 , 0.03741995, 0.05568946,
       0.04238491, 0.07038175, 0.10975819, 0.08540018])

In [29]:
import graphviz
from sklearn import tree
dot_data = tree.export_graphviz(classifier, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("tripadvisor") 

'tripadvisor.pdf'

In [33]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(classifier,X_train,y_train,scoring="neg_mean_squared_error",cv=10)



In [34]:
from sklearn.model_selection import cross_val_predict
scores = cross_val_predict(classifier,X_train,y_train,cv=10)



In [35]:
# from Aurelien Geron
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation", scores.std())
    
display_scores(scores)
    

Scores: [4 3 2 5 4 5 5 5 5 5 5 4 4 5 5 5 4 4 5 4 5 5 3 4 4 4 5 5 5 4 2 5 5 4 5 4 3
 3 3 5 1 5 5 4 4 1 5 4 4 5 4 2 3 5 3 5 5 5 3 5 5 5 5 5 5 3 3 4 5 3 5 5 2 5
 5 5 5 5 5 3 2 3 4 3 5 2 5 1 5 5 5 4 4 5 2 5 2 5 4 5 5 4 5 4 4 4 5 5 5 5 5
 5 4 5 5 5 5 5 5 4 5 5 5 5 5 4 5 2 1 2 4 5 5 3 3 3 5 5 5 5 5 3 5 2 5 3 5 5
 3 4 5 5 3 3 3 2 4 4 5 4 5 3 4 3 4 5 5 4 3 5 5 4 5 5 4 4 4 3 3 5 4 1 4 5 4
 3 4 4 5 5 3 3 5 5 4 4 3 5 5 5 3 4 5 4 4 5 3 5 5 4 5 5 5 4 3 4 4 5 4 4 3 5
 5 2 5 5 3 4 4 5 3 3 5 3 5 5 4 3 5 4 4 2 4 4 4 5 2 3 4 4 4 3 4 4 4 5 5 4 3
 4 4 5 4 3 4 2 3 3 3 2 5 5 5 1 5 2 4 5 4 5 5 5 3 5 5 3 4 4 2 5 5 5 5 5 5 5
 1 4 4 2 3 4 4 3 5 4 5 5 5 3 5 4 5 5 4 5 5 5 5 5 4 5 3 2 4 5 5 5 5 4 4 3 4
 5 5 5 2]
Mean: 4.118694362017804
Standard Deviation 1.0353574556104257


In [38]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train, predictions)    

array([[  7,   0,   0,   0,   0],
       [  0,  19,   0,   0,   0],
       [  0,   0,  53,   0,   0],
       [  0,   0,   0, 106,   0],
       [  0,   0,   0,   0, 152]])

In [41]:
y_train

array([4, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 3, 4, 5, 5, 5, 4, 3, 3,
       2, 3, 3, 3, 4, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 4, 5, 4,
       5, 4, 1, 5, 5, 5, 3, 4, 3, 5, 4, 4, 2, 2, 4, 5, 5, 3, 2, 5, 4, 4,
       5, 4, 4, 4, 4, 3, 5, 5, 4, 5, 5, 3, 5, 2, 5, 4, 4, 4, 4, 5, 5, 5,
       3, 5, 5, 4, 5, 5, 4, 2, 5, 4, 4, 4, 5, 4, 5, 5, 5, 4, 5, 5, 4, 5,
       5, 4, 3, 5, 3, 5, 5, 4, 5, 4, 5, 5, 3, 4, 5, 3, 5, 3, 5, 1, 3, 5,
       5, 4, 5, 2, 1, 5, 5, 5, 4, 3, 5, 2, 5, 4, 4, 4, 5, 4, 5, 5, 4, 5,
       5, 3, 5, 4, 5, 4, 4, 4, 5, 5, 5, 4, 2, 3, 5, 5, 4, 4, 5, 3, 5, 4,
       3, 5, 5, 4, 3, 2, 5, 5, 5, 5, 4, 4, 5, 5, 4, 5, 4, 5, 5, 4, 3, 5,
       5, 3, 3, 3, 5, 5, 3, 5, 3, 4, 4, 3, 5, 5, 5, 5, 2, 4, 1, 4, 4, 5,
       3, 3, 4, 3, 1, 3, 4, 3, 4, 5, 5, 5, 4, 5, 2, 5, 5, 4, 5, 5, 4, 4,
       5, 3, 4, 5, 5, 4, 5, 4, 5, 4, 5, 3, 4, 4, 3, 4, 4, 4, 5, 3, 5, 4,
       5, 4, 4, 3, 3, 5, 5, 3, 3, 1, 4, 3, 5, 3, 5, 5, 2, 5, 5, 5, 2, 5,
       4, 5, 5, 2, 2, 2, 4, 4, 3, 4, 5, 5, 4, 3, 2,

In [42]:
predictions

array([4, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 5, 4, 3, 4, 5, 5, 5, 4, 3, 3,
       2, 3, 3, 3, 4, 5, 5, 5, 4, 5, 5, 5, 5, 4, 5, 4, 5, 5, 5, 4, 5, 4,
       5, 4, 1, 5, 5, 5, 3, 4, 3, 5, 4, 4, 2, 2, 4, 5, 5, 3, 2, 5, 4, 4,
       5, 4, 4, 4, 4, 3, 5, 5, 4, 5, 5, 3, 5, 2, 5, 4, 4, 4, 4, 5, 5, 5,
       3, 5, 5, 4, 5, 5, 4, 2, 5, 4, 4, 4, 5, 4, 5, 5, 5, 4, 5, 5, 4, 5,
       5, 4, 3, 5, 3, 5, 5, 4, 5, 4, 5, 5, 3, 4, 5, 3, 5, 3, 5, 1, 3, 5,
       5, 4, 5, 2, 1, 5, 5, 5, 4, 3, 5, 2, 5, 4, 4, 4, 5, 4, 5, 5, 4, 5,
       5, 3, 5, 4, 5, 4, 4, 4, 5, 5, 5, 4, 2, 3, 5, 5, 4, 4, 5, 3, 5, 4,
       3, 5, 5, 4, 3, 2, 5, 5, 5, 5, 4, 4, 5, 5, 4, 5, 4, 5, 5, 4, 3, 5,
       5, 3, 3, 3, 5, 5, 3, 5, 3, 4, 4, 3, 5, 5, 5, 5, 2, 4, 1, 4, 4, 5,
       3, 3, 4, 3, 1, 3, 4, 3, 4, 5, 5, 5, 4, 5, 2, 5, 5, 4, 5, 5, 4, 4,
       5, 3, 4, 5, 5, 4, 5, 4, 5, 4, 5, 3, 4, 4, 3, 4, 4, 4, 5, 3, 5, 4,
       5, 4, 4, 3, 3, 5, 5, 3, 3, 1, 4, 3, 5, 3, 5, 5, 2, 5, 5, 5, 2, 5,
       4, 5, 5, 2, 2, 2, 4, 4, 3, 4, 5, 5, 4, 3, 2,