In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.metrics import r2_score, classification_report, confusion_matrix, accuracy_score, f1_score, precision_score,recall_score
from sklearn.model_selection import train_test_split
from patsy import dmatrices
%pylab inline


Populating the interactive namespace from numpy and matplotlib


In [2]:
data = pd.read_csv('Aemf1 2.csv')
data.columns


Index(['City', 'Price', 'Day', 'Room Type', 'Shared Room', 'Private Room',
       'Person Capacity', 'Superhost', 'Multiple Rooms', 'Business',
       'Cleanliness Rating', 'Guest Satisfaction', 'Bedrooms',
       'City Center (km)', 'Metro Distance (km)', 'Attraction Index',
       'Normalised Attraction Index', 'Restraunt Index',
       'Normalised Restraunt Index'],
      dtype='object')

In [3]:
data.rename(columns={'Shared Room':'Shared_Room',
                     'Guest Satisfaction':'Guest_Satisfaction',
                            'Multiple Rooms':'Multiple_Rooms',
                            'Cleanliness Rating':'Cleanliness_Rating',
                            'City Center (km)':'City_Center_km',
                            'Metro Distance (km)':'Metro_Distance_km',
                            'Attraction Index':'Attraction_Index',
                            'Normalised Attraction Index':'Normalised_Attraction_Index',
                            'Restraunt Index':'Restraunt_Index',
                            'Normalised Restraunt Index':'Normalised_Restraunt_Index'},inplace=True)
data.columns

Index(['City', 'Price', 'Day', 'Room Type', 'Shared_Room', 'Private Room',
       'Person Capacity', 'Superhost', 'Multiple_Rooms', 'Business',
       'Cleanliness_Rating', 'Guest_Satisfaction', 'Bedrooms',
       'City_Center_km', 'Metro_Distance_km', 'Attraction_Index',
       'Normalised_Attraction_Index', 'Restraunt_Index',
       'Normalised_Restraunt_Index'],
      dtype='object')

In [4]:
city_g = data.groupby('City')['Guest_Satisfaction'].median().reset_index().rename(columns={'Guest_Satisfaction':'City_st'})
data = data.merge(city_g, on='City',how='left')

data['Entire_room_st'] = np.where(data['Room Type']=='Entire home/apt',1,0)
data['Weekend_st']= pd.get_dummies(data['Day'], drop_first=True)
data['Superhost_st'] = pd.get_dummies(data['Superhost'], drop_first=True)
data['Shared_Room'] = pd.get_dummies(data['Shared_Room'], drop_first=True)

data.head()



Unnamed: 0,City,Price,Day,Room Type,Shared_Room,Private Room,Person Capacity,Superhost,Multiple_Rooms,Business,...,City_Center_km,Metro_Distance_km,Attraction_Index,Normalised_Attraction_Index,Restraunt_Index,Normalised_Restraunt_Index,City_st,Entire_room_st,Weekend_st,Superhost_st
0,Amsterdam,194.033698,Weekday,Private room,0,True,2.0,False,1,0,...,5.022964,2.53938,78.690379,4.166708,98.253896,6.846473,96.0,0,0,0
1,Amsterdam,344.245776,Weekday,Private room,0,True,4.0,False,0,0,...,0.488389,0.239404,631.176378,33.421209,837.280757,58.342928,96.0,0,0,0
2,Amsterdam,264.101422,Weekday,Private room,0,True,2.0,False,0,1,...,5.748312,3.651621,75.275877,3.985908,95.386955,6.6467,96.0,0,0,0
3,Amsterdam,433.529398,Weekday,Private room,0,True,4.0,False,0,1,...,0.384862,0.439876,493.272534,26.119108,875.033098,60.973565,96.0,0,0,0
4,Amsterdam,485.552926,Weekday,Private room,0,True,2.0,True,0,0,...,0.544738,0.318693,552.830324,29.272733,815.30574,56.811677,96.0,0,0,1


### Regression

In [5]:
x_columns = ['City_st','Weekend_st','Entire_room_st','Shared_Room','Superhost_st','Multiple_Rooms',
             'Business','Cleanliness_Rating', 'Price', 'Bedrooms',
               'City_Center_km', 'Metro_Distance_km', 'Attraction_Index',
               'Normalised_Attraction_Index', 'Restraunt_Index', 'Normalised_Restraunt_Index']


formula = 'Guest_Satisfaction ~ 0 + ' + ' + '.join([ x for x in x_columns])
print(formula)

Y, X = dmatrices(formula,data,return_type="dataframe")

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)




Guest_Satisfaction ~ 0 + City_st + Weekend_st + Entire_room_st + Shared_Room + Superhost_st + Multiple_Rooms + Business + Cleanliness_Rating + Price + Bedrooms + City_Center_km + Metro_Distance_km + Attraction_Index + Normalised_Attraction_Index + Restraunt_Index + Normalised_Restraunt_Index


In [6]:
model = XGBRegressor()
model.fit(X_train,y_train)
model.feature_importances_

array([0.0227887 , 0.00306076, 0.01483831, 0.0057316 , 0.03439998,
       0.01561795, 0.04483685, 0.6864017 , 0.02026688, 0.01608679,
       0.02030095, 0.02259904, 0.02056484, 0.02827484, 0.02316709,
       0.02106375], dtype=float32)

In [7]:
y_pred = model.predict(X_test)
r2_score(y_test, y_pred)

0.6207695386109227

In [8]:
data_test = pd.DataFrame()
data_test['Guess_satisfaction_real'] = y_test
data_test['Guess_satisfaction_pred'] = y_pred
data_test['Guess_satisfaction_real_cat'] = pd.cut(data_test['Guess_satisfaction_real'],[19, 36, 52, 68, 84,101],labels=['0.EX_LOW','1.LOW','2.MEDIUM','3.HIGH','4.EX_HIGH']).astype(str)
data_test['Guess_satisfaction_pred_cat'] = pd.cut(data_test['Guess_satisfaction_pred'],[19, 36, 52, 68, 84,101],labels=['0.EX_LOW','1.LOW','2.MEDIUM','3.HIGH','4.EX_HIGH']).astype(str)
data_test.head()


Unnamed: 0,Guess_satisfaction_real,Guess_satisfaction_pred,Guess_satisfaction_real_cat,Guess_satisfaction_pred_cat
26813,60.0,73.033684,2.MEDIUM,3.HIGH
31082,98.0,98.550613,4.EX_HIGH,4.EX_HIGH
32921,90.0,89.216515,4.EX_HIGH,4.EX_HIGH
28620,95.0,96.204338,4.EX_HIGH,4.EX_HIGH
9211,100.0,95.022644,4.EX_HIGH,4.EX_HIGH


In [9]:
data_test.groupby('Guess_satisfaction_real_cat').count()


Unnamed: 0_level_0,Guess_satisfaction_real,Guess_satisfaction_pred,Guess_satisfaction_pred_cat
Guess_satisfaction_real_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.EX_LOW,20,20,20
1.LOW,29,29,29
2.MEDIUM,107,107,107
3.HIGH,729,729,729
4.EX_HIGH,7458,7458,7458


In [10]:
report = classification_report(data_test['Guess_satisfaction_real_cat'], data_test['Guess_satisfaction_pred_cat'],output_dict=True)
pd.DataFrame(report)

Unnamed: 0,0.EX_LOW,1.LOW,2.MEDIUM,3.HIGH,4.EX_HIGH,nan,accuracy,macro avg,weighted avg
precision,0.846154,0.642857,0.52381,0.579572,0.935024,0.0,0.91394,0.587903,0.897462
recall,0.55,0.310345,0.205607,0.334705,0.984044,0.0,0.91394,0.39745,0.91394
f1-score,0.666667,0.418605,0.295302,0.424348,0.958908,0.0,0.91394,0.460638,0.901109
support,20.0,29.0,107.0,729.0,7458.0,0.0,0.91394,8343.0,8343.0


In [11]:
confusion_matrix(data_test['Guess_satisfaction_real_cat'], data_test['Guess_satisfaction_pred_cat'], labels=['0.EX_LOW','1.LOW','2.MEDIUM','3.HIGH','4.EX_HIGH'])


array([[  11,    4,    1,    0,    2],
       [   2,    9,   10,    2,    6],
       [   0,    1,   22,   59,   25],
       [   0,    0,    8,  244,  477],
       [   0,    0,    1,  116, 7339]])

### Classifier

In [18]:
y_train_cat = pd.DataFrame()
y_train_cat['Guest_Satisfaction']= pd.cut(y_train['Guest_Satisfaction'],[19, 36, 52, 68, 84,101],labels=[0,1,2,3,4]).astype(int)
y_test_cat = pd.DataFrame()
y_test_cat['Guest_Satisfaction']= pd.cut(y_test['Guest_Satisfaction'],[19, 36, 52, 68, 84,101],labels=[0,1,2,3,4]).astype(int)


In [19]:
xgb = XGBClassifier(objective ='multi:softprob', num_class = 5, use_label_encoder = False)
xgb.fit(X_train, y_train_cat)
y_pred = xgb.predict(X_test)

print(classification_report(y_test_cat, y_pred))





              precision    recall  f1-score   support

           0       0.89      0.85      0.87        20
           1       0.94      0.55      0.70        29
           2       0.80      0.41      0.54       107
           3       0.67      0.38      0.48       729
           4       0.94      0.99      0.96      7458

    accuracy                           0.92      8343
   macro avg       0.85      0.64      0.71      8343
weighted avg       0.91      0.92      0.91      8343



In [32]:
y_pred =  xgb.predict(X_test)

resumen = pd.DataFrame()

report_prev = classification_report(y_test_cat,y_pred,output_dict=True)
report = pd.DataFrame(report_prev)


report['0.0'] = report['0']*sum(y_test_cat==0)[0]
report['1.0'] = report['1']*sum(y_test_cat==1)[0]
report['2.0'] = report['2']*sum(y_test_cat==2)[0]
report['3.0'] = report['3']*sum(y_test_cat==3)[0]
report['4.0'] = report['4']*sum(y_test_cat==4)[0]
nresumen = len(resumen)

resumen.loc[nresumen,'rec_bad'] = report.loc['recall'][['0.0', '1.0', '2.0']].sum()/sum(y_test_cat<=2)[0]
resumen.loc[nresumen,'rec_good'] = report.loc['recall'][['3.0', '4.0']].sum()/sum(y_test_cat>2)[0]
resumen.loc[nresumen,'rec_tot'] = report.loc['recall'][['0.0', '1.0', '2.0','3.0', '4.0']].sum()/len(y_test_cat)
resumen.loc[nresumen,'acc_bad'] = report.loc['precision'][['0.0', '1.0', '2.0']].sum()/sum(y_test_cat<=2)[0]
resumen.loc[nresumen,'acc_good'] = report.loc['precision'][['3.0', '4.0']].sum()/sum(y_test_cat>2)[0]
resumen.loc[nresumen,'acc_tot'] = report.loc['precision'][['0.0', '1.0', '2.0','3.0', '4.0']].sum()/len(y_test_cat)
resumen

Unnamed: 0,rec_bad,rec_good,rec_tot,acc_bad,acc_good,acc_tot
0,0.49359,0.93221,0.924008,0.83839,0.914299,0.91288
