In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
warnings.filterwarnings('ignore') 

In [2]:
train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\train_review_data.csv")
customer = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\customer_data.csv")
hostel = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\hostel_data.csv")
review = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\Review_data.csv",header = None)
test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\test_review_data.csv")
user = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\user_data.csv",header = None)

In [3]:
train.head(2)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English


In [4]:
customer.head(2)

Unnamed: 0,nationality,age,gender,customer_id
0,France,,Male,71775112
1,,,Female,17236506


In [5]:
hostel.columns = ['HostelNumber', 'num_reviews','description','score','features','policies']
hostel.head(2)


Unnamed: 0,HostelNumber,num_reviews,description,score,features,policies
0,17,2289,Home Youth Hostel Valencia by Feetup Hostels o...,92,24 Hour Reception|Adaptors|Air Conditioning|Bi...,Credit Cards Accepted|No Curfew|Non Smoking
1,19,153,Hostal Marlasca is just in the heart of Madrid...,90,24 Hour Reception|Air Conditioning|Airport Tra...,Credit Cards Accepted|No Curfew


In [6]:
review.head(2)

Unnamed: 0,0,1,2,3,4,5
0,10983165,4344334,there is not 24 reception staff was not always...,28642.0,2011-12-26 19:46:14,English
1,11138493,4350334,i booked a private double room but i totally d...,36596.0,2011-12-31 05:51:19,English


In [7]:
test.head(2)

Unnamed: 0,customer_id,review_id,review_text,HostelNumber,review_date,review_language
0,331154,5642315,nice hostel well priced and very well located ...,2332,2013-07-14 13:16:38,English
1,1145354,8537815,i had a wonderful stay at this hostel the staf...,79412,2016-01-16 03:54:46,English


In [8]:
user.head(2)

Unnamed: 0,0,1,2,3
0,18131165,,,Male
1,15169619,,,Male


In [10]:
df1 = pd.merge(train, customer, on='customer_id')
df1.head(2)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language,nationality,age,gender
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English,,,Female
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English,,,Female


In [12]:
df2 = pd.merge(df1, hostel, on='HostelNumber')
df2.head(2)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language,nationality,age,gender,num_reviews,description,score,features,policies
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English,,,Female,2065,Travelling alone? Do you like to meet other pe...,75,24 Hour Security|Bar|Book Exchange|Free Breakf...,Age Restriction|Credit Cards Accepted|No Curfe...
1,280960893,5054486,71,good location for train and city centre could ...,4815.0,2012-10-19 13:19:47,English,,,Female,2065,Travelling alone? Do you like to meet other pe...,75,24 Hour Security|Bar|Book Exchange|Free Breakf...,Age Restriction|Credit Cards Accepted|No Curfe...


In [13]:
df3 = df2[['customer_id','review_id','review_score','HostelNumber','review_date','review_language','nationality','age','gender','num_reviews','score']]

In [14]:
df3.head(2)

Unnamed: 0,customer_id,review_id,review_score,HostelNumber,review_date,review_language,nationality,age,gender,num_reviews,score
0,309693,6085536,86,4815.0,2014-01-10 15:34:36,English,,,Female,2065,75
1,280960893,5054486,71,4815.0,2012-10-19 13:19:47,English,,,Female,2065,75


In [15]:
test1 = test[['customer_id','review_id','HostelNumber','review_date','review_language']]

In [13]:
print ("\n\n---------------------")
print ("TRAIN SET INFORMATION")
print ("---------------------")
print ("Shape of training set:", df3.shape, "\n")
print ("Column Headers:", list(df3.columns.values), "\n")
print (df3.dtypes)



---------------------
TRAIN SET INFORMATION
---------------------
Shape of training set: (780462, 11) 

Column Headers: ['customer_id', 'review_id', 'review_score', 'HostelNumber', 'review_date', 'review_language', 'nationality', 'age', 'gender', 'num_reviews', 'score'] 

customer_id          int64
review_id            int64
review_score         int64
HostelNumber       float64
review_date         object
review_language     object
nationality         object
age                float64
gender              object
num_reviews          int64
score                int64
dtype: object


In [14]:
import re
missing_values = []
nonumeric_values = []

print ("TRAINING SET INFORMATION")
print ("========================\n")

for column in df3:
    # Find all the unique feature values
    uniq = df3[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if (True in pd.isnull(uniq)):
        s = "{} has {} missing" .format(column, pd.isnull(df3[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values
    for i in range (1, np.prod(uniq.shape)):
        if (re.match('nan', str(uniq[i]))):
            break
        if not (re.search('(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i]))):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAINING SET INFORMATION

'customer_id' has 189618 unique values
~~Listing up to 10 unique values~~
[   309693 280960893 863260093 870460375 995749975 254134337 580047937
 933369737 271142761 505685828]

-----------------------------------------------------------------------

'review_id' has 780462 unique values
~~Listing up to 10 unique values~~
[6085536 5054486 7186405 5517212 4903533 5158940 7395438 5529226 6424458
 6677691]

-----------------------------------------------------------------------

'review_score' has 36 unique values
~~Listing up to 10 unique values~~
[ 86  71  80  91  66  83  74 100  77  94]

-----------------------------------------------------------------------

'HostelNumber' has 18788 unique values
~~Listing up to 10 unique values~~
[  4815.  34160.  65881.  36020.  12168.  14275.  38731.  51787.  45631.
  47916.]

-----------------------------------------------------------------------

'review_date' has 777640 unique values
~~Listing up to 10 unique values~~
['

In [15]:
print ("\n\n---------------------")
print ("TEST SET INFORMATION")
print ("---------------------")
print ("Shape of training set:", test1.shape, "\n")
print ("Column Headers:", list(test1.columns.values), "\n")
print (test1.dtypes)



---------------------
TEST SET INFORMATION
---------------------
Shape of training set: (192035, 5) 

Column Headers: ['customer_id', 'review_id', 'HostelNumber', 'review_date', 'review_language'] 

customer_id         int64
review_id           int64
HostelNumber        int64
review_date        object
review_language    object
dtype: object


In [63]:
import re
missing_values = []
nonumeric_values = []

print ("TEST SET INFORMATION")
print ("========================\n")

for column in test_hostel:
    # Find all the unique feature values
    uniq = test_hostel[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if (True in pd.isnull(uniq)):
        s = "{} has {} missing" .format(column, pd.isnull(test_hostel[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values
    for i in range (1, np.prod(uniq.shape)):
        if (re.match('nan', str(uniq[i]))):
            break
        if not (re.search('(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i]))):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TEST SET INFORMATION

'age' has 1 unique values
[ 29.]

-----------------------------------------------------------------------

'gender' has 1 unique values
[2]

-----------------------------------------------------------------------

'nationality' has 1 unique values
[168]

-----------------------------------------------------------------------

'num_reviews' has 1 unique values
[ 653.]

-----------------------------------------------------------------------

'review_language' has 21 unique values
~~Listing up to 10 unique values~~
[ 5 11  0 18  4  7 12 10  8 17]

-----------------------------------------------------------------------

'review_score' has 1 unique values
[ nan]

-----------------------------------------------------------------------

'score' has 1 unique values
[ 86.]

-----------------------------------------------------------------------


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Features with missing values:
['review_sc

In [16]:
df4 = df3.append(test1,ignore_index=True)

#### Imputing missing values

In [53]:
df4.loc[:, "age"] = df4.loc[:, "age"].fillna(df4['age'].median())
df4.loc[:, "gender"] = df4.loc[:, "gender"].fillna("None")
df4.loc[:, "nationality"] = df4.loc[:, "nationality"].fillna("None")

In [54]:
df4.loc[:, "num_reviews"] = df4.loc[:, "num_reviews"].fillna(df4['num_reviews'].median())
df4.loc[:, "score"] = df4.loc[:, "score"].fillna(df4['score'].median())

In [55]:
df4['HostelNumber'] = df4['HostelNumber'].astype(int)

In [56]:
df4.shape

(972497, 11)

In [30]:
#removed score
df5 = df4[['HostelNumber', 'age', 'customer_id', 'gender', 'nationality',
       'num_reviews', 'review_id', 'review_language',
       'review_score','score']]

In [31]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df5['gender'] = class_le.fit_transform(df5['gender'].values)
df5['nationality'] = class_le.fit_transform(df5['nationality'].values)
df5['review_language'] = class_le.fit_transform(df5['review_language'].values)

In [32]:
target = df5['review_score']
df6 = df5.drop(['review_score','customer_id','review_id'], axis=1)



In [33]:
X = df6[:df3.shape[0]]
test2 = df6[df3.shape[0]:]

y = target[:df3.shape[0]]


In [34]:
from sklearn.preprocessing import StandardScaler

# Standardize numerical features
stdSc = StandardScaler()
X = stdSc.fit_transform(X)
test2 = stdSc.fit_transform(test2)


In [35]:
print (str(len(X))+" rows for training set")
print (str(len(test2))+" rows for test set")
print (str(len(y))+" rows for target set")

780462 rows for training set
192035 rows for test set
780462 rows for target set


In [36]:
# sk learn import 
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,AdaBoostRegressor
from sklearn.ensemble.gradient_boosting import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb

In [39]:
model = xgb.XGBRegressor()
model.fit(X, y)
predictions = model.predict(test2).astype(int)

print ('R-squared: %.4f' % model.score(X, y))

R-squared: 0.0394


array([81, 88, 81, ..., 91, 84, 89])

In [41]:
solution = pd.DataFrame({"customer_id":test1.customer_id,"review_id":test1.review_id,"HostelNumber":test1.HostelNumber, "predicted_score":predictions}, columns=['customer_id','review_id','HostelNumber','predicted_score'])
solution.to_csv("submission_hostelWorld_predicted scores_PiushVaish_4.csv", index = False)

#### Hostel Recommendation

In [100]:
df7 = df4[['customer_id','HostelNumber','score']]


In [101]:
df7.head()

Unnamed: 0,customer_id,HostelNumber,score
0,309693,4815,75.0
1,280960893,4815,75.0
2,863260093,4815,75.0
3,870460375,4815,75.0
4,995749975,4815,75.0


In [102]:
X_hostel = df7[:df3.shape[0]]
test_hostel = df7[df3.shape[0]:]

In [103]:
print (str(len(X_hostel))+" rows for training set")
print (str(len(test_hostel))+" rows for test set")

780462 rows for training set
192035 rows for test set


In [104]:
test_hostel.rename(columns={'score': 'predicted_score', 'HostelNumber': 'recommended_HostelNumber'}, inplace=True)

In [106]:
test_hostel.to_csv("submission_hostelWorld_hostel_recommendation_PiushVaish_1.csv", index = False)