In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
warnings.filterwarnings('ignore') 

In [2]:
train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\train_review_data.csv")
customer = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\customer_data.csv")
hostel = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\hostel_data.csv")
review = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\Review_data.csv",header = None)
test = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\test_review_data.csv")
user = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\user_data.csv",header = None)

In [3]:
train.head(2)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English


In [4]:
customer.head(2)

Unnamed: 0,nationality,age,gender,customer_id
0,France,,Male,71775112
1,,,Female,17236506


In [6]:
hostel.columns = ['HostelNumber', 'num_reviews','description','score','features','policies']
hostel.head(2)


Unnamed: 0,HostelNumber,num_reviews,description,score,features,policies
0,17,2289,Home Youth Hostel Valencia by Feetup Hostels o...,92,24 Hour Reception|Adaptors|Air Conditioning|Bi...,Credit Cards Accepted|No Curfew|Non Smoking
1,19,153,Hostal Marlasca is just in the heart of Madrid...,90,24 Hour Reception|Air Conditioning|Airport Tra...,Credit Cards Accepted|No Curfew


In [7]:
review.head(2)

Unnamed: 0,0,1,2,3,4,5
0,10983165,4344334,there is not 24 reception staff was not always...,28642.0,2011-12-26 19:46:14,English
1,11138493,4350334,i booked a private double room but i totally d...,36596.0,2011-12-31 05:51:19,English


In [8]:
test.head(2)

Unnamed: 0,customer_id,review_id,review_text,HostelNumber,review_date,review_language
0,331154,5642315,nice hostel well priced and very well located ...,2332,2013-07-14 13:16:38,English
1,1145354,8537815,i had a wonderful stay at this hostel the staf...,79412,2016-01-16 03:54:46,English


In [9]:
user.head(2)

Unnamed: 0,0,1,2,3
0,18131165,,,Male
1,15169619,,,Male


In [10]:
df1 = pd.merge(train, customer, on='customer_id')


In [11]:
df2 = pd.merge(df1, hostel, on='HostelNumber')

In [12]:
df3 = df2[['customer_id','review_id','review_score','HostelNumber','review_date','review_language','nationality','age','gender','num_reviews','score']]

In [13]:
test1 = test[['customer_id','review_id','HostelNumber','review_date','review_language']]

In [13]:
print ("\n\n---------------------")
print ("TRAIN SET INFORMATION")
print ("---------------------")
print ("Shape of training set:", df3.shape, "\n")
print ("Column Headers:", list(df3.columns.values), "\n")
print (df3.dtypes)



---------------------
TRAIN SET INFORMATION
---------------------
Shape of training set: (780462, 11) 

Column Headers: ['customer_id', 'review_id', 'review_score', 'HostelNumber', 'review_date', 'review_language', 'nationality', 'age', 'gender', 'num_reviews', 'score'] 

customer_id          int64
review_id            int64
review_score         int64
HostelNumber       float64
review_date         object
review_language     object
nationality         object
age                float64
gender              object
num_reviews          int64
score                int64
dtype: object


In [14]:
import re
missing_values = []
nonumeric_values = []

print ("TRAINING SET INFORMATION")
print ("========================\n")

for column in df3:
    # Find all the unique feature values
    uniq = df3[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if (True in pd.isnull(uniq)):
        s = "{} has {} missing" .format(column, pd.isnull(df3[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values
    for i in range (1, np.prod(uniq.shape)):
        if (re.match('nan', str(uniq[i]))):
            break
        if not (re.search('(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i]))):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAINING SET INFORMATION

'customer_id' has 189618 unique values
~~Listing up to 10 unique values~~
[   309693 280960893 863260093 870460375 995749975 254134337 580047937
 933369737 271142761 505685828]

-----------------------------------------------------------------------

'review_id' has 780462 unique values
~~Listing up to 10 unique values~~
[6085536 5054486 7186405 5517212 4903533 5158940 7395438 5529226 6424458
 6677691]

-----------------------------------------------------------------------

'review_score' has 36 unique values
~~Listing up to 10 unique values~~
[ 86  71  80  91  66  83  74 100  77  94]

-----------------------------------------------------------------------

'HostelNumber' has 18788 unique values
~~Listing up to 10 unique values~~
[  4815.  34160.  65881.  36020.  12168.  14275.  38731.  51787.  45631.
  47916.]

-----------------------------------------------------------------------

'review_date' has 777640 unique values
~~Listing up to 10 unique values~~
['

In [15]:
print ("\n\n---------------------")
print ("TEST SET INFORMATION")
print ("---------------------")
print ("Shape of training set:", test1.shape, "\n")
print ("Column Headers:", list(test1.columns.values), "\n")
print (test1.dtypes)



---------------------
TEST SET INFORMATION
---------------------
Shape of training set: (192035, 5) 

Column Headers: ['customer_id', 'review_id', 'HostelNumber', 'review_date', 'review_language'] 

customer_id         int64
review_id           int64
HostelNumber        int64
review_date        object
review_language    object
dtype: object


In [16]:
import re
missing_values = []
nonumeric_values = []

print ("TEST SET INFORMATION")
print ("========================\n")

for column in test1:
    # Find all the unique feature values
    uniq = test1[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if (True in pd.isnull(uniq)):
        s = "{} has {} missing" .format(column, pd.isnull(test1[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values
    for i in range (1, np.prod(uniq.shape)):
        if (re.match('nan', str(uniq[i]))):
            break
        if not (re.search('(^\d+\.?\d*$)|(^\d*\.?\d+$)', str(uniq[i]))):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TEST SET INFORMATION

'customer_id' has 192034 unique values
~~Listing up to 10 unique values~~
[ 331154 1145354 1625354 1803354 2222954 3635954 4142554 5773954 5967154
 7678154]

-----------------------------------------------------------------------

'review_id' has 192035 unique values
~~Listing up to 10 unique values~~
[5642315 8537815 7330678 8577749 7400453 6191010 7716565 7841934 7770979
 5140130]

-----------------------------------------------------------------------

'HostelNumber' has 19259 unique values
~~Listing up to 10 unique values~~
[  2332  79412    641 263648   6494  18575  25012  14154  28983   2165]

-----------------------------------------------------------------------

'review_date' has 191831 unique values
~~Listing up to 10 unique values~~
['2013-07-14 13:16:38' '2016-01-16 03:54:46' '2015-07-07 02:43:36'
 '2016-01-31 17:18:44' '2015-07-27 09:53:37' '2014-03-22 05:45:06'
 '2015-11-05 05:57:30' '2016-01-02 15:12:26' '2015-11-30 10:08:46'
 '2012-11-27 14:23:49']

In [14]:
df4 = df3.append(test1,ignore_index=True)

In [15]:
df4.loc[:, "age"] = df4.loc[:, "age"].fillna(df4['age'].median())
df4.loc[:, "gender"] = df4.loc[:, "gender"].fillna("None")
df4.loc[:, "nationality"] = df4.loc[:, "nationality"].fillna("None")

In [16]:
df4['HostelNumber'] = df4['HostelNumber'].astype(int)

In [17]:
df4.shape

(972497, 11)

In [18]:
df5 = df4[['HostelNumber', 'age', 'customer_id', 'gender', 'nationality',
       'num_reviews', 'review_id', 'review_language',
       'review_score', 'score']]

In [22]:
from sklearn.preprocessing import LabelEncoder
class_le = LabelEncoder()
df5['gender'] = class_le.fit_transform(df5['gender'].values)
df5['nationality'] = class_le.fit_transform(df5['nationality'].values)
df5['review_language'] = class_le.fit_transform(df5['review_language'].values)

In [25]:
target = df5['review_score']
df6 = df5.drop('review_score', axis=1,inplace= True)

df6

KeyError: 'review_score'

In [24]:
X = df6[:df3.shape[0]]
test2 = df6[df3.shape[0]:]

y = target[:df3.shape[0]]


TypeError: 'NoneType' object is not subscriptable

In [71]:
print (str(len(X))+" rows for training set")
print (str(len(test2))+" rows for test set")
print (str(len(y))+" rows for target set")

780462 rows for training set
192035 rows for test set
780462 rows for target set


In [76]:
from sklearn import cross_validation
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
model = LinearRegression()
scoring = 'r2'
results = cross_validation.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
print(results.mean(), results.std())

0.0408948600967 0.0640617015734


In [None]:
from sklearn.ensemble import RandomForestRegressor
clf_random = RandomForestRegressor(n_estimators=10, n_jobs=-1)

clf_random.fit(X, y)
y_pred = clf_random.predict(test2)


In [None]:
solution = pd.DataFrame({"id":df_test.Id, "SalePrice":y_pred}, columns=['id', 'SalePrice'])
solution.to_csv("random_regressor.csv", index = False)