In [1]:
# Import the pandas package, then use the "read_csv" function to read
# the labeled training data
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
warnings.filterwarnings('ignore') 
train = pd.read_csv(r"C:\Users\piush\Desktop\Dataset\HostelWorld\train_review_data.csv")

In [2]:
# Print a banner, then the frame's dimensions, its column names and the
# dtype pandas inferred for each column.
banner_lines = (
    "\n\n---------------------",
    "TRAIN SET INFORMATION",
    "---------------------",
)
for banner_line in banner_lines:
    print(banner_line)

frame_shape = train.shape
column_names = list(train.columns.values)
print("Shape of training set:", frame_shape, "\n")
print("Column Headers:", column_names, "\n")
print(train.dtypes)



---------------------
TRAIN SET INFORMATION
---------------------
Shape of training set: (901701, 7) 

Column Headers: ['customer_id', 'review_id', 'review_score', 'review_text', 'HostelNumber', 'review_date', 'review_language'] 

customer_id          int64
review_id            int64
review_score         int64
review_text         object
HostelNumber       float64
review_date         object
review_language     object
dtype: object


In [3]:
import re

# Per-column inspection of `train`: print up to 10 unique values per column,
# then report which columns contain missing values and which contain at least
# one value that is not a plain number.

# Matches unsigned decimals such as "12", "12.", "12.5" or ".5". Raw string so
# that "\d" and "\." reach the regex engine instead of being treated as
# (invalid) string escapes. Signed or exponent-form values do not match and
# are therefore reported as non-numeric.
numeric_pattern = re.compile(r'(^\d+\.?\d*$)|(^\d*\.?\d+$)')

missing_values = []
nonumeric_values = []

print ("TRAINING SET INFORMATION")
print ("========================\n")

for column in train:
    # Find all the unique feature values
    uniq = train[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if pd.isnull(uniq).any():
        s = "{} has {} missing" .format(column, pd.isnull(train[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values.
    # Every unique value is examined, including the first one. Missing values
    # are skipped rather than ending the scan, so values that come after a NaN
    # are still checked; the scan stops at the first non-numeric value found.
    for value in uniq:
        if pd.isnull(value):
            continue
        if not numeric_pattern.search(str(value)):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAINING SET INFORMATION

'customer_id' has 192034 unique values
~~Listing up to 10 unique values~~
[  309693   459093   544693  1417693  1489693  5057093  5626893  6879893
 10077293 12690493]

-----------------------------------------------------------------------

'review_id' has 901701 unique values
~~Listing up to 10 unique values~~
[6085536 6244624 6244607 6895146 5238648 5420976 5420965 5920613 5907925
 6154930]

-----------------------------------------------------------------------

'review_score' has 36 unique values
~~Listing up to 10 unique values~~
[ 86  97  94  91  89 100  77  60  74  66]

-----------------------------------------------------------------------

'review_text' has 887416 unique values
~~Listing up to 10 unique values~~
[ 'the space in the rooms is not enough specially when 12 people are together breakfast was simple and didn t offer fruit or alternatives for those who like balanced food i was in edinburgh backpakers and everything was perfect for less money 

In [4]:
# Keep only the rows that have a HostelNumber. Only this column is filtered
# here; review_language and review_date are not checked in this cell.
has_hostel_number = train['HostelNumber'].notnull()
train = train[has_hostel_number]

In [6]:
# Keep only the rows whose review_text is present (drops NaN reviews).
has_review_text = train['review_text'].notnull()
train = train[has_review_text]

In [8]:
import re

# Per-column inspection of `train` as it stands at this point in the
# notebook: print up to 10 unique values per column, then report which
# columns contain missing values and which contain at least one value that is
# not a plain number.

# Matches unsigned decimals such as "12", "12.", "12.5" or ".5". Raw string so
# that "\d" and "\." reach the regex engine instead of being treated as
# (invalid) string escapes. Signed or exponent-form values do not match and
# are therefore reported as non-numeric.
numeric_pattern = re.compile(r'(^\d+\.?\d*$)|(^\d*\.?\d+$)')

missing_values = []
nonumeric_values = []

print ("TRAINING SET INFORMATION")
print ("========================\n")

for column in train:
    # Find all the unique feature values
    uniq = train[column].unique()
    print ("'{}' has {} unique values" .format(column,uniq.size))
    if (uniq.size > 10):
        print("~~Listing up to 10 unique values~~")
    print (uniq[0:10])
    print ("\n-----------------------------------------------------------------------\n")
    
    # Find features with missing values
    if pd.isnull(uniq).any():
        s = "{} has {} missing" .format(column, pd.isnull(train[column]).sum())
        missing_values.append(s)
    
    # Find features with non-numeric values.
    # Every unique value is examined, including the first one. Missing values
    # are skipped rather than ending the scan, so values that come after a NaN
    # are still checked; the scan stops at the first non-numeric value found.
    for value in uniq:
        if pd.isnull(value):
            continue
        if not numeric_pattern.search(str(value)):
            nonumeric_values.append(column)
            break
  
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
print ("Features with missing values:\n{}\n\n" .format(missing_values))
print ("Features with non-numeric values:\n{}" .format(nonumeric_values))
print ("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")

TRAINING SET INFORMATION

'customer_id' has 192016 unique values
~~Listing up to 10 unique values~~
[  309693   459093   544693  1417693  1489693  5057093  5626893  6879893
 10077293 12690493]

-----------------------------------------------------------------------

'review_id' has 900108 unique values
~~Listing up to 10 unique values~~
[6085536 6244624 6244607 6895146 5238648 5420976 5420965 5920613 5907925
 6154930]

-----------------------------------------------------------------------

'review_score' has 32 unique values
~~Listing up to 10 unique values~~
[ 86  97  94  91  89 100  77  60  74  66]

-----------------------------------------------------------------------

'review_text' has 887393 unique values
~~Listing up to 10 unique values~~
[ 'the space in the rooms is not enough specially when 12 people are together breakfast was simple and didn t offer fruit or alternatives for those who like balanced food i was in edinburgh backpakers and everything was perfect for less money 

In [9]:
# Pull the review_text column out of the frame as its own Series.
train_text = train.loc[:, 'review_text']

In [10]:
# NOTE(review): len() of the Series counts reviews (rows), not characters,
# despite the "(char)" wording in the printed label.
review_count = len(train_text)
print("Review Text size (char): {}".format(review_count))

Review Text size (char): 900108


In [21]:
# Preview the first five reviews (head() returns 5 rows by default).
train_text.head()


0    the space in the rooms is not enough specially...
1                      the wi fi doesn t work properly
2    beddings and pillows need to be replaced for n...
3    i had a few problems with the free unlimited w...
4    this is a very pleasant hostel the staff were ...
Name: review_text, dtype: object

In [16]:
#Tokenisation
import nltk

# Tokenise column-wise: Series.apply passes each review string straight to
# nltk.word_tokenize. Only review_text is needed, so there is no reason to
# go through DataFrame.apply(axis=1), which hands the function a whole row
# at a time.
train['train_text'] = train['review_text'].apply(nltk.word_tokenize)

In [17]:
# Show the first five rows, now including the tokenised train_text column.
train.head(n=5)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language,train_text
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English,"[the, space, in, the, rooms, is, not, enough, ..."
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English,"[the, wi, fi, doesn, t, work, properly]"
2,309693,6244607,94,beddings and pillows need to be replaced for n...,65881.0,2014-04-21 20:18:44,English,"[beddings, and, pillows, need, to, be, replace..."
3,459093,6895146,91,i had a few problems with the free unlimited w...,36020.0,2015-01-09 07:26:36,English,"[i, had, a, few, problems, with, the, free, un..."
4,459093,5238648,89,this is a very pleasant hostel the staff were ...,12168.0,2013-01-18 03:52:21,English,"[this, is, a, very, pleasant, hostel, the, sta..."


In [19]:
# Number of tokens in each tokenised review (not the character length of
# the raw text). Computed directly on the train_text column rather than with
# a row-wise DataFrame.apply, since only that one column is read.
train['text_length'] = train['train_text'].map(len)

In [23]:
# Show the first two rows to check the new text_length column.
train.head(n=2)

Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language,train_text,text_length
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English,"[the, space, in, the, rooms, is, not, enough, ...",63
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English,"[the, wi, fi, doesn, t, work, properly]",7


In [22]:
from nltk.corpus import stopwords

# English stop-word list taken from NLTK's stopwords corpus.
STOPWORD_LANGUAGE = 'english'
stop = stopwords.words(STOPWORD_LANGUAGE)

In [24]:
# Drop stop words from every tokenised review, keeping token order.
# The lookup is built once as a set: testing membership against the original
# list scans the stop words for every single token, while a set lookup does
# not. The filtered result is the same.
stop_words = set(stop)
train['train_text_stop'] = train['train_text'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [36]:
#Stemming reduces each word to its base/root form, called the stem
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of every token in `tokens`, in the same order."""
    return list(map(stemmer.stem, tokens))


train['stemmed'] = train["train_text_stop"].apply(stem_tokens)

In [37]:
#n-grams
# For each review, build the list of all runs of NGRAM_SIZE consecutive
# stemmed tokens, each run stored as a tuple. A review with fewer than
# NGRAM_SIZE tokens gets an empty list.
# NOTE(review): bigrams (NGRAM_SIZE = 2) are an assumed default -- confirm the
# intended n before relying on this column.
NGRAM_SIZE = 2

train['ngram'] = train['stemmed'].apply(
    # zip over the token list shifted by 0..NGRAM_SIZE-1 positions; zip stops
    # at the shortest slice, which yields exactly the sliding windows.
    lambda tokens: list(zip(*(tokens[offset:] for offset in range(NGRAM_SIZE))))
)


Unnamed: 0,customer_id,review_id,review_score,review_text,HostelNumber,review_date,review_language,train_text,text_length,train_text_stop,stemmed
0,309693,6085536,86,the space in the rooms is not enough specially...,4815.0,2014-01-10 15:34:36,English,"[the, space, in, the, rooms, is, not, enough, ...",63,"[space, rooms, enough, specially, 12, people, ...","[space, room, enough, special, 12, peopl, toge..."
1,309693,6244624,97,the wi fi doesn t work properly,34160.0,2014-04-21 20:32:25,English,"[the, wi, fi, doesn, t, work, properly]",7,"[wi, fi, work, properly]","[wi, fi, work, properli]"
2,309693,6244607,94,beddings and pillows need to be replaced for n...,65881.0,2014-04-21 20:18:44,English,"[beddings, and, pillows, need, to, be, replace...",33,"[beddings, pillows, need, replaced, new, ones,...","[bed, pillow, need, replac, new, one, necessar..."
3,459093,6895146,91,i had a few problems with the free unlimited w...,36020.0,2015-01-09 07:26:36,English,"[i, had, a, few, problems, with, the, free, un...",59,"[problems, free, unlimited, wifi, slow, kept, ...","[problem, free, unlimit, wifi, slow, kept, tel..."
4,459093,5238648,89,this is a very pleasant hostel the staff were ...,12168.0,2013-01-18 03:52:21,English,"[this, is, a, very, pleasant, hostel, the, sta...",47,"[pleasant, hostel, staff, nice, hostel, clean,...","[pleasant, hostel, staff, nice, hostel, clean,..."
