In [1]:
# Importing required libraries:
import pandas as pd

In [3]:
df = pd.read_excel('../data/listings_v1.1.xlsx', skiprows=4)
df1 = df.dropna(subset=['review_scores_rating'])
df1 = df1.drop(['weekly_price', 'security_deposit', 'cleaning_fee', 'extra_people'], axis=1)
df2 = df[df['review_scores_rating'].isnull()]

In [4]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from collections import Counter

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yashnagogineni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yashnagogineni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Fill mean value:

lists = ['review_scores_accuracy', 'review_scores_cleanliness', 'review_scores_checkin', 'review_scores_communication', 'review_scores_location', 'review_scores_value', 'reviews_per_month', 'host_response_rate']

df1 = df1.fillna({'review_scores_rating': 100})

for ele in lists:
    df1[ele].fillna(df1[ele].mean(), axis = 0, inplace = True)

In [6]:
# Change all the text to lower case
df1 = df1.apply(lambda x: x.astype(str).str.lower())

In [7]:
def tokenize(text):
    for cols in text:
        tokens = nltk.word_tokenize(df1[cols].str.cat(sep=' '))
        tokens = [w.lower() for w in tokens]
        
        # Remove stop words
        stop_words = set(stopwords.words('english'))
        additional = set(["''" , ',' ,'{' ,'}','.',':',';','(',')','[',']','!','?','``','--','&','@','#','$','%','^','*','_','+','-','/','|','~','`','"','\'','\\','1','2','3','4','5','6','7','8','9','0'])
        stop_words.update(additional)
        
        filtered_tokens = [word for word in tokens if not word.lower() in stop_words]
        # Extract keywords based on frequency
        word_freq = Counter(filtered_tokens)
        top10_keywords = word_freq.most_common(10)
        print(top10_keywords)
        return top10_keywords

tokens_house = ['space', 'description', 'neighborhood_overview']
tokens_trans = ['transit', 'access']
tokens_host = ['interaction','host_about']
tokens_amenties = ['amenities']

tokenize(tokens_house)
tokenize(tokens_trans)
tokenize(tokens_host)
tokenize(tokens_amenties)


[('room', 1708), ('boston', 1578), ('apartment', 1281), ('bed', 1274), ('kitchen', 1223), ('bedroom', 1147), ('living', 911), ('floor', 792), ('one', 772), ('two', 771)]
[('line', 1646), ('walk', 1628), ('boston', 1353), ('station', 1327), ('minutes', 1075), ('bus', 1000), ('minute', 924), ('nan', 858), ('street', 837), ('parking', 703)]
[('nan', 1025), ('available', 689), ('guests', 447), ('phone', 398), ('stay', 387), ('questions', 366), ('need', 309), ('help', 301), ('email', 291), ('happy', 241)]
[('internet', 4729), ('detector', 4297), ('tv', 3423), ('dryer', 3283), ('friendly', 2836), ('wireless', 2645), ('heating', 2617), ('kitchen', 2523), ('smoke', 2322), ('essentials', 2290)]


[('internet', 4729),
 ('detector', 4297),
 ('tv', 3423),
 ('dryer', 3283),
 ('friendly', 2836),
 ('wireless', 2645),
 ('heating', 2617),
 ('kitchen', 2523),
 ('smoke', 2322),
 ('essentials', 2290)]

In [8]:
# Create categorical variables for the top keywords
house_keywords = ['room', 'bedroom', 'bed', 'kitchen', 'bathroom', 'living', 'floor']
trans_keywords = ['bus', 'line', 'walk', 'station', 'minutes', 'minute', 'street','parking','downtown']
host_keywords = ['host', 'guests', 'help', 'available', 'phone', 'questions', 'email']
amenties_keywords = ['internet', 'detector', 'tv', 'dryer', 'wireless', 'heating', 'kitchen', 'smoke', 'essentials']

for keyword in amenties_keywords:
    df1[keyword] = df1['amenities'].str.contains(keyword).astype(int)

for keyword in host_keywords:
    for cols in tokens_host:
        df1[keyword] = df1[cols].str.contains(keyword).astype(int)

for keyword in house_keywords:
    for cols in tokens_house:
        df1[keyword] = df1[cols].str.contains(keyword).astype(int)

for keyword in trans_keywords:
    for cols in tokens_trans:
        df1[keyword] = df1[cols].str.contains(keyword).astype(int)

In [9]:
# Delete the columns in tokens_house, tokens_trans, tokens_host, tokens_amenties
columns_to_delete = tokens_house + tokens_trans + tokens_host + tokens_amenties

# drop the columns
df1 = df1.drop(columns_to_delete, axis=1)
df1 = df1.drop(['id'], axis=1)


In [10]:
# convert columns into numerical values and fill missing values with mean
obj_columns = ['host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_listings_count',
 'host_total_listings_count',
 'host_has_profile_pic',
 'host_identity_verified',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'square_feet',
 'price',
 'minimum_nights',
 'maximum_nights',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'number_of_reviews',
 'reviews_per_month',
 'review_duration',
 'review_time',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'instant_bookable']

for col in obj_columns:
    if df1[col].dtype == 'object':
        df1[col] = pd.to_numeric(df[col], errors='coerce')

# Fill missing values with mean
for col in obj_columns:
    if df1[col].isnull().any():
        mean_val = df1[col].mean()
        df1[col].fillna(mean_val, inplace=True)
        


In [11]:
# convert object columns to categorical
obj_columns = df1.select_dtypes(include=['object']).columns

# one-hot encode object columns
df1 = pd.get_dummies(df1, columns=obj_columns)


In [13]:
# Conduct RFE using Linear Regression, Random Forest, SVM
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df1.drop('price', axis=1), df1['price'], test_size=0.2, random_state=42)

rfe_lr = RFE(LinearRegression(), n_features_to_select=20)
rfe_rf = RFE(RandomForestRegressor(), n_features_to_select=20)

rfe_lr.fit(X_train, y_train)
rfe_rf.fit(X_train, y_train)

Y_predict_lr = rfe_lr.predict(X_test)
Y_predict_rf = rfe_rf.predict(X_test)

score_lr = rfe_lr.score(X_test, y_test)
score_rf = rfe_rf.score(X_test, y_test)

print('Accuracy score for Linear Regression: ', score_lr)
print('Accuracy score for Random Forest: ', score_rf)


Accuracy score for Linear Regression:  0.01559609241445159
Accuracy score for Random Forest:  0.6558149533418951


In [14]:
# Get the features selected by RFE on Random Forest
feature_name = X_train.columns[rfe_rf.support_]
feature_importance = rfe_rf.estimator_.feature_importances_

features = pd.DataFrame({'Features': feature_name, 'Importance': feature_importance})
print(features.sort_values(by='Importance', ascending=False))

                     Features  Importance
19  room_type_entire home/apt    0.231273
6                    bedrooms    0.165596
5                   bathrooms    0.158315
16                review_time    0.041958
14          reviews_per_month    0.041690
12           availability_365    0.037391
15            review_duration    0.034252
10            availability_60    0.034128
8              minimum_nights    0.033233
7                        beds    0.029356
17       review_scores_rating    0.022839
13          number_of_reviews    0.022294
11            availability_90    0.021845
4                accommodates    0.021835
1        host_acceptance_rate    0.021772
18     review_scores_location    0.020001
9             availability_30    0.018233
0          host_response_rate    0.015885
2         host_listings_count    0.014264
3   host_total_listings_count    0.013839


In [20]:
set(features['Features'].values) - set([ 'review_duration', 'review_time', 
             'availability_30', 'availability_60', 'availability_90', 'availability_365',
             'reviews_per_month', 'review_scores_rating', 'review_scores_location',
             'host_response_rate', 'host_acceptance_rate'] + ['room_type_entire home/apt', 'bedrooms', 'beds', 'bathrooms', 
          'minimum_nights', 'accommodates'])

{'host_listings_count', 'host_total_listings_count', 'number_of_reviews'}

In [26]:
X_train[['host_listings_count', 'host_total_listings_count', 'number_of_reviews']].describe()

Unnamed: 0,host_listings_count,host_total_listings_count,number_of_reviews
count,2196.0,2196.0,2196.0
mean,44.595628,44.595628,24.188525
std,143.691049,143.691049,38.582102
min,1.0,1.0,1.0
25%,1.0,1.0,3.0
50%,2.0,2.0,10.0
75%,7.0,7.0,28.0
max,749.0,749.0,404.0


In [24]:
X_train

Unnamed: 0,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,host_has_profile_pic,host_identity_verified,accommodates,bathrooms,bedrooms,...,bed_type_pull-out sofa,bed_type_real bed,cancellation_policy_flexible,cancellation_policy_moderate,cancellation_policy_strict,cancellation_policy_super_strict_30,require_guest_profile_picture_0,require_guest_profile_picture_1,require_guest_phone_verification_0,require_guest_phone_verification_1
981,0.68,0.73,0,1,1,1,1,4,1.0,1.0,...,0,1,0,0,1,0,0,1,0,1
2631,1.00,0.75,0,1,1,1,1,8,1.0,3.0,...,0,1,0,1,0,0,0,1,0,1
611,0.96,0.68,0,313,313,1,1,3,1.0,1.0,...,0,1,0,1,0,0,0,1,1,0
1118,1.00,0.94,1,6,6,1,1,2,1.0,1.0,...,0,1,0,0,1,0,1,0,1,0
2875,1.00,0.98,0,4,4,1,1,4,1.0,1.0,...,0,1,0,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2032,0.97,1.00,0,4,4,1,1,6,1.0,2.0,...,0,1,0,1,0,0,0,1,0,1
1364,0.92,1.00,0,1,1,1,1,2,1.0,1.0,...,0,1,0,0,1,0,0,1,0,1
1413,1.00,0.87,1,12,12,1,0,4,1.0,1.0,...,0,1,0,0,1,0,0,1,0,1
1605,1.00,0.94,0,7,7,1,1,2,2.0,1.0,...,0,1,0,0,1,0,0,1,0,1


In [38]:
s = {'room_type':1, \
   'bedrooms': 1,\
   'beds': 1, \
   'bathrooms': 1,\
   'minimum_nights': 1,\
   'accommodates': 1}

pd.DataFrame(s, index=[0]).astype('int')

Unnamed: 0,room_type,bedrooms,beds,bathrooms,minimum_nights,accommodates
0,1,1,1,1,1,1
