In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize
from re import sub
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk.download()

In [None]:
# Locate the data directory relative to the current working directory.
# os.path.join avoids the doubled path separators that manual string
# concatenation produced and keeps the path portable across platforms.
cwd = os.getcwd()
source_path = os.path.join(cwd, 'source')

In [None]:
# Load the raw CSV exports from the data directory.
# Paths are built with os.path.join rather than string concatenation so the
# result does not depend on whether source_path ends with a separator.
calendar = pd.read_csv(os.path.join(source_path, 'calendar.csv'))
df_list = pd.read_csv(os.path.join(source_path, 'listings.csv'))
listings_detail = pd.read_csv(os.path.join(source_path, 'listings_detail.csv'))
#neighbourhoods = pd.read_csv(cwd + '/neighbourhoods.csv')
reviews = pd.read_csv(os.path.join(source_path, 'reviews_detail.csv'))

In [None]:
# calendar.head()

In [None]:
# Distinct values of the 'neighbourhood' column in the listings table.
df_list['neighbourhood'].unique()

In [None]:
# Preview the first rows of the summary listings table.
df_list.head()

In [None]:
# Preview the first rows of the detailed listings table.
listings_detail.head()

In [None]:
# Column names available in the detailed listings table.
listings_detail.columns.tolist()

In [None]:
# neighbourhoods

In [None]:
# Preview the first rows of the reviews table.
reviews.head()

In [None]:
# Share of listings per neighbourhood (relative frequencies).
df_list['neighbourhood'].value_counts(normalize=True)

In [None]:
# Number of listings per neighbourhood (absolute counts).
df_list['neighbourhood'].value_counts()

## Is score important?

In [None]:
# Step 1: clean the data and encode the categorical variables

In [None]:
# Modelling frame: the candidate predictors plus the column used as the response.
# .copy() detaches it from listings_detail, so the column assignments and drops
# in later cells act on an independent frame rather than on a slice
# (this is what otherwise triggers pandas' SettingWithCopyWarning).
response = 'availability_30'
df = listings_detail[['neighbourhood_cleansed', 'room_type', 'price',
                      'review_scores_rating', 'reviews_per_month',
                      'number_of_reviews_ltm', 'property_type',
                      response]].copy()
# df = listings_detail[['room_type', 'price',
#                         'review_scores_rating', 'reviews_per_month',
#                         'availability_365']]

In [None]:
# Summarise dtypes and non-null counts of the modelling frame.
df.info()

In [None]:
# Replace the raw availability column by `availability_score = 30 - availability`
# (presumably the number of booked days in the 30-day window -- confirm against
# the data dictionary).
# The former np.where special case for 0 was redundant because 30 - 0 == 30.
# The guard makes the cell safe to re-run: once `response` already names the
# derived column there is nothing left to convert.
if response != 'availability_score':
    df = df.assign(availability_score=30 - df[response]).drop(columns=[response])
    response = 'availability_score'

In [None]:
# df.head()

In [None]:
def currency_to_float(row):
    '''
    Convert the formatted 'price' entry of a row into a plain float.

    INPUT:
    row - pandas Series (or any mapping) with a 'price' entry such as '$1,234.00'

    OUTPUT:
    the numeric value of the price as a float; NaN when the price is missing
    or the text contains no digits
    '''
    f = row['price']
    if f is None:
        return float('nan')
    if not isinstance(f, str):
        # Already numeric, including the float NaN pandas uses for a missing
        # cell; passing it to the regex below would raise a TypeError.
        return float(f)
    # Keep only digits and the decimal point, dropping '$', ',' and whitespace.
    digits = sub(r'[^\d.]', '', f)
    return float(digits) if digits else float('nan')

# Add the numeric price and remove the formatted string column.
# assign/drop builds a new frame instead of mutating `df` in place, which
# avoids `inplace=True` and writes into a slice of listings_detail.
df = df.assign(price_value=df.apply(currency_to_float, axis=1)).drop(columns=['price'])

In [None]:
# df['neighbourhood_cleansed'].value_counts()

In [None]:
# Number of listings per room type.
df['room_type'].value_counts()

In [None]:
# Discard the rows whose response value is missing, then report what is left.
df = df.dropna(axis='index', subset=[response])
print(df.shape)
df.info()

In [None]:
def create_dummy_df(df, cat_cols, dummy_na):
    '''
    One-hot encode the given categorical columns of a dataframe.

    INPUT:
    df - pandas dataframe with categorical variables you want to dummy
    cat_cols - list of strings that are associated with names of the categorical columns
    dummy_na - Bool holding whether you want to dummy NA vals of categorical columns or not

    OUTPUT:
    df - a new dataframe that has the following characteristics:
            1. contains all columns that were not specified as categorical
            2. removes all the original columns in cat_cols
            3. dummy columns for each of the categorical columns in cat_cols
               (the first level of each column is dropped, drop_first=True)
            4. if dummy_na is True - it also contains dummy columns for the NaN values
            5. Use a prefix of the column name with an underscore (_) for separating

    Names in cat_cols that are not columns of df are skipped. Any other error
    raised while encoding is propagated instead of being silently swallowed.
    The input dataframe itself is not modified.
    '''
    for col in cat_cols:
        if col not in df.columns:
            # Absent (or already encoded) column: nothing to do for it.
            continue
        dummies = pd.get_dummies(df[col], prefix=col, prefix_sep='_',
                                 drop_first=True, dummy_na=dummy_na)
        # Replace the original column by its dummy columns.
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df

In [None]:
# The object-dtype columns are the categorical ones that need one-hot encoding.
cat_df = df.select_dtypes(include='object')
cat_cols_lst = cat_df.columns

df_new = create_dummy_df(df=df, cat_cols=cat_cols_lst, dummy_na=False)

# Report the shape of the encoded frame.
print(df_new.shape)

In [None]:
# Column names of the encoded frame.
df_new.columns.tolist()

In [None]:
# Keep only the listings whose price is not zero.
df_new = df_new.loc[df_new['price_value'].ne(0)]

In [None]:
# list(df_new)

# Deal with missing values and create model.

In [None]:
# Impute missing values column by column with that column's mean.
# NOTE(review): the means are computed on the full frame before the split, so
# the test rows influence the imputation values -- confirm this is acceptable.
fill_mean = lambda col: col.fillna(col.mean())
fill_df = df_new.apply(fill_mean, axis=0)

#Split into explanatory and response variables
X = fill_df.drop([response], axis=1)
y = fill_df[response]

#Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .30, random_state=42) 

# The previous `normalize=\False` was a syntax error, and the `normalize`
# parameter no longer exists in scikit-learn >= 1.2. False was its default,
# so the plain constructor fits the same (un-normalised) model.
lm_model = LinearRegression() # Instantiate
lm_model.fit(X_train, y_train) #Fit
 
#Predict using your model
y_test_preds = lm_model.predict(X_test)
y_train_preds = lm_model.predict(X_train)

# #Score using your model
test_score = r2_score(y_test, y_test_preds)
train_score = r2_score(y_train, y_train_preds)

print("The rsquared on the training data was {}.  The rsquared on the test data was {}.".format(train_score, test_score))

In [None]:
# Fitted regression coefficients of the linear model.
lm_model.coef_

In [None]:
# Step 2: find what influences the review score

In [None]:
# Overall review rating together with its sub-scores; only listings that have
# every score are kept. Chaining .dropna() creates a new frame instead of
# calling dropna(inplace=True) on a slice of listings_detail.
scores = listings_detail[[#'id',
                          'review_scores_rating', 'review_scores_accuracy', 
                          'review_scores_cleanliness', 'review_scores_checkin',
                          'review_scores_communication', 'review_scores_location',
                          'review_scores_value']].dropna()

In [None]:
# Correlation heat-map of the review score columns.
corr = scores.corr()
fig = plt.figure()
ax = fig.add_subplot(111)
# Fix the colour scale to the full correlation range [-1, 1].
cax = ax.matshow(corr,cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per score column, labelled with the column name.
ticks = np.arange(0,len(scores.columns),1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xticklabels(scores.columns, rotation=90)
ax.set_yticklabels(scores.columns)
# `optimize` is not a savefig keyword in current Matplotlib; options for the
# Pillow PNG writer go through `pil_kwargs`. The file is written before
# plt.show() so saving does not depend on the figure's state after display.
fig.savefig('correlation.png', pil_kwargs={'optimize': True})
plt.show()

In [None]:
# Step 3: dig into the review comments

# Reviews without any comment text cannot be analysed; drop them and summarise the rest.
reviews = reviews.dropna(axis='index', subset=['comments'])
reviews.info()


def clean_content(df):
    '''
    Strip digits and punctuation from a single review comment.

    INPUT:
    df - row (pandas Series or mapping) holding the review text under 'comments'

    OUTPUT:
    the comment with every run of digits and ASCII / full-width punctuation
    removed, newlines replaced by a space and carriage returns dropped
    '''
    # Character class listing everything to delete; letters are left untouched.
    symbol_run = u'[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
    stripped = re.sub(symbol_run, '', df['comments'])
    single_line = stripped.replace('\n', ' ')
    return single_line.replace('\r', '')

# Clean every review row by row and store the result in a new column.
reviews['cleaned_comments'] = reviews.apply(clean_content, axis='columns')


def is_Chinese(df):
    '''
    Look for Chinese characters in a review comment.

    INPUT:
    df - row (pandas Series or mapping) holding the review text under 'comments'

    OUTPUT:
    the regex match object for the first run of characters in the range
    U+4E00-U+9FA5, or None when the comment contains none (note: not a bool)
    '''
    hanzi_run = re.compile(u'[\u4e00-\u9fa5]+')
    comment = df['comments']
    return hanzi_run.search(comment)
# A match object marks a comment that contains Chinese characters; None marks the rest.
reviews['is_Chinese'] = reviews.apply(is_Chinese, axis='columns')

# Keep only the comments in which no Chinese characters were found.
reviews_English = reviews[reviews['is_Chinese'].isna()]


# Attach each listing's neighbourhood to its reviews; the left join keeps every review.
neighbourhood_id = df_list[['id', 'neighbourhood']]
reviews_English = pd.merge(reviews_English, neighbourhood_id,
                           how='left', left_on='listing_id', right_on='id')

# Concatenate all cleaned English comments into one corpus string.
text = " ".join(review for review in reviews_English['cleaned_comments'])
# Create stopword list:
# The nltk module is imported under an alias so that the name `stopwords` is
# only ever bound to the set built below, never to the module itself.
# NOTE: STOPWORDS replaces the constant of the same name imported from wordcloud.
from nltk.corpus import stopwords as nltk_stopwords
STOPWORDS = nltk_stopwords.words('english')
stopwords = set(STOPWORDS)
# Extra words to exclude from the word cloud on top of the nltk English list.
stopwords.update(["also", "room", "apartment", "posting", "canceled","reservation", "host", "Beijing", "stay",
                 "arrival", "days", "house", "good", "nice", "one", "really", "area", "even"])

# Build the word cloud from the corpus, ignoring the stop words.
cloud_builder = WordCloud(background_color="white", stopwords=stopwords)
wordcloud = cloud_builder.generate(text)

# Render the cloud with matplotlib, without axes.
fig = plt.figure(figsize=(8, 4))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Write the figure currently bound to `fig` to disk; `fig` must still refer to the word-cloud figure.
fig.savefig('wordcloud.png')