In [1]:
import glob
import os
import re
import sys

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from IPython import get_ipython
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Notebook-wide configuration: select the inline matplotlib backend and raise
# pandas' display limit to 500 columns.
get_ipython().run_line_magic("matplotlib", "inline")
pd.options.display.max_columns = 500

In [None]:
def loadFiles(data_dir='/Users/yc/Desktop/Airbnb/raw_data'):
    """Read every ``*.csv`` file in ``data_dir`` and stack them into one frame.

    Prints the column count of each file, the row totals before and after
    concatenation, and the 50 columns with the most / fewest missing values.

    Parameters
    ----------
    data_dir : str
        Directory that holds the raw CSV files. Defaults to the path the
        notebook was originally written against.
        NOTE(review): that default is an absolute, machine-specific path;
        pass ``data_dir`` explicitly when running elsewhere.

    Returns
    -------
    pandas.DataFrame
        All files concatenated row-wise with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If ``data_dir`` contains no ``*.csv`` file.
    """
    # glob returns files in arbitrary order; sorting keeps the row order of
    # the combined frame the same from run to run.
    all_files = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    if not all_files:
        raise FileNotFoundError("no CSV files found in {}".format(data_dir))

    li = []
    row_check = 0

    for filename in all_files:
        sub_df = pd.read_csv(filename, index_col=None, header=0)
        row_check += sub_df.shape[0]
        print("{} number of columns: {}".format(filename, sub_df.shape[1]))
        li.append(sub_df)

    df = pd.concat(li, axis=0, ignore_index=True)
    print("data rows report: \n")
    print("total row without concat:{}".format(row_check))
    print("total row with concat:{}".format(df.shape[0]))

    # Count the nulls once and reuse the result for both rankings.
    missing_counts = df.isna().sum()
    print("missing value: \n")
    print(missing_counts.sort_values(ascending = False)[:50])
    print(missing_counts.sort_values(ascending = True)[:50])

    return df


def data_filter(column_list, frame=None):
    """Return a copy of the data restricted to ``column_list``.

    Parameters
    ----------
    column_list : list of str
        Column labels to keep, in the order they should appear.
    frame : pandas.DataFrame, optional
        Frame to select from. When omitted, the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so later in-place edits do not write into (or
        warn about) the frame it was selected from.
    """
    source = df if frame is None else frame
    # Use the argument; the previous version ignored it and always read the
    # global `filtered_columns`.
    return source[column_list].copy()


def replace_na(frame=None):
    """Fill missing values and convert ``price`` to float, modifying in place.

    Steps:
      * ``name``, ``description``, ``instant_bookable`` and ``state`` get
        ``'Unknown'`` where the value is missing;
      * every ``float64`` column gets ``0`` where the value is missing;
      * the index is reset to 0..n-1;
      * ``price`` is stripped of everything except digits and the decimal
        point and parsed as a float.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to clean. When omitted, the notebook-level ``df`` is cleaned.

    Returns
    -------
    None
        The frame is updated in place.
    """
    target = df if frame is None else frame

    # categorical
    # Assign the filled column back instead of calling an inplace method on
    # `target[col]`: the chained form acts on a temporary and is not
    # guaranteed to reach the parent frame.
    for column in ('name', 'description', 'instant_bookable', 'state'):
        target[column] = target[column].fillna('Unknown')

    # numeric
    numeric_columns = target.select_dtypes(include=['float64']).columns
    target[numeric_columns] = target[numeric_columns].fillna(0)
    target.reset_index(drop = True, inplace = True)

    # price (label)
    # Keep the decimal point: stripping it turned "$85.00" into 8500.0.
    # astype(str) lets already-numeric prices pass through unchanged, and
    # values that cannot be parsed become NaN instead of raising.
    price_text = target['price'].astype(str).str.replace(r"[^0-9.]", "", regex=True)
    target['price'] = pd.to_numeric(price_text, errors='coerce')


def amentity_parser(amenities=None):
    """Split raw amenity strings into cleaned, lower-case amenity names.

    Each entry is converted with ``str()``, split on commas, lower-cased, has
    every non-alphanumeric character replaced by a space, and is stripped.

    Parameters
    ----------
    amenities : iterable, optional
        Raw amenity strings, one per listing. When omitted, the notebook-level
        ``df.amenities`` column is used.

    Returns
    -------
    amenity_set : set of str
        Every distinct cleaned amenity name, without the empty string.
    df_amenities_new : list of list of str
        The cleaned names of each listing, in input order. Empty tokens are
        kept here; only ``amenity_set`` drops them.

    Notes
    -----
    Missing values are stringified by ``str()``, so a NaN entry yields the
    token ``'nan'``.
    """
    if amenities is None:
        amenities = df.amenities

    df_amenities_new = []
    amenity_set = set()
    # Iterate over the values themselves; indexing by position-as-label
    # fails as soon as the index is not exactly 0..n-1.
    for raw in amenities:
        cleaned = [
            re.sub(r"[^a-zA-Z0-9]", " ", word.lower()).strip()
            for word in str(raw).split(',')
        ]
        df_amenities_new.append(cleaned)
        amenity_set.update(cleaned)
    # discard() rather than remove(): there may be no empty token to drop.
    amenity_set.discard('')

    return amenity_set, df_amenities_new


# The amenity set contains 325 distinct categories; it needs to be reduced.
def amenity_dataframe(parsed=None):
    """Build a 0/1 indicator frame with one column per amenity.

    Parameters
    ----------
    parsed : tuple, optional
        An ``(amenity_set, rows)`` pair in the shape returned by
        ``amentity_parser``. When omitted, ``amentity_parser()`` is called.

    Returns
    -------
    pandas.DataFrame
        One row per listing and one column per amenity, in alphabetical
        order; a cell is 1 when the listing has that amenity and 0 otherwise.
    """
    amenity_set, df_amenities_new = amentity_parser() if parsed is None else parsed

    # A set has no defined order (and newer pandas versions reject a set as
    # `columns`), so fix the column order once by sorting.
    columns = sorted(amenity_set)

    amenity_onehot_list = []
    for row in df_amenities_new:
        present = set(row)  # constant-time membership instead of scanning the list
        amenity_onehot_list.append(
            [1 if amenity in present else 0 for amenity in columns]
        )

    return pd.DataFrame(amenity_onehot_list, columns = columns)


def frequent_columns(amenity_df, p = 0.3):
    """List the columns of ``amenity_df`` whose mean is strictly above ``p``.

    The mean is taken from ``DataFrame.describe()``; the result is a plain
    list of column labels in the frame's column order.
    """
    column_means = amenity_df.describe().loc['mean']
    return column_means[column_means > p].index.tolist()

In [None]:
# Columns carried through the pipeline.
# Not currently selected: 'city', 'neighbourhood', 'latitude', 'longitude'
filtered_columns = [
    'host_id',
    'name',
    'room_type',
    'price',
    'reviews_per_month',
    'minimum_nights',
    'calculated_host_listings_count',
    'number_of_reviews',
    'description',
    'amenities',
    'accommodates',
    'availability_365',
    'instant_bookable',
    'state',
]

# Load the raw files, keep the selected columns and fill the gaps.
df = loadFiles()
df = data_filter(filtered_columns)
replace_na()

# Turn amenities into indicator columns, keeping those whose mean exceeds 0.2.
amenity_df = amenity_dataframe()
amenity_df = amenity_df[frequent_columns(amenity_df, p = 0.2)]

# Swap the raw 'amenities' column for its indicator columns.
df = pd.concat([df.drop(columns = ['amenities']), amenity_df], axis = 1)

In [None]:
# text feature
def review_to_words(review):
    """Turn a piece of text into a list of stemmed, stopword-free tokens.

    The text is lower-cased, every non-alphanumeric character is replaced by
    a space, the result is split on whitespace, English stopwords are
    removed and the remaining words are Porter-stemmed.

    Parameters
    ----------
    review : str
        The text to tokenize.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Only fetch the stopword corpus when it is not installed yet, rather
    # than calling the downloader on every invocation.
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords", quiet = True)

    # Load the stopwords once per call into a set; the previous version
    # re-read the whole list for every single word.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    text = re.sub(r"[^a-zA-Z0-9]", " ", review.lower()) # Convert to lower case
    words = text.split() # Split string into words

    return [stemmer.stem(w) for w in words if w not in stop_words]

# categorical features
# %%
def categorical_encoder(train_feature, test_feature = None, has_testset = False):
    """One-hot encode a categorical feature, fitting on the training values.

    Parameters
    ----------
    train_feature : array-like, 1-D
        Training values; the encoder's categories are learned from these.
    test_feature : array-like, 1-D, optional
        Values to encode with the already-fitted encoder. Only used when
        ``has_testset`` is True.
    has_testset : bool
        Whether ``test_feature`` should be encoded as well.

    Returns
    -------
    list
        ``[train_onehot_result, test_onehot_result]``. Both are dense arrays;
        the second item is ``None`` when ``has_testset`` is False.

    Raises
    ------
    ValueError
        If ``has_testset`` is True but ``test_feature`` is None.
    """
    # OneHotEncoder is fitted on the raw values directly. A LabelEncoder in
    # front of it would raise on categories that only occur in the test set,
    # which defeats handle_unknown='ignore'.
    onehot_encoder = OneHotEncoder(handle_unknown='ignore')

    train_values = np.asarray(train_feature).reshape(-1, 1)
    train_onehot_result = onehot_encoder.fit_transform(train_values).toarray()

    test_onehot_result = None
    if has_testset:
        if test_feature is None:
            raise ValueError("test_feature is required when has_testset is True")
        test_values = np.asarray(test_feature).reshape(-1, 1)
        test_onehot_result = onehot_encoder.transform(test_values).toarray()

    return [train_onehot_result, test_onehot_result]

# Encode each categorical feature from its own column. All three calls used
# to pass `df.neighbourhood`, which is not among the selected columns.
instant_bookable = categorical_encoder(df['instant_bookable'])
room_type = categorical_encoder(df['room_type'])
state = categorical_encoder(df['state'])

# numeric features
# price

# other features
def numeric_scaler(train_features, test_features = None, testset_flag = False):
    """Min-max scale numeric features, fitting the scaler on the training data.

    Parameters
    ----------
    train_features : array-like, 2-D
        Training data; the scaler is fitted on it.
    test_features : array-like, 2-D, optional
        Data to transform with the fitted scaler. Only used when
        ``testset_flag`` is True.
    testset_flag : bool
        Whether ``test_features`` should be scaled as well.

    Returns
    -------
    numpy.ndarray or tuple of numpy.ndarray
        The scaled training data, or ``(train_val, test_val)`` when
        ``testset_flag`` is True.

    Raises
    ------
    ValueError
        If ``testset_flag`` is True but ``test_features`` is None.
    """
    if testset_flag and test_features is None:
        raise ValueError("test_features is required when testset_flag is True")

    scaler = MinMaxScaler()
    train_val = scaler.fit_transform(train_features)

    if testset_flag:
        test_val = scaler.transform(test_features)
        return train_val, test_val
    return train_val

# This statement must start at column 0: a leading space here is an
# IndentationError that keeps the whole cell from running.
numeric_features = numeric_scaler(df[['minimum_nights',
                                      'calculated_host_listings_count',
                                      'reviews_per_month',
                                      'number_of_reviews',
                                      'accommodates',
                                      'availability_365']],
                                  test_features = None,
                                  testset_flag = False)