In [1]:
import pandas as pd


In [2]:
# Helpers and entry point for transforming both training/testing
def _name_segment(name, position):
    """Return the `position`-th '·'-separated piece of `name`, or None.

    None is returned when `name` is not a string (e.g. a missing value) or
    when it has too few pieces.
    """
    if not isinstance(name, str):
        return None
    pieces = name.split('·')  # split once per row rather than once per test and once per lookup
    return pieces[position] if len(pieces) > position else None


def _percent_to_float(column):
    """Convert a column of strings with a trailing '%' (e.g. '95%') to floats.

    A column that is already numeric (for instance one that is entirely
    missing) has no `.str` accessor, so it is cast to float directly.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column.astype(float)
    return column.str.rstrip('%').astype(float)


def clean(data):
    """Apply the shared cleaning steps to a training or testing frame.

    Steps performed:
      * drop 'id' and, when present, 'Unnamed: 0';
      * set 'bathrooms' / 'bedrooms' from the 5th / 4th '·'-separated piece
        of 'name' (numeric, NaN when missing or not parseable), then drop
        'name';
      * strip '$' and ',' from 'price' and cast it to float;
      * strip a trailing '%' from 'host_response_rate' and
        'host_acceptance_rate' and cast them to float;
      * map 't'/'f' to 1.0/0.0 in 'host_is_superhost' and 'instant_bookable'
        (any other value becomes NaN);
      * cast 'monthly_revenue' to float when that column exists;
      * cast the descriptive columns listed below to 'category'.

    NOTE(review): pd.to_numeric(errors='coerce') turns every piece that is
    not a bare number (e.g. text carrying a unit suffix) into NaN -- confirm
    that the pieces at these positions of 'name' really are bare numbers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns referenced above.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. All assignments are made on the frame returned by
        `DataFrame.drop`, not on the object passed in.

    Raises
    ------
    KeyError
        If 'id' or any other required column is absent.
    ValueError
        If 'price' or a rate column holds text that cannot be cast to float.
    """
    unwanted = ['id']
    if 'Unnamed: 0' in data.columns:
        unwanted.insert(0, 'Unnamed: 0')
    data = data.drop(columns=unwanted)

    # Positional pieces of the listing name; anything non-numeric is coerced to NaN
    data['bathrooms'] = pd.to_numeric(
        data['name'].apply(lambda name: _name_segment(name, 4)), errors='coerce'
    )
    data['bedrooms'] = pd.to_numeric(
        data['name'].apply(lambda name: _name_segment(name, 3)), errors='coerce'
    )

    # No more use of name
    data = data.drop(columns=['name'])

    # Raw string: '\$' in a plain literal is an invalid escape sequence and triggers a warning
    data['price'] = data['price'].replace(r'[\$,]', '', regex=True).astype(float)
    data['host_response_rate'] = _percent_to_float(data['host_response_rate'])
    data['host_acceptance_rate'] = _percent_to_float(data['host_acceptance_rate'])

    # Encode 't'/'f' flags as 1.0/0.0; float (not bool) so unmapped values can stay NaN
    boolean_columns = ['host_is_superhost', 'instant_bookable']
    data[boolean_columns] = data[boolean_columns].apply(lambda x: x.map({'t': 1, 'f': 0})).astype('float')

    # Convert revenue to float (the column is only cast when it is present)
    if 'monthly_revenue' in data.columns:
        data['monthly_revenue'] = data['monthly_revenue'].astype('float')

    categorical_columns = [
        'host_response_time',
        'neighbourhood',
        'neighbourhood_cleansed',
        'property_type',
        'room_type',
    ]
    data[categorical_columns] = data[categorical_columns].astype('category')
    return data


  data['price'] = data['price'].replace('[\$,]', '', regex=True).astype(float)  # Remove $ and commas, convert to float


In [3]:
# Read each split from disk and pass it straight through clean()
training = clean(pd.read_csv('data/train.csv'))
testing = clean(pd.read_csv('data/test.csv'))

In [6]:
# Preview the first five rows of the cleaned training frame
training.head(n=5)

Unnamed: 0,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood,...,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month,monthly_revenue
0,Everything you need is nearby. <br /><br />Hig...,57488206,Jessi,,,,0.0,3,3,"Vancouver, British Columbia, Canada",...,4.8,4.82,4.9,4.87,4.69,4.81,0.0,3,0.77,2108.0
1,,139792573,Daniel,within a few hours,100.0,100.0,0.0,1,4,,...,,,,,,,0.0,1,,2730.0
2,Beautiful neighbourhood close to prosperous Ma...,265504225,Alex,within an hour,100.0,98.0,1.0,1,1,"Vancouver, British Columbia, Canada",...,4.9,4.78,4.97,4.94,4.9,4.75,0.0,1,3.22,2254.0
3,We are located in a quiet residential neighbor...,22595056,Raymond,,,92.0,1.0,1,1,"Vancouver, British Columbia, Canada",...,5.0,5.0,5.0,5.0,4.86,5.0,0.0,1,1.28,3187.0
4,Kitsilano at it's best! Short walk to all the ...,65683877,Yendi,within an hour,100.0,95.0,1.0,2,3,"Vancouver, British Columbia, Canada",...,4.93,4.89,4.97,4.97,4.96,4.85,0.0,1,2.01,3479.0


In [7]:
# Preview the first five rows of the cleaned testing frame
testing.head(n=5)

Unnamed: 0,neighborhood_overview,host_id,host_name,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_listings_count,host_total_listings_count,neighbourhood,...,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,calculated_host_listings_count,reviews_per_month
0,We are less than a block to Kits pool - the la...,23168796,Oliver & Ashleigh,within a few hours,100.0,67.0,0.0,1,1,"Vancouver, British Columbia, Canada",...,4.98,4.93,4.95,5.0,5.0,4.98,4.88,0.0,1,0.38
1,One of the most eclectic areas in the Lower Ma...,16926150,Jesse,,,,0.0,1,1,"Vancouver, British Columbia, Canada",...,,,,,,,,0.0,1,
2,Le Soleil Hotel and Suites is situated in the ...,536871978,Ivan,within an hour,100.0,95.0,0.0,4,4,"Vancouver, British Columbia, Canada",...,5.0,5.0,4.88,5.0,5.0,5.0,4.88,0.0,4,2.96
3,"Very safe and quite area,also very convenient ...",421206568,Xuerong,within an hour,100.0,98.0,1.0,4,4,"Vancouver, British Columbia, Canada",...,4.79,4.87,4.84,4.93,4.96,4.77,4.69,0.0,4,2.53
4,Mount Pleasant is a largely residential area w...,227662329,Jordan,within an hour,100.0,100.0,0.0,168,232,"Vancouver, British Columbia, Canada",...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,1.0,134,0.47


In [8]:
# Column dtypes after cleaning; as the cell's last expression Jupyter displays it without print()
training.dtypes

neighborhood_overview               object
host_id                              int64
host_name                           object
host_response_time                category
host_response_rate                 float64
host_acceptance_rate               float64
host_is_superhost                  float64
host_listings_count                  int64
host_total_listings_count            int64
neighbourhood                     category
neighbourhood_cleansed            category
latitude                           float64
longitude                          float64
property_type                     category
room_type                         category
accommodates                         int64
bathrooms                          float64
bedrooms                           float64
beds                               float64
amenities                           object
price                              float64
minimum_nights                       int64
maximum_nights                       int64
minimum_nig

## Perform Regression for rating predictions