In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

In [None]:
def _split_tokens(raw, strip_chars, drop_chars=''):
    """Turn one bracketed, comma-separated string into a list of lower-case tokens.

    Spaces and every character in ``drop_chars`` are removed, the characters in
    ``strip_chars`` are stripped from both ends, and empty tokens are discarded,
    so an empty container such as ``"[]"`` or ``"{}"`` yields ``[]`` rather
    than ``['']``. Non-string input (e.g. NaN for a missing cell) yields ``[]``.
    """
    if not isinstance(raw, str):
        return []
    text = raw.lower().replace(' ', '')
    for ch in drop_chars:
        text = text.replace(ch, '')
    return [token for token in text.strip(strip_chars).split(',') if token]


def clean_data(df, reference_date=None):
    """Clean a raw listings frame and derive the engineered feature columns.

    The input frame is copied first, so the caller's data is left untouched and
    the function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Must contain every column referenced below; a missing
        column raises ``KeyError``.
    reference_date : datetime-like, optional
        Timestamp used both to fill missing dates and as "now" for the
        elapsed-time features. Defaults to the current time, captured once per
        call so all derived columns share the same reference.

    Returns
    -------
    pandas.DataFrame
        A new frame restricted to rows whose ``country_code`` is ``'US'``, with
        the sparse columns dropped, missing values imputed and the engineered
        columns added.
    """
    from feature_engine.imputation import CategoricalImputer

    df = df.copy()
    now = pd.Timestamp(datetime.datetime.now() if reference_date is None else reference_date)

    # ---- data types ------------------------------------------------------
    # Missing dates are replaced by the reference timestamp, so the matching
    # elapsed-time feature below becomes 0 for those rows.
    for date_col in ('host_since', 'first_review', 'last_review'):
        df[date_col] = pd.to_datetime(df[date_col]).fillna(now)

    # Ordinal encoding; values outside the mapping are left as they are.
    room_rank = {'Entire home/apt': 2, 'Private room': 1, 'Shared room': 0}
    df['room_type'] = df['room_type'].map(lambda kind: room_rank.get(kind, kind))

    # 't'/'f' flags -> 1/0. Missing or unrecognised values count as 0.
    # True/False (and therefore 1/0) map to themselves so already-clean
    # columns survive a second pass.
    flag_value = {'t': 1, 'f': 0, True: 1, False: 0}
    flag_cols = [
        'instant_bookable', 'is_business_travel_ready',
        'require_guest_profile_picture', 'require_guest_phone_verification',
        'host_is_superhost', 'host_has_profile_pic', 'host_identity_verified',
    ]
    for flag_col in flag_cols:
        df[flag_col] = df[flag_col].map(lambda cell: flag_value.get(cell, 0)).astype(int)

    # "$1,250.00" -> 1250.0. Strip the currency sign and thousands separators
    # instead of slicing off the first character; missing prices stay NaN.
    extra_people = df['extra_people']
    if not pd.api.types.is_numeric_dtype(extra_people):
        extra_people = extra_people.str.replace(r'[$,]', '', regex=True)
    df['extra_people'] = pd.to_numeric(extra_people)

    # 'experiences_offered' and 'host_response_rate' are not parsed here:
    # both are dropped with the sparse columns below.

    # Stringified containers -> real Python containers (missing -> empty).
    df['host_verifications'] = df['host_verifications'].apply(
        lambda raw: _split_tokens(raw, ']['))
    df['amenities'] = df['amenities'].apply(
        lambda raw: set(_split_tokens(raw, '}{', drop_chars='"')))

    # ---- drop columns with many missing values ---------------------------
    many_missing = [
        'experiences_offered', 'neighborhood_overview', 'notes', 'transit',
        'access', 'interaction', 'house_rules', 'host_about',
        'host_response_time', 'host_response_rate', 'host_acceptance_rate',
        'square_feet',
    ]
    df = df.drop(columns=many_missing)

    # ---- imputation ------------------------------------------------------
    # Missing counts default to 1; an explicit 0 becomes the sentinel -1,
    # which also keeps bath_bed_ratio free of division by zero.
    room_cols = ['beds', 'bedrooms', 'bathrooms']
    df[room_cols] = df[room_cols].fillna(1).replace({0: -1})

    # Numeric columns are filled with the median of the frame being cleaned.
    median_cols = [
        'review_scores_rating',
        'review_scores_accuracy', 'review_scores_cleanliness',
        'review_scores_checkin', 'review_scores_communication',
        'review_scores_location', 'review_scores_value', 'reviews_per_month',
        'host_listings_count', 'calculated_host_listings_count',
    ]
    df[median_cols] = df[median_cols].fillna(df[median_cols].median())

    text_cols = ['name', 'summary', 'space', 'description', 'market', 'city', 'state', 'zipcode']
    df[text_cols] = CategoricalImputer().fit_transform(df[text_cols])

    # ---- engineered columns ----------------------------------------------
    df['maximum_stay'] = df['maximum_nights'] - df['minimum_nights']
    df['bath_bed_ratio'] = (df['bathrooms'] / df['bedrooms']).fillna(0)
    df['num_amenities'] = df['amenities'].apply(len)
    df['num_verifications'] = df['host_verifications'].apply(len)
    df['total_host_time'] = (now - df['host_since']).dt.total_seconds()
    df['since_first_review'] = (now - df['first_review']).dt.total_seconds()
    df['since_last_review'] = (now - df['last_review']).dt.total_seconds()

    df['extra_guest_price'] = df['guests_included'] * df['extra_people']
    df['max_people'] = df['guests_included'] + df['accommodates']

    def _has_any(*tokens):
        """Return a 0/1 Series: 1 where the amenity set contains any of ``tokens``."""
        wanted = set(tokens)
        return df['amenities'].apply(lambda owned: int(not wanted.isdisjoint(owned)))

    df['has_pool'] = _has_any('pool')
    df['has_wifi'] = _has_any('wifi', 'ethernetconnection')
    df['has_staff'] = _has_any('buildingstaff', 'selfcheck-in')
    df['allows_pets'] = _has_any('petsliveonthisproperty', 'cat(s)', 'dog(s)')
    df['single_level'] = _has_any('singlelevelhome')
    df['street_parking'] = _has_any('freestreetparking')

    # Keep only US listings.
    df = df[df['country_code'] == 'US']

    return df


In [None]:
# Read the raw training listings in a single pass (low_memory=False), then
# run them through the shared cleaning / feature-engineering step.
train = pd.read_csv(
    'train.csv',
    low_memory=False,
)
train_airbnb = clean_data(df=train)

In [None]:
# Per-column count of values that are exactly zero in the cleaned training frame.
zero_check_cols = [
    'calculated_host_listings_count',
    'extra_guest_price',
    'review_scores_rating',
    'review_scores_location',
    'review_scores_cleanliness',
]
train_airbnb[zero_check_cols].eq(0).sum()

In [None]:
def model(df):
    """Build the preprocessing + CatBoost pipeline and fit it on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned listings containing every column named in ``features`` below
        plus the ``price`` target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (``preprocessor`` -> ``cat``). Every call refits
        from scratch on the whole of ``df``.
    """
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.preprocessing import QuantileTransformer
    from sklearn.pipeline import Pipeline, make_pipeline
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import TruncatedSVD
    from catboost import CatBoostRegressor
    from feature_engine.outliers import Winsorizer
    from feature_engine.transformation import LogCpTransformer
    from sklearn.preprocessing import PowerTransformer
    from sklearn.decomposition import LatentDirichletAllocation

    features = [
        'description', 'calculated_host_listings_count', 'extra_guest_price', 'review_scores_rating',
        'review_scores_location', 'review_scores_cleanliness', 'maximum_stay', 'beds',
        'total_host_time', 'num_amenities', 'bedrooms', 'neighbourhood_group_cleansed', 'property_type',
        'room_type', 'cancellation_policy', 'host_is_superhost', 'host_identity_verified',
        'require_guest_phone_verification', 'has_pool', 'has_staff', 'single_level', 'street_parking',
        'accommodates', 'guests_included', 'max_people', 'number_of_reviews', 'extra_people', 'minimum_nights',
        'bath_bed_ratio', 'name', 'host_neighbourhood', 'num_verifications'
    ]
    X = df[features]
    y = df['price']

    def word_bigram_counts():
        """Fresh uni-/bi-gram bag-of-words vectorizer (one instance per text column)."""
        return CountVectorizer(analyzer='word', ngram_range=(1, 2), stop_words='english')

    # NOTE(review): 'room_type' is selected into X but no transformer below
    # lists it, so remainder='drop' removes it before the regressor.
    # Confirm whether that is intended.
    pre_processing = ColumnTransformer(
        transformers=[
            # ColumnTransformer runs each entry independently on the original
            # columns. Capping and scaling therefore have to be chained in one
            # pipeline; listing these columns under both 'outliers' and
            # 'scaler' would scale the uncapped values and emit them twice.
            ('outliers', make_pipeline(Winsorizer(), StandardScaler()),
                ['maximum_stay', 'minimum_nights']),
            ('desc_modeling', make_pipeline(
                        word_bigram_counts(),
                        LatentDirichletAllocation(n_components=200, random_state=7)),
                        'description'),
            ('name_modeling', make_pipeline(
                        word_bigram_counts(),
                        TruncatedSVD(n_components=200, random_state=7)),
                        'name'),
            # Box-Cox only accepts strictly positive inputs.
            ('power', PowerTransformer(method='box-cox'), [
                'calculated_host_listings_count', 'review_scores_rating',
                'review_scores_location', 'review_scores_cleanliness',
            ]),
            ('quant', QuantileTransformer(n_quantiles=10, random_state=0), [
                'beds', 'extra_guest_price'
            ]),
            ('scaler', StandardScaler(), [
                'total_host_time', 'num_amenities', 'bedrooms', 'bath_bed_ratio', 'num_verifications',
            ]),
            ('one-hot', OneHotEncoder(handle_unknown='infrequent_if_exist'), [
                'neighbourhood_group_cleansed', 'property_type', 'cancellation_policy',
                'host_is_superhost', 'host_identity_verified', 'require_guest_phone_verification',
                'has_pool', 'has_staff', 'single_level', 'street_parking', 'host_neighbourhood',
            ]),
            ('log', LogCpTransformer(), [
                'accommodates', 'guests_included', 'max_people', 'number_of_reviews', 'extra_people',
            ]),
        ],
        remainder='drop'
    )
    pl = Pipeline([
        ('preprocessor', pre_processing),
        ('cat', CatBoostRegressor())
    ])
    pl.fit(X, y)
    return pl

In [None]:
# Columns selected as model inputs; same names and order as the list inside model().
features = [
    'description',
    'calculated_host_listings_count',
    'extra_guest_price',
    'review_scores_rating',
    'review_scores_location',
    'review_scores_cleanliness',
    'maximum_stay',
    'beds',
    'total_host_time',
    'num_amenities',
    'bedrooms',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
    'cancellation_policy',
    'host_is_superhost',
    'host_identity_verified',
    'require_guest_phone_verification',
    'has_pool',
    'has_staff',
    'single_level',
    'street_parking',
    'accommodates',
    'guests_included',
    'max_people',
    'number_of_reviews',
    'extra_people',
    'minimum_nights',
    'bath_bed_ratio',
    'name',
    'host_neighbourhood',
    'num_verifications',
]

In [None]:
from sklearn.model_selection import KFold, cross_validate

# Target and feature matrix taken from the cleaned training frame.
y_train = train_airbnb['price']
X_train = train_airbnb[features]

# Three folds, default settings (no shuffling requested).
k_folds = KFold(n_splits=3)

# model() fits on the full training frame before the pipeline is handed to
# cross_validate; the score reported is the mean test-fold R^2.
cv_results2 = cross_validate(
    model(train_airbnb),
    X_train,
    y_train,
    cv=k_folds,
    scoring='r2',
)
cv_results2['test_score'].mean()

In [None]:
# Load and clean the held-out listings with the same routine used for training.
test = pd.read_csv('test.csv', low_memory=False)
test_airbnb = clean_data(test)

# model() refits the whole pipeline on the training frame; the fitted
# preprocessor then selects the columns it needs from test_airbnb.
predictions = model(train_airbnb).predict(test_airbnb)

# Build the submission in one step (row order follows test_airbnb) and write it out.
kaggle = pd.DataFrame({'Id': test_airbnb['id'], 'Predicted': predictions})
kaggle.to_csv('attempt26submission.csv', index=False)

# Preview goes last: only a cell's final expression is displayed, so a bare
# `kaggle` placed before to_csv() showed nothing.
kaggle.head()