In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from scipy.stats import mode
from sklearn.impute import SimpleImputer
import warnings
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor


In [2]:
# Load the raw training and test tables from CSV files under the relative data/ directory.
train_raw = pd.read_csv("data/train.csv")
test_raw = pd.read_csv("data/test.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [22]:
def drop_initial(df):
    """Return ``df`` without the columns that the rest of the pipeline never reads.

    The input frame is not modified. pandas raises ``KeyError`` when any of the
    listed columns is absent from ``df``.
    """
    unused_columns = (
        "id", "name", "summary", "space",
        "experiences_offered", "access", "host_acceptance_rate", "notes",
        "description", "neighborhood_overview", "interaction", "house_rules",
        "host_id", "host_name", "host_about", "host_location",
        "city", "state", "zipcode", "country",
        "country_code", "host_response_time", "host_neighbourhood", "host_listings_count",
        "host_response_rate", "square_feet", "host_verifications", "maximum_nights",
        "minimum_nights", "is_business_travel_ready", "first_review", "last_review",
    )
    return df.drop(columns=list(unused_columns))
def to_num(df):
    """Convert the ``extra_people`` fee column from currency text to float.

    The "$" sign and "," thousands separators are stripped before the cast,
    so a value such as "$1,250.00" becomes 1250.0 instead of raising.

    Returns a new DataFrame; the frame passed in is left untouched.
    """
    fee = (
        df["extra_people"]
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )
    # assign() builds a new frame and keeps the column in its original position.
    return df.assign(extra_people=fee)
def remove_outliers(data):
    """Return a filtered deep copy of ``data`` with outlier rows removed.

    A row is kept only when every checked column is either missing or inside
    its bound: the upper-bounded columns must be strictly below their limit,
    the review-score columns strictly above 2.
    """
    # (column, bound, True when the bound is an upper limit) in application order.
    rules = [
        ("reviews_per_month", 10, True),
        ("bathrooms", 10, True),
        ("review_scores_cleanliness", 2, False),
        ("review_scores_accuracy", 2, False),
        ("review_scores_location", 2, False),
        ("review_scores_checkin", 2, False),
        ("review_scores_communication", 2, False),
        ("review_scores_value", 2, False),
        ("number_of_reviews", 300, True),
    ]
    kept = data.copy(deep=True)
    for column, bound, is_upper_limit in rules:
        values = kept[column]
        if is_upper_limit:
            within = values < bound
        else:
            within = values > bound
        # Missing values never count as outliers.
        kept = kept[within | values.isnull()]
    return kept
# Feature transformation
def engineer_features(df):
    """Derive model-ready features from a listings frame and return a new frame.

    * ``host_since`` is parsed as a date and replaced by ``since_year``,
      ``since_month`` and ``since_day``.
    * The "t"/"f" flag columns become 1/0. For ``host_is_superhost``,
      ``host_identity_verified``, ``host_has_profile_pic`` and
      ``require_guest_profile_picture`` a missing value is treated as "f";
      the other two flags keep missing values as NaN.
    * ``amenities`` becomes the number of comma-separated entries in the
      frame's own amenities string.
    * ``transit`` becomes 1 when a value is present and 0 when it is missing.
    * Rare ``property_type`` values are collapsed into "Other", "Other_lux"
      or "Other_value".

    The frame passed in is not modified.
    """
    # Split host_since into numeric date parts; drop() yields a new frame, so
    # every assignment below lands on that new frame, not on the caller's.
    since = pd.to_datetime(df["host_since"]).dt
    df = df.drop("host_since", axis=1)
    df["since_year"] = since.year
    df["since_month"] = since.month
    df["since_day"] = since.day

    # Binary "t"/"f" columns -> 1/0.
    tfmap = {"f": 0, "t": 1}
    for column in ("host_is_superhost", "host_identity_verified",
                   "host_has_profile_pic", "require_guest_profile_picture"):
        df[column] = df[column].fillna("f").map(tfmap)
    for column in ("require_guest_phone_verification", "instant_bookable"):
        df[column] = df[column].map(tfmap)

    # Amenity count, computed from THIS frame (not from the global training
    # table), so that test data gets its own counts.
    # Number of entries == number of commas + 1; the surrounding braces do not
    # affect the count. An empty set "{}" therefore still counts as 1, and a
    # missing value stays NaN.
    df["amenities"] = df["amenities"].str.count(",") + 1

    # Only the presence of a transit description is kept.
    df["transit"] = df["transit"].notna().astype(int)

    # Collapse rare property types into three buckets.
    ptmap = {"Villa": "Other_value",
             "Bungalow": "Other",
             "Tiny house": "Other",
             "Aparthotel": "Other_lux",
             "Boat": "Other",
             "Camper/RV": "Other",
             "Tent": "Other",
             "Cottage": "Other",
             "Houseboat": "Other_lux",
             "Cabin": "Other",
             "Chalet": "Other",
             "Timeshare": "Other_lux",
             "Train": "Other_value",
             "Island": "Other_value",
             "Casa particular (Cuba)": "Other_value",
             "Resort": "Other_lux",
             "Hotel": "Other_lux",
             "Hostel": "Other_value"}
    # Assign the result back: an in-place replace() on a column selection is
    # chained assignment and is not guaranteed to update the frame.
    df["property_type"] = df["property_type"].replace(ptmap)
    return df
def filling_values(df):
    """Compute the replacement value for every column that gets imputed.

    Returns a dict mapping column name -> fill value:

    * the most frequent non-missing value for the count/date/market columns,
    * the mean of the non-missing values for the review-score columns,
    * the constant 0 for ``reviews_per_month``.
    """
    mode_columns = ["bathrooms", "bedrooms",
                    "beds", "since_year",
                    "since_month", "since_day",
                    "market"]
    mean_columns = ["review_scores_rating",
                    "review_scores_accuracy",
                    "review_scores_cleanliness",
                    "review_scores_checkin",
                    "review_scores_communication",
                    "review_scores_location",
                    "review_scores_value"]

    filling = {}
    for column in mode_columns:
        # pandas' Series.mode ignores NaN and also works on text columns such
        # as "market"; it returns tied values sorted, so .iloc[0] picks the
        # smallest of the most frequent values.
        most_common = df[column].mode(dropna=True)
        # A column with no observed value at all has no mode; fall back to NaN
        # so that filling with it is a no-op.
        filling[column] = most_common.iloc[0] if len(most_common) else np.nan
    for column in mean_columns:
        filling[column] = df[column].mean()
    filling["reviews_per_month"] = 0
    return filling
def fill_missing(df, fill_dict):
    """Fill missing values column by column and return the filled frame.

    Parameters
    ----------
    df : DataFrame to impute.
    fill_dict : dict mapping column name -> value used for that column's
        missing entries.

    Returns a new DataFrame; ``df`` itself is left untouched.
    Raises ``KeyError`` when ``fill_dict`` names a column that ``df`` lacks.
    """
    filled = df.copy()
    for column, value in fill_dict.items():
        # Assign the result back instead of calling fillna(inplace=True) on a
        # column selection: that form is chained assignment and is not
        # guaranteed to write through to the frame.
        filled[column] = filled[column].fillna(value)
    return filled
def pre_transformation(df):
    """Run the cleaning stages on ``df`` and impute its missing values.

    Returns a tuple ``(prepared_frame, fill_dict)``, where ``fill_dict`` holds
    the fill values computed from the prepared frame itself.
    """
    stages = (drop_initial, to_num, remove_outliers, engineer_features)
    prepared = df
    for stage in stages:
        prepared = stage(prepared)
    fill_dict = filling_values(prepared)
    prepared = fill_missing(prepared, fill_dict)
    return prepared, fill_dict
# Pre-processed training frame used to fit the shared column transformer.
pt = pre_transformation(train_raw)[0]
# Column transformer
# Columns that are one-hot encoded.
categorical = ["neighbourhood_cleansed", "neighbourhood_group_cleansed", "property_type", "room_type", "bed_type", "cancellation_policy",
              "market"]
# Columns that are standardized; every remaining column is passed through unchanged.
std= ["accommodates", "bathrooms", "bedrooms", "beds", "amenities", "guests_included",
      "extra_people", "number_of_reviews", "review_scores_rating", "review_scores_accuracy",
     "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication",
     "review_scores_location", "review_scores_value", "calculated_host_listings_count","since_year",
     "since_month", "since_day"]
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name; try the new spelling first and fall back so the
# cell runs on either side of the rename.
try:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)
clt = ColumnTransformer(
    transformers=[
        ("one-hot", one_hot, categorical),
        ("standardization", StandardScaler(), std)
    ],
    remainder="passthrough"
)
# Fit on the training features only; the target column is excluded.
clt.fit(pt.drop("price", axis = 1))

def preprocessing_train(df = train_raw):
    """Pre-process a training frame and push it through the fitted transformer.

    Parameters
    ----------
    df : raw listings frame. The default is bound to the ``train_raw`` object
        that exists when this function is defined.

    Returns
    -------
    ``(X, y)`` where ``X`` is the transformed feature matrix and ``y`` the
    ``price`` column. When the frame has no ``price`` column, only the
    transformed matrix is returned.
    """
    # One pass is enough; the fill dictionary is not needed here.
    res, _ = pre_transformation(df)
    # Explicit column check instead of a bare `except:` around drop().
    if "price" not in res.columns:
        return clt.transform(res)
    X = clt.transform(res.drop("price", axis = 1))
    y = res.price
    return X, y
def preprocessing_test(df = test_raw, fill_dict = None):
    """Pre-process a test frame and push it through the fitted transformer.

    Parameters
    ----------
    df : raw listings frame. The default is bound to the ``test_raw`` object
        that exists when this function is defined.
    fill_dict : optional dict of column -> fill value. When omitted, the fill
        values are recomputed from ``train_raw`` so the test data is imputed
        with training statistics.

    Returns the transformed feature matrix.
    """
    if fill_dict is None:
        # No module-level fill dictionary exists, so derive it here instead of
        # relying on a name left over from an earlier kernel session.
        fill_dict = pre_transformation(train_raw)[1]
    # NOTE(review): remove_outliers also drops rows from `df`, so the result can
    # have fewer rows than the input — confirm this is intended for test data.
    prepared = engineer_features(remove_outliers(to_num(drop_initial(df))))
    return clt.transform(fill_missing(prepared, fill_dict))

In [23]:
# Train preprocessing: with its default argument this transforms train_raw and
# unpacks the feature matrix X and the price target y.
X, y = preprocessing_train()

In [24]:
# Test preprocessing: with its default argument this transforms test_raw into
# the matrix the fitted models predict on.
test = preprocessing_test()

In [None]:

def tune_lgbm():
    """Grid-search three LightGBM settings with 5-fold cross-validation.

    Uses the module-level ``X`` and ``y``. Returns one row per combination in
    the form ``[mean RMSE, min_data_in_leaf, num_leaves, max_depth]``, ordered
    with min_data_in_leaf varying slowest and max_depth fastest.
    """
    depth_grid = [15,16,17]
    leaf_size_grid = [21, 22, 23,24, 25]
    leaf_count_grid = [45, 50, 55, 60, 65]

    def mean_rmse(leaf_size, leaf_count, depth):
        # The scorer returns negated RMSE, so flip the sign of the fold mean.
        model = lgb.LGBMRegressor(max_depth=depth, min_data_in_leaf=leaf_size, num_leaves=leaf_count)
        fold_scores = cross_val_score(model, X, y, cv=5, scoring="neg_root_mean_squared_error")
        return -(fold_scores.mean())

    return [
        [mean_rmse(leaf_size, leaf_count, depth), leaf_size, leaf_count, depth]
        for leaf_size in leaf_size_grid
        for leaf_count in leaf_count_grid
        for depth in depth_grid
    ]
res_lgbm = tune_lgbm()

In [100]:
res_lgbm  # presumably the best row: min_data_in_leaf=22, num_leaves=45, max_depth=15, RMSE 85.03 — TODO confirm

In [10]:
%%time
def tune_xgb():
    """Grid-search XGBoost ``max_depth`` and ``min_child_weight`` with 5-fold CV.

    Uses the module-level ``X`` and ``y``. Prints a text progress bar after
    each combination and returns one row per combination in the form
    ``[mean RMSE, max_depth, min_child_weight]``.
    """
    depth = [6, 7, 9, 10]
    min_cw = [4, 5, 7, 8]
    total = len(depth) * len(min_cw)
    counter = 0
    res = []
    for i in depth:
        for j in min_cw:
            xgbr = xgb.XGBRegressor(max_depth=i, learning_rate=0.1, n_estimators=100, min_child_weight = j)
            # The scorer returns negated RMSE, so flip the sign of the fold mean.
            res.append([-(cross_val_score(xgbr, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean()), i, j])
            counter+=1
            # Format the ratio directly as a whole percentage; multiplying a
            # rounded float by 100 printed artefacts like 56.00000000000001%.
            print("****"*counter + "----"*(total-counter)+"|"+"{:.0%}".format(counter/total))
    return res
# Run the XGBoost grid search and keep its score table.
res_xgb = tune_xgb()
# NOTE(review): `ring` is not defined or imported in any cell shown above; a
# later cell calls `fun.ring()` — confirm how this name gets into scope.
ring()

KeyboardInterrupt: 

In [12]:
# XGBoost regressor with hand-picked settings.
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`, yet the
# two are given different values here (0.05 vs 0.1) — confirm which is intended.
xgbr = xgb.XGBRegressor(eta = 0.05, max_depth=5, learning_rate=0.1, n_estimators=100, min_child_weight = 5, eval_metric  = "rmse")

In [17]:
# NOTE(review): `fun` is not among the libraries imported in the first cell —
# presumably a local helper module; confirm it is available on a fresh kernel.
import fun
fun.ring()