In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.ensemble import *
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.decomposition import PCA
from scipy.stats import mode
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the training data from disk and preview the first rows.
airbnb = pd.read_csv("data/train.csv")
airbnb.head()

Unnamed: 0,id,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,...,review_scores_communication,review_scores_location,review_scores_value,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,reviews_per_month
0,22267382,Modern and Cozy Large Studio in Brooklyn,Modern large studio with new amenities and app...,Our place is a little quiet sanctuary in the h...,Modern large studio with new amenities and app...,none,"BAM, Barclays, Brooklyn City Point, Fort Green...",,"Subway: 2,3,4,5,A,C,B,Q,G",Washer/Dryer Dishwasher Internet Gym Roof Top ...,...,10.0,10.0,10.0,t,f,flexible,f,f,1,0.59
1,2473861,Royal Harlem TRIPLEX Home 5 Beds,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,none,HARLEM is a piece of real NY history overflowi...,HARLEM RESTAURANTS Red Rooster Harlem -- excel...,PUBLIC TRANSPORTATION: Conveniently near all p...,The WHOLE ENTIRE HOUSE,...,9.0,9.0,9.0,t,f,moderate,f,f,3,2.47
2,25079703,Sunny East Village Studio,"Clean, hip and well designed sun drenched East...",This is a rare East Village studio with it's h...,"Clean, hip and well designed sun drenched East...",none,East Village is one of the last remaining neig...,,,You'll have access to the entire space - it's ...,...,10.0,10.0,10.0,f,f,moderate,f,f,1,0.89
3,9342478,"Beautiful, airy, light-filled room","Private, spacious, comfortable room in 2-bed f...","Big closet, two big windows, tall ceiling and ...","Private, spacious, comfortable room in 2-bed f...",none,One block from Morgan L stop. Super cool area....,,,,...,,,,f,f,flexible,f,f,1,
4,4866426,Private Room in Prime Brooklyn Spot,"Comfy, quiet and big private room in a three b...",This big old apartment that we love and take c...,"Comfy, quiet and big private room in a three b...",none,I absolutely love this neighborhood - right at...,Just a note about the space: The window in you...,Super convenient to almost all subway lines. A...,Your room has a very comfortable queen sized b...,...,10.0,10.0,10.0,f,f,flexible,f,f,1,3.14


In [None]:
# Column dtypes and non-null counts of the raw training frame.
airbnb.info()

In [None]:
# Summary statistics of the raw training frame.
airbnb.describe()

In [None]:
# Scratch list of candidate columns to drop. It is not used by the statement
# at the end of this cell, and it differs from the list hard-coded inside
# drop_initial (this one also names "host_has_profile_pic", "market",
# "calculated_host_listings_count" and "require_guest_profile_picture").
cols = ["id", "name", "summary", "space", "experiences_offered", 
            "access", "host_acceptance_rate", "notes",
            "description", "neighborhood_overview", "interaction",
            "house_rules", "host_id", "host_name", "host_about",
            "host_location", "city", "state", "zipcode", "country",
            "country_code", "host_response_time", "host_neighbourhood",
            "host_has_profile_pic", "host_listings_count", "market",
            "host_response_rate", "square_feet", "host_verifications",
            "maximum_nights", "calculated_host_listings_count",
            "require_guest_profile_picture", "minimum_nights",
            "is_business_travel_ready", "first_review", "last_review",
           ]
# Show the listings that report a square footage, smallest first.
airbnb[airbnb["square_feet"].notnull()].sort_values(by = "square_feet")

In [3]:
# Initial pass: drop features with too many missing values, long free text,
# or values that are unique to a host.
def drop_initial(df):
    """Return a copy of ``df`` without the columns excluded from modelling.

    Every listed column must be present; a missing one raises ``KeyError``.
    The input frame is not modified.
    """
    listing_text = [
        "name", "summary", "space", "description", "neighborhood_overview",
        "notes", "access", "interaction", "house_rules", "experiences_offered",
    ]
    host_fields = [
        "host_id", "host_name", "host_about", "host_location",
        "host_response_time", "host_response_rate", "host_acceptance_rate",
        "host_neighbourhood", "host_listings_count", "host_verifications",
    ]
    location_fields = ["city", "state", "zipcode", "country", "country_code"]
    remaining = [
        "id", "square_feet", "maximum_nights", "minimum_nights",
        "is_business_travel_ready", "first_review", "last_review",
    ]
    unwanted = listing_text + host_fields + location_fields + remaining
    return df.drop(columns=unwanted)

In [4]:
def to_num(df):
    """Return a copy of ``df`` with ``extra_people`` converted to float.

    The column is expected to hold currency strings such as ``"$25.00"``.
    Dollar signs and thousands separators are stripped before conversion,
    so ``"$1,000.00"`` becomes ``1000.0``. Missing values stay NaN.

    The input frame is left untouched, and a column that is already numeric
    is only cast to float, so the function is safe to call more than once.
    """
    out = df.copy()
    fee = out["extra_people"]
    if pd.api.types.is_numeric_dtype(fee):
        # Already converted (e.g. the cell was re-run): nothing to strip.
        out["extra_people"] = fee.astype(float)
        return out
    out["extra_people"] = (
        fee.str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
        .astype(float)
    )
    return out

In [5]:
def remove_outliers(data):
    """Return a deep copy of ``data`` with out-of-range rows removed.

    A row is kept for a given column when its value is inside the allowed
    range or is missing. Rows must pass every rule. The original row labels
    are preserved (the index is not reset).
    """
    # (column, bound, keep_below): keep value < bound when keep_below is
    # True, otherwise keep value > bound.
    rules = [
        ("reviews_per_month", 10, True),
        ("bathrooms", 10, True),
        ("review_scores_cleanliness", 2, False),
        ("review_scores_accuracy", 2, False),
        ("review_scores_location", 2, False),
        ("review_scores_checkin", 2, False),
        ("review_scores_communication", 2, False),
        ("review_scores_value", 2, False),
        ("number_of_reviews", 300, True),
    ]
    kept = data.copy(deep=True)
    for column, bound, keep_below in rules:
        values = kept[column]
        in_range = values < bound if keep_below else values > bound
        kept = kept[in_range | values.isnull()]
    return kept

In [None]:
# for col in cur.columns:
#     if isinstance(cur[col][0], str) :
#         print("----------%s----------"%col)
#         print(cur[col].value_counts(dropna=False))

In [6]:
# Feature transformation
def engineer_features(df):
    """Return a new frame with engineered features derived from ``df``.

    Transformations applied:
      * ``host_since`` is parsed as a date and replaced by ``since_year``,
        ``since_month`` and ``since_day``.
      * ``"t"``/``"f"`` flag columns are mapped to 1/0. For the host flags and
        ``require_guest_profile_picture`` a missing value counts as ``"f"``.
      * ``amenities`` (a ``{a,b,c}`` style string) becomes the number of
        comma-separated entries. An empty ``{}`` still counts as 1, as in the
        original implementation.
      * ``transit`` becomes 1 when a description is present, else 0.
      * Rare ``property_type`` values are merged into ``"Other"``.

    The input frame is not modified: ``drop`` returns a new frame and all
    later assignments go to that copy.
    """
    # transform host_since
    since = pd.to_datetime(df["host_since"]).dt
    df = df.drop("host_since", axis=1)
    df["since_year"] = since.year
    df["since_month"] = since.month
    df["since_day"] = since.day

    # transform binary columns
    tfmap = {"f": 0, "t": 1}
    missing_means_false = (
        "host_is_superhost",
        "host_identity_verified",
        "host_has_profile_pic",
        "require_guest_profile_picture",
    )
    for col in missing_means_false:
        df[col] = df[col].fillna("f").map(tfmap)
    for col in ("require_guest_phone_verification", "instant_bookable"):
        df[col] = df[col].map(tfmap)

    # transform amenities
    # Count from the frame being transformed, not from the global training
    # frame, so that test data gets its own amenity counts.
    df["amenities"] = (
        df["amenities"]
        .str.replace("{", "", regex=False)
        .str.replace("}", "", regex=False)
        .str.split(",")
        .apply(len)
    )
    df["transit"] = df["transit"].notnull().astype(int)

    # transform property types
    rare_types = [
        "Villa", "Bungalow", "Tiny house", "Aparthotel", "Boat", "Camper/RV",
        "Tent", "Cottage", "Houseboat", "Cabin", "Chalet", "Timeshare",
        "Train", "Island", "Casa particular (Cuba)",
    ]
    ptmap = dict.fromkeys(rare_types, "Other")
    # Assign the result instead of a chained inplace replace, which may act
    # on a temporary object and leave the frame unchanged.
    df["property_type"] = df["property_type"].replace(ptmap)

    # A merge of the cancellation policies "super_strict_60",
    # "super_strict_30" and "long_term" into "strict" was tried here and is
    # currently disabled.
    return df

In [None]:
# Exploratory call on the raw training frame; the returned frame is shown as the cell output.
engineer_features(airbnb)

In [None]:
# Scratch check of NaN truthiness: bool(np.nan) is True, so this prints 1.
if np.nan:
    print(1)

In [7]:
def _fill_with_mode(values):
    """Fill NaNs in ``values`` with its most frequent non-null value.

    Ties resolve to the smallest value. A Series with no non-null value is
    returned unchanged.
    """
    modes = values.mode(dropna=True)
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


def fill_missing(df):
    """Return a copy of ``df`` with missing values imputed.

    * ``bathrooms`` gets the most frequent value within the same ``room_type``.
    * ``bedrooms`` and ``beds`` get the most frequent value within the same
      ``accommodates``.
    * ``since_year``, ``since_month``, ``since_day`` and ``market`` get the
      column-wide most frequent value.
    * The seven ``review_scores_*`` columns get the column mean.
    * ``reviews_per_month`` gets 0.

    Modes are computed with pandas rather than ``scipy.stats.mode``, whose
    return shape and accepted dtypes vary across SciPy versions. The input
    frame is not modified.
    """
    out = df.copy()

    grouped_targets = (
        ("bathrooms", "room_type"),
        ("bedrooms", "accommodates"),
        ("beds", "accommodates"),
    )
    for target, key in grouped_targets:
        out[target] = out.groupby(key)[target].transform(_fill_with_mode)

    for col in ("since_year", "since_month", "since_day", "market"):
        out[col] = _fill_with_mode(out[col])

    tofill = [
        "review_scores_rating",
        "review_scores_accuracy",
        "review_scores_cleanliness",
        "review_scores_checkin",
        "review_scores_communication",
        "review_scores_location",
        "review_scores_value",
    ]
    for col in tofill:
        out[col] = out[col].fillna(out[col].mean())

    out["reviews_per_month"] = out["reviews_per_month"].fillna(0)
    return out

In [8]:
def pre_transformation(df):
    """Run the cleaning steps on ``df`` in order and return the result.

    Order: drop_initial -> to_num -> remove_outliers -> engineer_features
    -> fill_missing. Because remove_outliers filters rows, the output can
    have fewer rows than the input.
    """
    return (
        drop_initial(df)
        .pipe(to_num)
        .pipe(remove_outliers)
        .pipe(engineer_features)
        .pipe(fill_missing)
    )

In [None]:
# Run the full pre-transformation on the training frame and display the result.
pre_transformation(airbnb)

In [None]:
# Same call as the previous cell (duplicate exploratory run).
pre_transformation(airbnb)

In [9]:
# Keep the pre-transformed training frame; later cells read it as `pt`.
pt = pre_transformation(airbnb)

In [11]:
import smogn

In [40]:
# Resample the pre-transformed frame with smogn.smoter, using "price" as the target.
# NOTE: very slow — the recorded progress output shows roughly 8 hours for the
# dist_matrix stage on 5092 items.
cleaned = smogn.smoter(pt.reset_index(), "price")

dist_matrix: 100%|###############################################################| 5092/5092 [8:05:06<00:00,  5.72s/it]
synth_matrix: 100%|################################################################| 5092/5092 [01:15<00:00, 67.26it/s]
r_index: 100%|####################################################################| 1366/1366 [00:10<00:00, 131.87it/s]


In [44]:
# Persist the resampled frame so the slow smoter step need not be repeated.
cleaned.to_csv("data/cleaned.csv")

In [12]:
# Column transformer
# One-hot encode the categorical columns and standardize the numeric ones;
# all remaining columns are passed through unchanged (remainder="passthrough").
categorical = ["neighbourhood_cleansed", "neighbourhood_group_cleansed", "property_type", "room_type", "bed_type", "cancellation_policy",
              "market"]
std= ["accommodates", "bathrooms", "bedrooms", "beds", "amenities", "guests_included",
      "extra_people", "number_of_reviews", "review_scores_rating", "review_scores_accuracy",
     "review_scores_cleanliness", "review_scores_checkin", "review_scores_communication",
     "review_scores_location", "review_scores_value", "calculated_host_listings_count","since_year",
     "since_month", "since_day"]
clt = ColumnTransformer(
    transformers=[
        # handle_unknown="ignore": categories unseen during fit encode as all zeros.
        # NOTE(review): newer scikit-learn releases rename `sparse` to
        # `sparse_output` — confirm against the installed version.
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False), categorical),
        ("standardization", StandardScaler(), std)
    ],
    remainder="passthrough"
)
# Fit on the pre-transformed training features (target column removed).
clt.fit(pt.drop("price", axis = 1))



ColumnTransformer(remainder='passthrough',
                  transformers=[('one-hot',
                                 OneHotEncoder(handle_unknown='ignore',
                                               sparse=False),
                                 ['neighbourhood_cleansed',
                                  'neighbourhood_group_cleansed',
                                  'property_type', 'room_type', 'bed_type',
                                  'cancellation_policy', 'market']),
                                ('standardization', StandardScaler(),
                                 ['accommodates', 'bathrooms', 'bedrooms',
                                  'beds', 'amenities', 'guests_included',
                                  'extra_people', 'number_of_reviews',
                                  'review_scores_rating',
                                  'review_scores_accuracy',
                                  'review_scores_cleanliness',
                                  're

In [13]:
def preprocessing(df):
    """Pre-transform ``df`` and encode it with the fitted global ``clt``.

    Returns:
        ``(features, price)`` when the frame has a ``price`` column, where
        ``price`` is the Series from the pre-transformed frame and keeps its
        row labels. When there is no ``price`` column, returns only the
        encoded feature matrix.

    Note:
        pre_transformation includes remove_outliers, so rows can be filtered
        out and the output may have fewer rows than ``df``.
    """
    pt = pre_transformation(df)
    # Check for the target explicitly instead of catching every exception,
    # so genuine errors are no longer swallowed.
    if "price" not in pt.columns:
        return clt.transform(pt)
    X = pt.drop("price", axis=1)
    y = pt.price
    return clt.transform(X), y

In [28]:
# Wrap the encoded feature matrix in a DataFrame (columns are positional integers) and display it.
data = pd.DataFrame(preprocessing(airbnb)[0])
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,283,284,285,286,287,288,289,290,291,292
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.112069,-0.400674,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.59
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.705233,-0.173874,1.0,0.0,1.0,1.0,1.0,0.0,0.0,2.47
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.613885,-0.514075,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.89
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.613885,-1.421277,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.00
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.506302,-0.514075,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.203418,0.279727,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.44
33279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.402350,0.619928,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2.28
33280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.506302,-1.421277,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.00
33281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.099466,-1.081076,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.10


In [34]:
# Attach the target as a new column. `data` was built from a bare array and
# has a fresh 0..n-1 index, while the price Series keeps the original row
# labels (remove_outliers filters rows without resetting the index). Assign
# by position so prices are not misaligned or turned into NaN by label
# alignment.
data["PRICE"] = preprocessing(airbnb)[1].values
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,285,286,287,288,289,290,291,292,price,PRICE
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.59,145.0,145.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,1.0,0.0,0.0,2.47,175.0,175.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.89,180.0,180.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.00,42.0,42.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,3.14,80.0,80.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33278,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.44,75.0,75.0
33279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,2.28,400.0,400.0
33280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.00,350.0,350.0
33281,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.10,595.0,595.0


In [38]:
# Run smogn.smoter on the encoded matrix with "PRICE" as the target; the result is only displayed, not stored.
smogn.smoter(data.reset_index(), 'PRICE')

294

# MODEL FITTING

In [None]:
# Build a training set WITHOUT the remove_outliers step.
noutliers = drop_initial(airbnb)
noutliers = to_num(noutliers)
noutliers = engineer_features(noutliers)
noutliers = fill_missing(noutliers)
# Column transformer
# NOTE(review): this rebinds the global `categorical` and `clt` defined in an
# earlier cell. The list here omits "market" and there is no standardization
# step, so "market" reaches the passthrough remainder un-encoded — confirm
# this is intended.
categorical = ["neighbourhood_cleansed", "neighbourhood_group_cleansed", "property_type", "room_type", "bed_type", "cancellation_policy"]
clt = ColumnTransformer(
    transformers=[
        ("one-hot", OneHotEncoder(handle_unknown="ignore", sparse=False), categorical),
    ],
    remainder="passthrough"
)
clt.fit(noutliers.drop("price", axis = 1))
X_no = clt.transform(noutliers.drop("price", axis = 1))
y_no = noutliers.price

In [None]:
from sklearn import metrics
import xgboost as xgb
import lightgbm as lgb
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor


In [None]:

# Grid-search XGBoost over tree depth and minimum child weight, scoring each
# pair by mean 5-fold cross-validated RMSE. Each row of `res` is
# [max_depth, min_child_weight, rmse].
best_depth = [5, 6, 7, 8, 9]
best_min_cw = [4, 5, 6]
res = []
for i in best_depth:
    # Iterate the child-weight candidates here, not the depth list again.
    for j in best_min_cw:
        xgbr = xgb.XGBRegressor(max_depth=i, learning_rate=0.1, n_estimators=100, min_child_weight=j)
        rmse = -(cross_val_score(xgbr, X_no, y_no, cv=5, scoring="neg_root_mean_squared_error").mean())
        res.append([i, j, rmse])


In [None]:
# Display the grid-search results collected so far.
res

In [None]:

# Grid-search LightGBM over min_data_in_leaf, num_leaves and max_depth, scoring
# each combination by mean 5-fold cross-validated RMSE. Each row of `res` is
# [min_data_in_leaf, num_leaves, max_depth, rmse].
# NOTE(review): this cell reads `X` and `y`, which are only assigned in a later
# cell (`X, y = preprocessing(airbnb)`); on a fresh top-to-bottom run they are
# undefined here — confirm whether X_no / y_no were intended.
res = []
best_max_dp = [10,11,12,13]
best_min_data_in_leaf = [16, 17, 18,19, 20]
num_leaves = [50,55,60, 65, 70]
for i in best_min_data_in_leaf:
    for j in num_leaves:
        for k in best_max_dp:
            gbm = lgb.LGBMRegressor(max_depth=k, min_data_in_leaf = i, num_leaves = j)
            res.append([i, j,k, -(cross_val_score(gbm, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean())])

In [None]:
# Pick the grid-search row with the lowest score (the score is the last entry
# of each row). Using min() removes the hard-coded 1000 ceiling, which
# silently yielded 0 whenever every score was >= 1000. The fallbacks for an
# empty `res` keep the previous values (best = 1000, best1 = 0).
best1 = min(res, key=lambda row: row[-1]) if res else 0
best = best1[-1] if res else 1000
best1

In [None]:

# Scan 10 Ridge alphas between 5.7 and 5.8 and record the mean 5-fold
# cross-validated RMSE for each on the no-outlier-removal data (X_no, y_no).
scr = []
alpha = []
for i in np.linspace(5.7, 5.8, 10):
    ridge = Ridge(alpha = i)
    score = -(cross_val_score(ridge, X_no, y_no, cv = 5, scoring="neg_root_mean_squared_error").mean())
    scr.append(score)
    alpha.append(i)
# RMSE against alpha; the cell then displays the list of alphas tried.
plt.scatter(x=alpha, y = scr)
alpha

In [None]:

# Compare random-forest depths by mean 5-fold cross-validated RMSE.
depth = [2, 3, 4]
res = []
for d in depth:
    # Use the loop's depth; a fixed max_depth would fit the same model each
    # time and make the plot below meaningless.
    rdf = RandomForestRegressor(n_estimators=50, max_depth=d)
    res.append(-(cross_val_score(rdf, X, y, cv=5, scoring="neg_root_mean_squared_error").mean()))
plt.scatter(depth, res)

In [None]:

# AdaBoost over Ridge(alpha=30) base learners; the cell displays the mean 5-fold CV RMSE.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator` — confirm against the installed version.
ada = AdaBoostRegressor(base_estimator=Ridge(alpha=30), n_estimators=10, learning_rate=0.5)
-(cross_val_score(ada, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean())

In [None]:
# In-sample check: fit on (X, y), then scatter predictions (x axis) against the targets (y axis).
plt.scatter(x=ada.fit(X, y).predict(X), y=y)
# Red reference line from (0, 0) to (2000, 2000), i.e. prediction == target.
plt.plot([0, 2000], [0, 2000], marker = "o", zorder = 3, color="red")

In [None]:
def pred_plot(model):
    """Fit ``model`` on the global ``X``/``y`` and plot predictions vs. targets.

    Predictions are made on the same data the model was fitted on, so the
    plot is an in-sample diagnostic. A red reference line from (0, 0) to
    (2000, 2000) marks prediction == target.
    """
    model.fit(X, y)
    predicted = model.predict(X)
    plt.scatter(x=predicted, y=y)
    line_ends = [0, 2000]
    plt.plot(line_ends, line_ends, marker="o", zorder=3, color="red")

In [None]:
# In-sample prediction plot for a 50-tree random forest with max_depth=14.
pred_plot(RandomForestRegressor(n_estimators=50, max_depth=14))


In [None]:

# In-sample prediction plot for the XGBoost configuration below.
pred_plot(xgb.XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=100, min_child_weight = 5))

In [None]:
# In-sample predictions of `mod`.
# NOTE(review): `mod` is only assigned in a later cell, so this fails on a fresh top-to-bottom run.
mod.predict(X)

In [None]:
# Scatter `mod`'s in-sample predictions against the targets, with a red
# prediction == target reference line. (`mod` comes from a later cell.)
plt.scatter(x=mod.predict(X), y = y)
plt.plot([0, 2000], [0, 2000], marker = "o", zorder = 3, color="red")

In [None]:
# NOTE(review): `test` is only assigned in later cells, where it is first the
# result of test_preprocessing(final) and later an already-encoded matrix —
# confirm what this call is meant to receive.
preprocessing(test)

In [None]:
# Load the test set and preview the first rows.
final = pd.read_csv("data/test.csv")
final.head()

In [None]:

# Encoded training features and target, produced with whichever `clt` is currently bound.
X, y = preprocessing(airbnb)

In [None]:
# NOTE(review): `test_preprocessing` is not defined anywhere in the visible
# notebook — presumably a since-deleted helper; confirm or replace.
test = test_preprocessing(final)

In [None]:
# Fit the chosen XGBoost configuration on the full training data.
import xgboost as xgb
xgbr = xgb.XGBRegressor(max_depth=6, learning_rate=0.1, n_estimators=100, min_child_weight = 5)
mod = xgbr.fit(X, y)

In [None]:
# Predict on the encoded test matrix and display the result.
# Note: this rebinds `res`, which earlier cells used for grid-search results.
res = mod.predict(test)
res

In [None]:
# Encode the test set step by step (no remove_outliers, so no rows are filtered out).
# NOTE(review): unlike the other test-set pipelines in this notebook, this one
# skips fill_missing, so missing values reach clt.transform — confirm.
test = drop_initial(final)
test = to_num(test)
test = engineer_features(test)
test = clt.transform(test)

In [None]:
def to_csv(model, df = final):
    """Predict on ``df`` with ``model`` and write the result to prediction.csv.

    Args:
        model: fitted estimator exposing ``predict``.
        df: raw test frame with an ``id`` column. Defaults to the global
            ``final`` as it was bound when this function was defined.

    The frame goes through drop_initial, to_num, engineer_features and
    fill_missing (no outlier removal, so every row gets a prediction) and is
    then encoded with the global ``clt``. The output file has ``id`` as its
    index and a single ``Predicted`` column.
    """
    # Use the frame that was passed in, not the global test frame.
    test = drop_initial(df)
    test = to_num(test)
    test = engineer_features(test)
    test = fill_missing(test)
    test = clt.transform(test)
    prediction = df[["id"]].set_index("id")
    prediction["Predicted"] = model.predict(test)
    prediction.to_csv("prediction.csv")

In [None]:
# to_csv requires a fitted model; calling it with no arguments raises
# TypeError. Pass the XGBoost model fitted above as `mod`.
to_csv(mod)

In [None]:
# Write the predictions held in `res` to disk, indexed by listing id.
prediction = final[["id"]].set_index("id")
prediction["Predicted"] = res
prediction.to_csv("data/prediction.csv")

In [41]:
# Encode the test set (no outlier removal, so every row gets a prediction).
# NOTE: the recorded output of this cell is "NameError: name 'final' is not
# defined" — the cell that reads data/test.csv must be run first.
test = drop_initial(final)
test = to_num(test)
test = engineer_features(test)
test = fill_missing(test)
test = clt.transform(test)
prediction = final[["id"]].set_index("id")
# Weighted blend of three models fitted on (X, y): 0.2 Ridge + 0.4 LightGBM + 0.4 XGBoost.
ridge_pred = Ridge(alpha = 5.75).fit(X, y).predict(test)*0.2
lgb_pred=lgb.LGBMRegressor(max_depth=17, min_data_in_leaf = 23).fit(X, y).predict(test)*0.4
xgb_pred =xgb.XGBRegressor(max_depth=7, learning_rate=0.1, n_estimators=100, min_child_weight = 5).fit(X, y).predict(test)*0.4
prediction["Predicted"] = lgb_pred+xgb_pred+ridge_pred
prediction.to_csv("output/prediction.csv")

NameError: name 'final' is not defined

In [None]:
# Display the raw test frame.
final

In [None]:
# Default Ridge fitted on the no-outlier-removal data, predicting on `test`.
# NOTE(review): `test` must have been encoded with the same `clt` that
# produced X_no, otherwise the feature columns will not match — confirm.
Ridge().fit(X_no, y_no).predict(test)

In [None]:
# Count the missing values per remaining column in the test set.
drop_initial(final).isnull().sum()

In [None]:
def tune_lgbm():
    res = []
    best_max_dp = [8, 12, 16]
    best_min_data_in_leaf = [10, 13, 16, 19, 22]
    num_leaves = [50, 60, 70, 80]
    for i in best_min_data_in_leaf:
        for j in num_leaves:
            for k in best_max_dp:
                gbm = lgb.LGBMRegressor(max_depth=k, min_data_in_leaf = i, num_leaves = j)
                res.append([i, j,k, -(cross_val_score(gbm, X, y, cv = 5, scoring="neg_root_mean_squared_error").mean())])