#### Standard imports

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
plt.style.use("ggplot")

# Modeling imports
from sklearn.linear_model import LinearRegression, RANSACRegressor, HuberRegressor,TheilSenRegressor, Lasso, Ridge
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, RobustScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

#### Load cleaned data

In [4]:
# load clean data
train_df = pd.read_csv('../data/train_cleaned.csv')
store_df = pd.read_csv('../data/store_711_en.csv')
bkk_zone_df = pd.read_csv('../data/bkk_zone.csv')
store_zone = pd.merge(left=store_df, right=bkk_zone_df)

In [5]:
# map Bangkok district to zone name
train_zone_df = pd.merge(left=train_df, right=bkk_zone_df.rename({'district_en': 'district'}, axis=1))

### Model 1. Predict Housing Price

#### Get dummy columns

In [8]:
train_dummy_df = pd.get_dummies(columns=['property_type', 'province', 'district'], data=train_df, drop_first=True)

In [9]:
train_dummy_df.sort_index(inplace=True)

#### Train-Test split data

Select features use for training and split data for training and test with ratio 80/20

In [12]:
features = [c for c in train_dummy_df.columns if re.search(r'bedrooms|baths|floor_area|floor_level|property|province_|district_', c)]
X = train_dummy_df[features]
y = train_dummy_df['price']

In [13]:
# split data for training and test with ratio 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)

In [14]:
def evaluate_model(model, X, X_train, X_test, y, y_train, y_test, y_pred):
    score = cross_val_score(model, X, y, cv=5).mean()
    # show model performance
    name = str(model)
    print(f"Cross Value Score of {name}: {score:.4f}")
    print(f"R^2 of {name}: {r2_score(y_test, y_pred):.4f}")
    print(f"RMSE of {name}: {root_mean_squared_error(y_test, y_pred):.4f}")

#### Training models & compare result

In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# show model performance
evaluate_model(lr, X, X_train, X_test, y, y_train, y_test, y_pred)

Cross Value Score of LinearRegression(): 0.6253
R^2 of LinearRegression(): 0.6303
RMSE of LinearRegression(): 1332687.9861


In [17]:
# scale X values
sc = RobustScaler()
X_sc = sc.fit_transform(X)

In [18]:
lasso = Lasso(alpha=0.01, max_iter=1000, tol=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

evaluate_model(lasso, X_sc, X_train, X_test, y, y_train, y_test, y_pred)

  model = cd_fast.enet_coordinate_descent(


Cross Value Score of Lasso(alpha=0.01, tol=0.1): 0.6253
R^2 of Lasso(alpha=0.01, tol=0.1): 0.6303
RMSE of Lasso(alpha=0.01, tol=0.1): 1332688.0347


### Model 2. Improve predict price with number of nearby 7-11 store

##### Feature Engineering

In [21]:
# filter only property type Condo in Bangkok
train_store_df = train_df[(train_df['province'] == 'Bangkok') & (train_df['property_type'] == 'Condo')]

# create dummy for property_type, province and district
train_store_df = pd.get_dummies(columns=['property_type', 'province', 'district'], data=train_store_df, drop_first=True)

##### Find nearby 7-11 within 1 km. distance from Condo in Bangkok

In [23]:
# distance for latitude and longitude of 0.01 degress ~ 1 km.

# round latitude and longitude for training data
train_store_df['round_longitude'] = train_store_df['longitude'].map(lambda x: round(x, 2))
train_store_df['round_latitude'] = train_store_df['latitude'].map(lambda x: round(x, 2))

# round latitude and longitude for 7-11 store data
store_df['round_longitude'] = store_df['longitude'].map(lambda x: round(x, 2))
store_df['round_latitude'] = store_df['latitude'].map(lambda x: round(x, 2))

# do grid search 7-11 store within range ~1 km.
for lat in np.arange(round(train_store_df['latitude'].min(), 2), round(train_store_df['latitude'].max(),2), 0.01):
    for lng in np.arange(round(train_store_df['longitude'].min(), 2), round(train_store_df['longitude'].max(),2), 0.01):
        appx_lat = round(lat,2)
        appx_lng = round(lng,2)
        #print(appx_lat, appx_lng)
        house_df = train_store_df[(train_store_df['round_longitude'] == appx_lng) & (train_store_df['round_latitude'] == appx_lat)]
        found_store_df = store_df[(store_df['round_longitude'] == appx_lng) & (store_df['round_latitude'] == appx_lat)]
        if len(house_df['id']) and len(found_store_df):
            #print(appx_lat, appx_lng, set(house_df['id']), len(found_store_df))
            train_store_df.loc[(train_store_df['round_longitude'] == appx_lng) & (train_store_df['round_latitude'] == appx_lat), 'nearby_711_store_num'] \
                 = len(found_store_df)


#### Train-Test split data

In [25]:
train_store_df['nearby_711_store_num'] = train_store_df['nearby_711_store_num'].fillna(0)
features = [c for c in train_store_df.columns if re.search(r'nearby_711_store_num|total_units|bedrooms|baths|floor_area|floor_level|property|province_|district_', c)]
X = train_store_df[features]
y = train_store_df['price']

In [26]:
# split data for training and test with ratio 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)

#### Training models & compare result

In [28]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

evaluate_model(lr, X, X_train, X_test, y, y_train, y_test, y_pred)

Cross Value Score of LinearRegression(): 0.6636
R^2 of LinearRegression(): 0.6542
RMSE of LinearRegression(): 1302903.3774


In [29]:
sc = RobustScaler()
X_sc = sc.fit_transform(X)

In [30]:
lasso = Lasso(alpha=0.01, max_iter=5000, fit_intercept=False, tol=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

evaluate_model(lasso, X_sc, X_train, X_test, y, y_train, y_test, y_pred)

Cross Value Score of Lasso(alpha=0.01, fit_intercept=False, max_iter=5000, tol=0.1): 0.6618
R^2 of Lasso(alpha=0.01, fit_intercept=False, max_iter=5000, tol=0.1): 0.4760
RMSE of Lasso(alpha=0.01, fit_intercept=False, max_iter=5000, tol=0.1): 1604018.5605


### Model 3. Predict under served 7-11 store

In [32]:
features = [c for c in train_store_df.columns if re.search(r'total_units|bedrooms|baths|floor_area|floor_level|property|province_|district_', c)]
X = train_store_df[features]
y = train_store_df['nearby_711_store_num'].fillna(0)

In [33]:
# split data for training and test with ratio 80/20
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)

In [34]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
evaluate_model(lr, X_sc, X_train, X_test, y, y_train, y_test, y_pred)

Cross Value Score of LinearRegression(): 1.0000
R^2 of LinearRegression(): 0.4740
RMSE of LinearRegression(): 4.3522


In [35]:
sc = RobustScaler()
X_sc = sc.fit_transform(X)

In [36]:
lasso = Lasso(alpha=0.01, max_iter=1000, tol=0.1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)

evaluate_model(lasso, X_sc, X_train, X_test, y, y_train, y_test, y_pred)

Cross Value Score of Lasso(alpha=0.01, tol=0.1): 0.4343
R^2 of Lasso(alpha=0.01, tol=0.1): 0.4646
RMSE of Lasso(alpha=0.01, tol=0.1): 4.3911
