In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load ./data/cleaned_data.csv into the working DataFrame used by the cells below.
df = pd.read_csv(filepath_or_buffer='./data/cleaned_data.csv')

In [5]:
# Translate the free-text floor labels into the "<n>. Kat" form so the column
# uses a single vocabulary. The 'nan' entry is the default for missing floors.
replace_dict = {
    'Kot 2': '-2. Kat',
    'Kot 1': '-1. Kat',
    'Yüksek Giriş': '1. Kat',
    'Ara Kat': '3. Kat',
    'En Üst Kat': '5. Kat',
    'Bahçe Katı': '0. Kat',
    'Yarı Bodrum': '0. Kat',
    'Bodrum': '0. Kat',
    'Kot 3': '-3. Kat',
    'Çatı Katı': '5. Kat',
    'Zemin': '0. Kat',
    'Giriş Katı': '0. Kat',
    'Villa Katı': '0. Kat',
    '21 ve üzeri': '21. Kat',
    'Bodrum ve Zemin': '0. Kat',
    'Asma Kat': '1. Kat',
    'Tripleks': '0. Kat',
    'Teras Katı': '5. Kat',
    'nan': '2. Kat',
}

# read_csv turns empty / 'nan' cells into real NaN, which the string key 'nan'
# can never match inside replace(). Fill missing values with the dict's default
# first; otherwise astype(str) would leave a literal 'nan' category behind.
df['floor'] = (
    df['floor']
    .fillna(replace_dict['nan'])
    .replace(replace_dict)
    .astype(str)
)
# df['floor'] = df['floor'].apply(lambda x: x.split('.')[0]).astype(int)

In [6]:
def _tukey_fence(values, k=1.5):
    """Return (Q1 - k*IQR, Q3 + k*IQR) for a numeric Series."""
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr


# Per-column outlier fences for every numeric column; min_values / max_values
# are positionally aligned with `columns`.
# Note: a column whose IQR is 0 gets min == max, so only rows equal to that one
# value lie inside its fence.
columns = df.select_dtypes(include=[np.number]).columns
min_values, max_values = [], []
for column in columns:
    min_value, max_value = _tukey_fence(df[column])
    min_values.append(min_value)
    max_values.append(max_value)
    print(f"Column: {column}, min: {min_value}, max: {max_value}")

# source: https://github.com/canbula/KiraTahmini/blob/master/istatistiksel_inceleme.ipynb

Column: price, min: 5000.0, max: 21000.0
Column: room, min: 0.5, max: 4.5
Column: living_room, min: 1.0, max: 1.0
Column: area, min: 45.0, max: 245.0
Column: age, min: -20.0, max: 36.0


In [7]:
# Keep only the rows that fall inside every column's [min, max] fence
# (both bounds inclusive).
for column, lower, upper in zip(columns, min_values, max_values):
    df = df[df[column].between(lower, upper)]

# source: https://github.com/canbula/KiraTahmini/blob/master/istatistiksel_inceleme.ipynb

In [8]:
# df = df[df['price'] >= 3000]
# Summary statistics of the numeric columns after the outlier filter.
df.describe()

Unnamed: 0,price,room,living_room,area,age
count,1154.0,1154.0,1154.0,1154.0,1154.0
mean,12908.461005,2.721837,1.0,140.672444,9.433276
std,3109.757986,0.696734,0.0,35.084537,8.980241
min,5000.0,1.0,1.0,50.0,0.0
25%,11000.0,2.0,1.0,120.0,2.0
50%,13000.0,3.0,1.0,145.0,6.0
75%,15000.0,3.0,1.0,165.0,15.0
max,21000.0,4.0,1.0,240.0,35.0


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [10]:
# Inspect column dtypes and non-null counts of the filtered frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1154 entries, 1 to 1300
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   price         1154 non-null   int64 
 1   room          1154 non-null   int64 
 2   living_room   1154 non-null   int64 
 3   area          1154 non-null   int64 
 4   age           1154 non-null   int64 
 5   floor         1154 non-null   object
 6   district      1154 non-null   object
 7   neighborhood  1154 non-null   object
dtypes: int64(5), object(3)
memory usage: 81.1+ KB


In [11]:
# Cast all columns in one pass: the text columns become pandas categoricals,
# the numeric columns (including the price target) become ints.
df = df.astype({
    'district': 'category',
    'neighborhood': 'category',
    'room': 'int',
    'living_room': 'int',
    'area': 'int',
    'age': 'int',
    'floor': 'category',
    'price': 'int',
})

In [12]:
# Column groups handed to the preprocessing step: these are one-hot encoded ...
categorical_features = [
    'district',
    'neighborhood',
    'floor',
]
# ... and these are standardised.
numerical_features = [
    'room',
    'living_room',
    'area',
    'age',
]

In [13]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' lets predict() accept categories
# that were not present during fit instead of raising.
numeric_step = ('num', StandardScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
full_pipeline = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [14]:
# Target is the price column; the feature matrix is every other column.
y = df['price']
X = df.drop(columns='price')

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# NOTE: hyperparameter search kept for reference only. Every line in this cell
# is commented out, so nothing here runs. The grid below spans
# 4 * 3 * 3 * 4 * 2 = 288 combinations, i.e. 1440 fits with cv=5.

# from sklearn.model_selection import GridSearchCV

# # Define hyperparameter grid
# param_grid = {
#     'model__n_estimators': [100, 200, 400, 600],
#     'model__max_depth': [3, 5, 7],
#     'model__min_samples_split': [2, 5, 10],
#     'model__learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'model__loss': ['squared_error', 'absolute_error']
# }

# # Set up pipeline and GridSearchCV
# model = Pipeline([
#     ('preparation', full_pipeline),
#     ('model', GradientBoostingRegressor(random_state=42))
# ])

# grid_search = GridSearchCV(
#     estimator=model,
#     param_grid=param_grid,
#     scoring='r2',  # Optimize for r^2
#     cv=5,
#     verbose=2,
#     n_jobs=-1
# )

# # Fit GridSearchCV
# grid_search.fit(X_train, y_train)

# # Print best parameters and score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)

# # Use the best model for predictions
# best_model = grid_search.best_estimator_

In [17]:
# Preprocessing and regressor wrapped in one estimator, so the scaler and the
# encoder are fitted only on the data passed to fit() (including inside each
# cross-validation fold). The commented entries are alternative regressors.
model = Pipeline([
    ('preparation', full_pipeline),
    # ('model', LinearRegression())
    # ('model', GradientBoostingRegressor(n_estimators=400, max_depth=5, min_samples_split=2, learning_rate=0.1, loss='squared_error'))
    # ('model', GradientBoostingRegressor(n_estimators=100, max_depth=3, min_samples_split=10, learning_rate=0.2, loss='squared_error'))
    # random_state pins the estimator's internal randomness so that re-running
    # the notebook reproduces the same scores (same seed as the train/test split).
    ('model', GradientBoostingRegressor(n_estimators=600, max_depth=7, min_samples_split=10, learning_rate=0.01, loss='squared_error', random_state=42))
    # ('model', RandomForestRegressor(random_state=1))
])

In [18]:
# Fit the preprocessing steps and the regressor on the training split only.
model.fit(X_train, y_train)

In [19]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)  # back in the units of the target

In [20]:
from sklearn.model_selection import cross_val_score

# The 'neg_mean_absolute_error' scorer reports *negative* MAE (so that higher
# is better); flip the sign to get the plain MAE of each of the 5 folds.
neg_mae_per_fold = cross_val_score(model, X, y, cv=5, scoring='neg_mean_absolute_error')
scores = -neg_mae_per_fold

print("MAE scores:\n", scores)

MAE scores:
 [1739.78012464 1722.60943137 1606.67205981 1694.39033584 1543.83215528]


In [21]:
# Mean of the per-fold MAE values, printed under its heading.
print("Average MAE score (across experiments):", scores.mean(), sep="\n")

Average MAE score (across experiments):
1661.456821388369


In [22]:
# Held-out metrics, one per line.
print(f"MSE: {mse}", f"RMSE: {rmse}", f"R^2: {r2}", sep="\n")

MSE: 4177085.2966866842
RMSE: 2043.7918917264262
R^2: 0.5615753316901472


In [27]:
# NOTE: disabled. `coef_` exists only on linear models (e.g. the commented-out
# LinearRegression option); GradientBoostingRegressor exposes
# `feature_importances_` instead.
# feature_importances = model.named_steps['model'].coef_
# print(len(feature_importances))
# print(feature_importances)

In [28]:
# print("Numerical Features")
# for i in range(len(numerical_features)):
#     print(numerical_features[i], feature_importances[i])

In [875]:
# NOTE: disabled. If re-enabled, the index `len(numerical_features) + j` does not
# account for the encoded columns of earlier categorical features, so every
# feature would read from the same starting offset.
# print("Categorical Features")
# for i in range(len(categorical_features)):
#     for j in range(len(model.named_steps['preparation'].transformers_[1][1].categories_[i])):
#         print(model.named_steps['preparation'].transformers_[1][1].categories_[i][j], feature_importances[len(numerical_features) + j])

In [29]:
# A single hand-written row, used to sanity-check a prediction from the fitted model.
sample_listing = {
    'district': 'Talas',
    'neighborhood': 'Mevlana',
    'room': 3,
    'living_room': 1,
    'area': 150,
    'age': 18,
    'floor': '4. Kat',
}
new_data = pd.DataFrame([sample_listing])

print(model.predict(new_data))

[10801.20110874]


In [30]:
# Summary statistics for the rows matching the district/neighborhood of the sample row.
in_talas = df['district'] == 'Talas'
in_mevlana = df['neighborhood'] == 'Mevlana'
print(df[in_talas & in_mevlana].describe())

              price        room  living_room        area         age
count    252.000000  252.000000        252.0  252.000000  252.000000
mean   13798.202381    2.599206          1.0  137.757937    7.682540
std     2557.685464    0.709974          0.0   36.006210    5.575488
min     8500.000000    1.000000          1.0   55.000000    0.000000
25%    12000.000000    2.000000          1.0  110.000000    4.000000
50%    13500.000000    3.000000          1.0  150.000000    5.000000
75%    15000.000000    3.000000          1.0  165.000000   11.000000
max    21000.000000    4.000000          1.0  220.000000   25.000000


In [31]:
def _tolerant_r2(y_true, y_pred, within_tolerance):
    """Shared core of the tolerance-aware R^2 variants.

    Residuals for which ``within_tolerance(residuals, actual)`` is True are
    treated as exact hits (set to 0) before computing ``1 - SSR / SST``.

    Inputs are converted to flat float arrays, so values are paired by position.
    This accepts lists as well as arrays/Series and avoids pandas index
    alignment silently mismatching predictions and targets. The inputs are
    never modified.

    Raises:
        ValueError: if the inputs are empty or differ in length.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, got {actual.size} and {predicted.size}"
        )

    residuals = predicted - actual
    residuals = np.where(within_tolerance(residuals, actual), 0.0, residuals)
    ssr = np.sum(residuals ** 2)
    sst = np.sum((actual - np.mean(actual)) ** 2)
    if sst == 0:
        # Constant target: SSR / SST is undefined. Return finite values instead:
        # 1.0 when every residual is (tolerated as) zero, otherwise 0.0.
        return 1.0 if ssr == 0 else 0.0
    return 1 - (ssr / sst)


def tolerance_r2(y_true, y_pred, tolerance):
    """R^2 in which absolute errors of at most ``tolerance`` count as zero error.

    Args:
        y_true: observed target values (array-like, 1-D).
        y_pred: predicted values, same length as ``y_true``.
        tolerance: largest absolute error (in target units) that is forgiven.

    Returns:
        ``1 - SSR / SST`` computed on the thresholded residuals.
    """
    return _tolerant_r2(
        y_true,
        y_pred,
        lambda residuals, actual: np.abs(residuals) <= tolerance,
    )


def tolerance_percentage_r2(y_true, y_pred, tolerance):
    """R^2 in which relative errors of at most ``tolerance`` count as zero error.

    Args:
        y_true: observed target values (array-like, 1-D).
        y_pred: predicted values, same length as ``y_true``.
        tolerance: largest forgiven error as a fraction of ``|y_true|``
            (e.g. ``0.10`` for 10%).

    Returns:
        ``1 - SSR / SST`` computed on the thresholded residuals.
    """
    def _within_relative(residuals, actual):
        # Divide by |y_true| so negative targets are not trivially "within
        # tolerance". A zero target yields inf/nan, which compares False and
        # leaves the residual untouched.
        with np.errstate(divide='ignore', invalid='ignore'):
            return (np.abs(residuals) / np.abs(actual)) <= tolerance

    return _tolerant_r2(y_true, y_pred, _within_relative)

In [32]:
# Plain R^2 next to the two tolerance-aware variants (2000 absolute, 10% relative).
plain_r2 = r2_score(y_test, y_pred)
abs_tol_r2 = tolerance_r2(y_test, y_pred, 2000)
rel_tol_r2 = tolerance_percentage_r2(y_test, y_pred, 0.10)
print(plain_r2, abs_tol_r2, rel_tol_r2, sep="\n")

0.5615753316901472
0.6359474827392582
0.5855116423215574
