In [1]:
import pandas as pd
import bs4
from bs4 import BeautifulSoup  
import csv
import numpy as np
import matplotlib.pyplot as plt      
import numpy as np
import sklearn
from sklearn import preprocessing, linear_model, model_selection
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import linear_model, metrics, preprocessing
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import r2_score, f1_score 
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error , mean_absolute_error
import numpy as np
import seaborn as sns

In [2]:
### RandomForestRegressor() 

def load_dataset(df, label_column):
    y = df[label_column]
    X = df.drop(label_column, axis=1)
    return X, y


def data_split(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

def normalize_data(X_train, X_test):
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

def train_model(X_train_scaled, y_train):
    forest = RandomForestRegressor(max_depth=5, min_samples_split=2 ,n_estimators=100)
    forest.fit(X_train_scaled, y_train)
    return forest

def predict_model(X_test_scaled,y_test , forest):
    y_pred = forest.predict(X_test_scaled)
    result = pd.DataFrame({'Actual': y_test, 'Predicted':  np.round(y_pred)})
    result['difference'] =  result["Actual"] -  result["Predicted"] 
    print(f"Forest Score {forest.score(X_test_scaled,y_test)}")

    return y_pred, result

 
def evaluate_model(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    return mae, r2

def tune_model(X_train_scaled, y_train):
    params = {'max_depth': [5, 10, 15], 'n_estimators': [50, 100, 200], 'min_samples_split': [2, 4, 6] }
    grid_search_cv = GridSearchCV(RandomForestRegressor(), params, n_jobs=-1, verbose=1, cv=7)
    grid_search_cv.fit(X_train_scaled, y_train)
    best_params = grid_search_cv.best_params_
    best_score = grid_search_cv.best_score_
    print(grid_search_cv.best_estimator_)
    return best_params, best_score

def return_best_model(X_train_scaled, X_test_scaled, y_train, y_test, best_params):
    reg = RandomForestRegressor(**best_params)
    reg.fit(X_train_scaled, y_train)
    y_pred = reg.predict(X_test_scaled)
    return reg, y_pred


def Remove_outliers(df):
    for col in df.select_dtypes(include=['float64','int']).columns:
        q1, q3 = np.percentile(df[col], [25, 75])
        iqr = q3 - q1
        lower_bound = q1 -(1.5 * iqr) 
        upper_bound = q3 +(1.5 * iqr)
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
    return df

def drop_non_numeric_and_na_cols(df):
    # Get the list of non-numeric columns
    df = df.dropna(subset=['Long','Lat','Rooms','Floor','Floors']).reset_index(drop=True)
    non_numeric_cols = list(df.select_dtypes(exclude=['number']).columns)
    
    # Get the list of columns that contain NaNs
    na_cols = list(df.columns[df.isna().any()])
    
    # Combine the two lists and drop the columns from the DataFrame
    cols_to_drop = list(set(non_numeric_cols) | set(na_cols))
    df = df.drop(cols_to_drop, axis=1)
    
    return df

In [None]:
# df = pd.read_csv("../Data/Real_Estate_TLV_Numric_Data.csv",index_col=0)
# df.drop(['AVG_SALARY'],axis =1 , inplace =True)
df = pd.read_csv("../Data/Nadlan_clean.csv",index_col=0)
df['Date'] = pd.to_datetime(df['Date'], format='%d.%m.%Y')
df['Year'] = df['Date'].dt.strftime('%Y')
df['Year'] = df['Year'].astype(int)

# df.drop(['NEIGHBORHOOD'],axis =1 , inplace =True)

df = df[df['Year'] < 2023]
df = df[df['Year'] > 2003]

df['AVG_ROOM_SIZE'] = (df["Size"] / df['Rooms']).round(1)

df = drop_non_numeric_and_na_cols(df)


df = Remove_outliers(df)

print(df.shape)
label_column = "Price"

X , y = load_dataset(df,label_column)
X_train, X_test, y_train, y_test = data_split(X, y)


#2. Normalize Data
X_train_scaled, X_test_scaled = normalize_data(X_train, X_test)

#3. Train Model
forest = train_model(X_train_scaled, y_train)

#4. Predict Model
y_pred , result = predict_model(X_test_scaled,y_test, forest)

#5. Evaluate Model
mae, r2 = evaluate_model(y_test, y_pred)

#6. Tune Model
best_params, best_score = tune_model(X_train_scaled, y_train)

#7. Return Best Model
forest, y_pred = return_best_model(X_train_scaled, X_test_scaled, y_train, y_test, best_params)

print("Best Parameters:", best_params)
print("Best Score:", best_score)
print("MAE:", mae) # mae = mean_absolute_error
print("R2:", r2)
X_train.hist()
plt.figure(figsize=(15,8))
sns.heatmap(X_train.corr(), annot =True ,cmap ='YlGnBu')
result

(50593, 15)
Forest Score 0.18128212451158
Fitting 7 folds for each of 27 candidates, totalling 189 fits


In [None]:
yad2_df = pd.read_csv("../Data/Real_Estate_TLV_YAD2_Numeric.csv",index_col=0)
yad2_df.drop(['neighborhood'],axis =1 , inplace =True)
yad2_df['year'] = 2022

In [None]:

def rename_yad2_df(df):
    df.rename(columns={'price':'DEALAMOUNT'}, inplace=True)
#     df.rename(columns={'neighborhood':'NEIGHBORHOOD'}, inplace=True)
    df.rename(columns={'buildingMR':'ASSETMETER'}, inplace=True)
    df.rename(columns={'TotalFloors':'BUILDINGFLOORS'}, inplace=True)
    df.rename(columns={'floor':'FLOOR'}, inplace=True)
    df.rename(columns={'buildyear':'BUILDINGYEAR'}, inplace=True)
    df.rename(columns={'year':'DATE'}, inplace=True)
    df.rename(columns={'rooms':'ROOMNUM'}, inplace=True)
#     df = df.reindex(columns=["DATE", "DEALAMOUNT", "BUILDINGYEAR", "BUILDINGYEAR", "BUILDINGFLOORS",
#                               "NEIGHBORHOOD", "ROOMNUM", "FLOOR", "ASSETMETER", "long", "lat", "Distance_From_Sea"])
    return df

def preprocess_dataframe(df):
    df = df.dropna()
    df = df.drop_duplicates()
    df.drop(['parking', 'balconies','shelter','on_pillars','storeroom','asset_classification','elevator','home_number'], axis=1 , inplace=True )
#     df['AVG_ROOM_SIZE'] = (df["buildingMR"] / df['rooms']).round(1)
    df = rename_yad2_df(df)
    
    df['BUILDINGFLOORS'] = df['BUILDINGFLOORS'].astype(int)

    
    
    # Remove columns with only one unique value
#     unique_counts = df.nunique()
#     cols_to_drop = unique_counts[unique_counts == 1].index
#     df = df.drop(cols_to_drop, axis=1)

    # Remove outliers
    for col in df.select_dtypes(include=['float64','int']).columns:
        q1, q3 = np.percentile(df[col], [25, 75])
        iqr = q3 - q1
        lower_bound = q1 -(1.5 * iqr) 
        upper_bound = q3 +(1.5 * iqr)
        df = df[(df[col] > lower_bound) & (df[col] < upper_bound)]
        
    return df


def recommend_affordable_apartments(df, model):
    # Normalize the data
    X, y = load_dataset(df, "DEALAMOUNT")
    
    # We dont need the the second params so we will call it '_'
    X_scaled, _ = normalize_data(X, X)
    
    
    # Predict prices using the trained model
    y_pred = model.predict(X_scaled)
    y_pred = y_pred * 1.28 # 0.22 % is the percentage of increase in the last year (2022)
    df["PREDICTED_PRICE"] = y_pred
    df["PREDICTED_PRICE"] = df["PREDICTED_PRICE"].astype(int)
    df['difference'] =  df["DEALAMOUNT"] -  df["PREDICTED_PRICE"] 
    
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    print(f'r2_score: {r2} , mae: {mae}')

    return df.sort_values(by="difference")

yad2_df = preprocess_dataframe(yad2_df)
yad2_df = yad2_df.reindex(columns=["DATE", "DEALAMOUNT", "BUILDINGYEAR", "BUILDINGFLOORS",
                               "ROOMNUM", "FLOOR", "ASSETMETER", "long", "lat", "Distance_From_Sea"]) #"NEIGHBORHOOD",

yad2_df = yad2_df[np.isfinite(yad2_df["ASSETMETER"] / yad2_df['ROOMNUM'])]
yad2_df['AVG_ROOM_SIZE'] = (yad2_df["ASSETMETER"] / yad2_df['ROOMNUM']).round(1)
yad2_df['AVG_ROOM_SIZE'] = yad2_df['AVG_ROOM_SIZE'].astype(int)

affordable_deals = recommend_affordable_apartments(yad2_df, forest)
affordable_deals

In [None]:
# affordable_deals = recommend_affordable_apartments(yad2_df, forest)
# affordable_deals

In [None]:
affordable_deals.sort_values(by="difference",  ascending=True )

affordable_deals = affordable_deals.reindex(columns=["DATE", "DEALAMOUNT",'PREDICTED_PRICE','difference', "BUILDINGYEAR", "BUILDINGFLOORS",
                               "ROOMNUM", "FLOOR", "ASSETMETER", "long", "lat", "Distance_From_Sea"])
#NEIGHBORHOOD
affordable_deals[0:130]
affordable_deals.shape

### 

In [None]:
yad2_origin = pd.read_csv("Data/Real_Estate_TLV_YAD2.csv",index_col=0)

def covert_data(df):
    df = df.dropna()
    df.loc[:, 'price'] = df['price'].str.replace('[^0-9]','',regex=True).astype(int)
    df.loc[:, 'buildingMR'] = df['buildingMR'].astype(int)
    df.loc[:, 'floor'] = df['floor'].astype(int)

    
    df.rename(columns={'price':'DEALAMOUNT'}, inplace=True)
    df.rename(columns={'buildingMR':'ASSETMETER'}, inplace=True)
    df.rename(columns={'floor':'FLOOR'}, inplace=True)
    return df

yad2_origin = covert_data(yad2_origin)
yad2_origin.shape


In [None]:
def merge_dataframes(df1, df2):
    new_df = df1.merge(df2, on=['ASSETMETER', 'DEALAMOUNT','FLOOR'], how='right')
    new_df = new_df.drop(['TotalFloors','home_number_2','ROOMNUM'], axis=1)
    new_df = new_df.dropna()
    return new_df

yad2_all = merge_dataframes(yad2_origin,affordable_deals)
yad2_all.shape

In [None]:
yad2_all = yad2_all.reindex(columns=["DATE", "DEALAMOUNT", 'PREDICTED_PRICE', 'difference','long_y','lat_y','lat_x', 'long_x'
                                    , 'street', 'neighborhood'
                                    , 'BUILDINGYEAR', 'BUILDINGFLOORS','home_number', 'item_id',
                                     'ASSETMETER', 'TotalFloors', 'asset_classification', 'rooms', 'FLOOR',
                                       'shelter', 'on_pillars', 'elevator', 'storeroom','parking', "Distance_From_Sea"])

yad2_all.sort_values(by="difference",  ascending=True )