In [385]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib import Path

BASE_DIR = Path.cwd().parent

csv_path = os.path.join(BASE_DIR, "csv", "houses_to_rent.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,4,1,19,1,1,0,-,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [386]:
df = df.drop(df.columns[0], axis=1)

In [387]:
df["floor"] = df["floor"].replace("-", 0)
df["animal"] = df["animal"].replace("not acept", 0)
df["animal"] = df["animal"].replace("acept", 1)
df["furniture"] = df["furniture"].replace("furnished", 1)
df["furniture"] = df["furniture"].replace("not furnished", 0)

In [388]:
columns = [
    "hoa",
    "rent amount",
    "property tax",
    "fire insurance",
    "total",
]

for column in columns:
    df[column] = df[column].str.replace("R$", "", regex=False).str.strip()
    df[column] = df[column].str.replace(",", ".", regex=False).str.strip()

In [389]:
df["hoa"] = df["hoa"].replace(to_replace="Sem info", value="0")
df["hoa"] = df["hoa"].replace(to_replace="Incluso", value="0")

df["property tax"] = df["property tax"].replace(to_replace="Incluso", value="0")

In [390]:
df.isin(['Incluso']).any()

city              False
area              False
rooms             False
bathroom          False
parking spaces    False
floor             False
animal            False
furniture         False
hoa               False
rent amount       False
property tax      False
fire insurance    False
total             False
dtype: bool

In [391]:
df = df.astype(dtype=np.float64)

In [392]:
y = df["city"]
X = df.drop("city", axis=1)

In [393]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)


In [394]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.009351,0.222222,0.222222,0.333333,0.000000,1.0,1.0,0.000000,0.007014,0.001002,0.175074,0.008136
1,0.002195,0.111111,0.000000,0.083333,0.101010,1.0,0.0,0.541082,0.820641,0.122244,0.011869,0.000493
2,0.017604,0.444444,0.444444,0.333333,0.030303,1.0,1.0,0.004180,0.006012,0.001420,0.127596,0.011702
3,0.002561,0.111111,0.111111,0.083333,0.121212,1.0,0.0,0.701403,0.000251,0.150301,0.019288,0.001117
4,0.000366,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000200,0.041082,0.019288,0.000257
...,...,...,...,...,...,...,...,...,...,...,...,...
6075,0.001626,0.111111,0.000000,0.083333,0.020202,1.0,0.0,0.420842,0.000150,0.000000,0.017804,0.000585
6076,0.003009,0.111111,0.111111,0.083333,0.161616,0.0,1.0,0.769539,0.001904,0.063126,0.050445,0.002773
6077,0.001545,0.000000,0.000000,0.000000,0.131313,1.0,0.0,0.250501,0.950902,0.042084,0.014837,0.000255
6078,0.006099,0.222222,0.111111,0.166667,0.000000,0.0,0.0,0.000000,0.002505,0.250501,0.074184,0.002808


In [395]:
print(type(X))
print(type(y))

X =pd.DataFrame(X)

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [396]:
df =pd.concat([X, y], axis=1)

In [397]:
y = df["city"]
X = df.drop("city", axis=1)

In [398]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)



In [399]:
from pycaret.classification import *


setup(data=X_train, target=y_train, session_id=42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,city
2,Target type,Binary
3,Original data shape,"(4864, 13)"
4,Transformed data shape,"(4864, 13)"
5,Transformed train set shape,"(3404, 13)"
6,Transformed test set shape,"(1460, 13)"
7,Numeric features,12
8,Preprocess,True
9,Imputation type,simple


<pycaret.classification.oop.ClassificationExperiment at 0x322bf3760>

In [400]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9098,0.8996,0.9833,0.9181,0.9496,0.5266,0.5582,0.271
xgboost,Extreme Gradient Boosting,0.9095,0.8906,0.9724,0.9265,0.9489,0.5571,0.5712,0.012
rf,Random Forest Classifier,0.906,0.8913,0.984,0.9138,0.9476,0.4971,0.5334,0.055
lightgbm,Light Gradient Boosting Machine,0.9054,0.8938,0.9721,0.9226,0.9467,0.531,0.5475,0.492
et,Extra Trees Classifier,0.9051,0.8907,0.983,0.9137,0.9471,0.4945,0.5283,0.031
gbc,Gradient Boosting Classifier,0.8995,0.8716,0.9823,0.9088,0.9441,0.4564,0.4965,0.053
ada,Ada Boost Classifier,0.8969,0.8442,0.9827,0.906,0.9427,0.434,0.4771,0.018
svm,SVM - Linear Kernel,0.889,0.7206,0.9949,0.8896,0.9393,0.3084,0.3988,0.006
ridge,Ridge Classifier,0.8878,0.7899,0.9935,0.8895,0.9386,0.305,0.391,0.005
knn,K Neighbors Classifier,0.8863,0.7918,0.9704,0.9049,0.9365,0.4002,0.4245,0.01


<catboost.core.CatBoostClassifier at 0x31f7a5430>