**Random Forest Classifier**

In [220]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix

In [221]:
df=pd.read_csv('/content/data_cleand.csv')

In [222]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   city          3586 non-null   object
 1   district      3586 non-null   object
 2   neighborhood  3586 non-null   object
 3   room          3586 non-null   int64 
 4   living-room   3586 non-null   int64 
 5   area          3586 non-null   int64 
 6   age           3586 non-null   int64 
 7   floor         3586 non-null   int64 
 8   price         3586 non-null   int64 
dtypes: int64(6), object(3)
memory usage: 252.3+ KB
None


In [223]:
df['city'] = df['city'].astype('category')
df['district'] = df['district'].astype('category')
df['neighborhood'] = df['neighborhood'].astype('category')
df['room'] = df['room'].astype('int')
df['living-room'] = df['living-room'].astype('int')
df['area'] = df['area'].astype('int')
df['age'] = df['age'].astype('int')
df['floor'] = df['floor'].astype('int')
df['price'] = df['price'].astype('int')

In [224]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3586 entries, 0 to 3585
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   city          3586 non-null   category
 1   district      3586 non-null   category
 2   neighborhood  3586 non-null   category
 3   room          3586 non-null   int64   
 4   living-room   3586 non-null   int64   
 5   area          3586 non-null   int64   
 6   age           3586 non-null   int64   
 7   floor         3586 non-null   int64   
 8   price         3586 non-null   int64   
dtypes: category(3), int64(6)
memory usage: 205.2 KB
None


In [226]:
categorical_features = ['city', 'district', 'neighborhood']
numerical_features = ['room', 'living-room', 'area', 'age', 'floor']

In [227]:
full_pipeline = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

In [228]:
X = df.drop('price', axis=1)
y = df['price']

In [230]:
bins = [x for x in range(0, 70000 ,10000)]
labels = [x for x in range(1, 7)]
print(bins)
print(labels)

[0, 10000, 20000, 30000, 40000, 50000, 60000]
[1, 2, 3, 4, 5, 6]


In [231]:
y=pd.cut(y, bins=bins, labels=labels)

In [232]:
print(y.unique())

[2, 3, 1, 4, 6, 5]
Categories (6, int64): [1 < 2 < 3 < 4 < 5 < 6]


In [233]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [234]:
model = Pipeline([
    ('preparation', full_pipeline),
    ('model', RandomForestClassifier(n_estimators=100))
])

In [235]:
model.fit(X_train, y_train)

In [236]:
y_pred = model.predict(X_test)

In [239]:
print(confusion_matrix(y_test, y_pred))
'''
2. ve 3. katogorideki dogruluklar daha iyi
uç degerlerde daha duşuk dogruluk oranı var
'''

[[  1  15   2   1   0   0]
 [  6 260  64   1   1   0]
 [  1  82 147  11   2   0]
 [  0  15  55  15   5   0]
 [  0   2  14   6   3   1]
 [  0   0   3   4   1   0]]


'\n2. ve 3. katogorideki dogruluklar daha iyi\nuç degerlerde daha duşuk dogruluk oranı var\n'

In [240]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.12      0.05      0.07        19
           2       0.70      0.78      0.74       332
           3       0.52      0.60      0.56       243
           4       0.39      0.17      0.23        90
           5       0.25      0.12      0.16        26
           6       0.00      0.00      0.00         8

    accuracy                           0.59       718
   macro avg       0.33      0.29      0.29       718
weighted avg       0.56      0.59      0.57       718

