In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score,recall_score, precision_score

In [None]:
data = pd.read_csv('/content/drive/MyDrive/file_find/data_extracted.csv')
data.head(10)

Unnamed: 0,n_rooms,n_toilets,area,x,y,distance_UBND,district,polistic,furniture,house_direct,balcony_direct,n_hospitals,n_schools,price,price_class
0,2,2,75.0,21.008852,105.743919,6.97,Nam Tu Liem,so do,day du,Dong - Bac,Dong - Nam,0,0,43.333333,5
1,3,2,100.0,21.084397,105.79236,4.33,Bac Tu Liem,so do,cao cap,Tay,Bac,1,5,39.0,4
2,3,3,102.0,21.002666,105.81543,4.17,Thanh Xuan,so do,cao cap,Tay - Bac,Dong - Nam,4,17,39.215686,4
3,3,3,103.0,21.002666,105.81543,4.17,Thanh Xuan,so do,cao cap,Tay - Bac,Dong - Nam,4,17,39.5,4
4,2,2,74.0,21.044233,105.767319,3.74,Bac Tu Liem,so do,day du,Bac,Dong - Nam,0,6,38.513514,4
5,2,2,86.0,21.016827,105.815224,2.81,Dong Da,hdmb,day du,Dong - Bac,Tay - Nam,4,25,77.906977,8
6,3,2,115.0,21.032419,105.83136,2.86,Ba Dinh,hdmb,day du,Tay - Nam,Nam,2,9,116.0,11
7,3,2,105.5,20.991955,105.785103,5.46,Nam Tu Liem,so do,day du,Tay - Bac,Dong - Bac,1,2,42.0,5
8,2,2,59.0,21.008852,105.743919,6.97,Nam Tu Liem,hdmb,nguyen_ban,Dong - Bac,Tay - Nam,0,0,38.135593,4
9,3,2,69.0,20.984837,105.859856,8.01,Hoang Mai,so do,co ban,Dong,Dong,1,5,37.681159,4


# Create feature price_class

In [None]:
# create the bins for the 13 classes
bins = [0,10,15,20,30,40,50,60,70,80,90,100,120,400]

# use the 'cut' function to divide the 'price' column into 14 classes
data['price_class'] = pd.cut(data['price'], bins, labels=False)
# you can now access the 'price_class' column to see which class each price belongs to
data.to_csv('/content/drive/MyDrive/file_find/data_extracted.csv', index=False)
data.head()

Unnamed: 0,n_rooms,n_toilets,area,x,y,distance_UBND,district,polistic,furniture,house_direct,balcony_direct,n_hospitals,n_schools,price,price_class
0,2,2,75.0,21.008852,105.743919,6.97,Nam Tu Liem,so do,day du,Dong - Bac,Dong - Nam,0,0,43.333333,5
1,3,2,100.0,21.084397,105.79236,4.33,Bac Tu Liem,so do,cao cap,Tay,Bac,1,5,39.0,4
2,3,3,102.0,21.002666,105.81543,4.17,Thanh Xuan,so do,cao cap,Tay - Bac,Dong - Nam,4,17,39.215686,4
3,3,3,103.0,21.002666,105.81543,4.17,Thanh Xuan,so do,cao cap,Tay - Bac,Dong - Nam,4,17,39.5,4
4,2,2,74.0,21.044233,105.767319,3.74,Bac Tu Liem,so do,day du,Bac,Dong - Nam,0,6,38.513514,4


# Data oversampling

In [None]:
import pandas as pd
from sklearn.utils import resample


# Get the unique class labels
class_labels = data.price_class.unique()

# Create a list to store the oversampled dataframes
data_oversampled = []

# Loop over each class label
for label in class_labels:
    # Separate majority and minority classes
    data_majority = data[data.price_class!=label]
    data_minority = data[data.price_class==label]
 
    # Upsample minority class
    data_minority_upsampled = resample(data_minority, 
                                     replace=True,     # sample with replacement
                                     n_samples=len(data_majority),    # to match majority class
                                     random_state=123) # reproducible results
 
    # Combine majority class with upsampled minority class
    data_upsampled = pd.concat([data_majority, data_minority_upsampled])
    
    # Add the oversampled dataframe to the list
    data_oversampled.append(data_upsampled)

# Concatenate all the oversampled dataframes
data_oversampled = pd.concat(data_oversampled)

# Display new class counts
print(data_oversampled.price_class.value_counts())

4     44534
5     37684
3     36484
6     21424
7     15564
2     14314
8     13024
9     12654
1     12174
11    11594
10    11514
12    11124
Name: price_class, dtype: int64


# XGBoost


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# create the label encoder
le = LabelEncoder()

# fit and transform the non-numeric column
data_oversampled['polistic'] = le.fit_transform(data_oversampled['polistic'])
data_oversampled['furniture'] = le.fit_transform(data_oversampled['furniture'])
data_oversampled['district'] = le.fit_transform(data_oversampled['district'])
data_oversampled['house_direct'] = le.fit_transform(data_oversampled['house_direct'])
data_oversampled['balcony_direct'] = le.fit_transform(data_oversampled['balcony_direct'])

# create the feature and target arrays
X = data_oversampled[['area','n_rooms','n_toilets','x','y','polistic','furniture','district','distance_UBND','house_direct','balcony_direct','n_schools']] 
y = data_oversampled['price_class']

# Scale the feature data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# create the xgboost model
model_xgb = xgb.XGBClassifier(max_depth=8,n_estimators=1000)

# fit the model to the training data
model_xgb.fit(X_train, y_train)

XGBClassifier(max_depth=8, n_estimators=1000, objective='multi:softprob')

In [None]:
# evaluate the model 
train_accuracy = model_xgb.score(X_train, y_train)
test_accuracy = model_xgb.score(X_test, y_test)

# make predictions
y_pred = model_xgb.predict(X_test)


# calculate the F1 score,recall,precision and AUC
f1 = f1_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')


print('Train Accuracy:', train_accuracy)
print('Test Accuracy:', test_accuracy)
print("F1 Score: ", f1)
print("Recall: ", recall)
print("Precision: ", precision)


Train Accuracy: 0.9753911292404606
Test Accuracy: 0.974782105828411
F1 Score:  0.9747633713197097
Recall:  0.974782105828411
Precision:  0.9748778431321198
