# Import thư viện

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [3]:
# Create (or reuse) the SparkSession used by this notebook: application name
# "Train", YARN master in client deploy mode, Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Train")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .enableHiveSupport()
    .getOrCreate()
)

# Schema of the cleaned listings: raw sizes, integer room counts,
# location strings and the price. Every column is nullable.
schema_data_clean = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Width", DoubleType()),
        ("Length", DoubleType()),
        ("Bedrooms", IntegerType()),
        ("Bathrooms", IntegerType()),
        ("District", StringType()),
        ("Province", StringType()),
        ("Price", LongType()),
    )
])


# Schema of the normalized listings: same columns, but the room counts are
# doubles and there is an extra double column "Distribute".
schema_normalized = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("Width", DoubleType()),
        ("Length", DoubleType()),
        ("Bedrooms", DoubleType()),
        ("Bathrooms", DoubleType()),
        ("District", StringType()),
        ("Province", StringType()),
        ("Price", LongType()),
        ("Distribute", DoubleType()),
    )
])

# Read the normalized training data (CSV files with a header row) using the
# explicit schema, then preview the first rows.
df_normalized = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema_normalized)
    .load("/user/hadoopuser/data_pre_train/*")
)

df_normalized.show()

# Read the cleaned (un-normalized) data the same way, then preview it.
df_clean = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema_data_clean)
    .load("/output/*")
)

df_clean.show()

+------+------+--------+---------+----------+---------------+-----------+----------+
| Width|Length|Bedrooms|Bathrooms|  District|       Province|      Price|Distribute|
+------+------+--------+---------+----------+---------------+-----------+----------+
| 0.044|  0.68| 0.13333|      0.6|   thu duc| tp ho chi minh| 6600000000|   0.10681|
|  0.13|  0.23| 0.16667|      0.6|    quan 1| tp ho chi minh|65000000000|   0.55834|
|0.1575|  0.19|     0.2|      0.6|    ba ria|ba ria-vung tau| 8000000000|   0.02982|
| 0.122|  0.25|     0.2|      0.3|  tan binh| tp ho chi minh|45000000000|   0.45719|
| 0.122|  0.25|     0.2|      0.3|  tan binh| tp ho chi minh|45000000000|   0.45719|
|  0.18|  0.31| 0.83333|      0.3|    quan 8| tp ho chi minh|35000000000|   0.50202|
|  0.08|   0.2| 0.06667|      0.2| luong son|       hoa binh|  780000000|   0.00559|
|  0.08|   0.2| 0.06667|      0.2| luong son|       hoa binh| 1500000000|   0.00559|
|  0.08|   0.2|     0.2|      0.5|binh thanh| tp ho chi minh|2300

# Chuẩn bị dữ liệu train, test cho mô hình

In [4]:

# Alternative kept for reference: read the normalized data from a local CSV.
# data = pd.read_csv("data_normalized.csv")
# Convert the normalized Spark DataFrame into a pandas DataFrame.
data = df_normalized.toPandas()
# Bare last expression so the notebook renders the frame.
data

Unnamed: 0,Width,Length,Bedrooms,Bathrooms,District,Province,Price,Distribute
0,0.0440,0.680,0.13333,0.6,thu duc,tp ho chi minh,6600000000,0.10681
1,0.1300,0.230,0.16667,0.6,quan 1,tp ho chi minh,65000000000,0.55834
2,0.1575,0.190,0.20000,0.6,ba ria,ba ria-vung tau,8000000000,0.02982
3,0.1220,0.250,0.20000,0.3,tan binh,tp ho chi minh,45000000000,0.45719
4,0.1220,0.250,0.20000,0.3,tan binh,tp ho chi minh,45000000000,0.45719
...,...,...,...,...,...,...,...,...
65415,0.4000,0.103,0.13333,0.3,tay ho,ha noi,42500000000,0.12132
65416,0.4000,0.103,0.13333,0.3,tay ho,ha noi,27500000000,0.12132
65417,0.6700,0.110,0.13333,0.3,quan 4,tp ho chi minh,4600000000,0.99572
65418,0.6700,0.110,0.13333,0.3,quan 4,tp ho chi minh,4700000000,0.99572


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the pandas frame.
data.describe()

Unnamed: 0,Width,Length,Bedrooms,Bathrooms,Price,Distribute
count,65420.0,65420.0,65420.0,65420.0,65420.0,65420.0
mean,0.053636,0.129694,0.130798,0.339071,7075452000.0,0.410997
std,0.048693,0.055669,0.080261,0.132677,8979832000.0,0.298438
min,0.008,0.01,0.03333,0.1,100000000.0,0.00036
25%,0.04,0.1,0.1,0.3,2600000000.0,0.10849
50%,0.04,0.11,0.13333,0.3,4400000000.0,0.41914
75%,0.05,0.15,0.13333,0.4,7700000000.0,0.64387
max,1.0,1.0,1.0,1.0,100000000000.0,1.0


In [6]:
# Model inputs: the four scaled size/room columns plus the 'Distribute' column.
X = data[['Width', 'Length', 'Bedrooms', 'Bathrooms', 'Distribute']]
# Target kept as a one-column DataFrame (2-D) rather than a Series.
y = data[['Price']]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Mô hình Linear Regression

In [7]:
# Fit a linear regression on the training split; fit() returns the estimator,
# so construction and fitting can be chained.
regr = LinearRegression().fit(X_train, y_train)

# Predict prices for the held-out rows and report both metrics.
y_pred = regr.predict(X_test)

print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('R2_score: %.4f' % r2_score(y_test, y_pred))

Mean squared error: 61206711558527033344.00
R2_score: 0.2176


# Mô hình kNN Regression

In [8]:
n_neighbors = 20
# Sweep k and both weighting schemes. range() excludes its end value, so k
# runs from 1 to n_neighbors - 1. For each k the 'uniform' result is printed
# first, then the 'distance' result.
for k in range(1, n_neighbors):
    print(k)
    for weighting in ('uniform', 'distance'):
        # p=1 selects the Manhattan (L1) distance.
        model = KNeighborsRegressor(n_neighbors=k, weights=weighting, p=1)
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        print('Mean squared error: %.2f' % mean_squared_error(y_test, predicted))
        print('R2_score: %.4f' % r2_score(y_test, predicted))
    print('\n')

1
Mean squared error: 80772930232569544704.00
R2_score: -0.0325
Mean squared error: 80772930232569544704.00
R2_score: -0.0325


2
Mean squared error: 61586897954598748160.00
R2_score: 0.2127
Mean squared error: 62981330742504570880.00
R2_score: 0.1949


3
Mean squared error: 55407387440036864000.00
R2_score: 0.2917
Mean squared error: 57379043397140774912.00
R2_score: 0.2665


4
Mean squared error: 52347006000121716736.00
R2_score: 0.3308
Mean squared error: 54003363287850106880.00
R2_score: 0.3097


5
Mean squared error: 51368004003524861952.00
R2_score: 0.3434
Mean squared error: 52722319879475945472.00
R2_score: 0.3260


6
Mean squared error: 50439360882314870784.00
R2_score: 0.3552
Mean squared error: 51785089977281839104.00
R2_score: 0.3380


7
Mean squared error: 50140378340753375232.00
R2_score: 0.3590
Mean squared error: 51192778930534039552.00
R2_score: 0.3456


8
Mean squared error: 49498486478086234112.00
R2_score: 0.3673
Mean squared error: 50460041794370330624.00
R2_score:

# Sử dụng mô hình tốt nhất để dự đoán một bản ghi

In [9]:
# Alternative kept for reference: read the cleaned data from a local CSV.
# data_raw = pd.read_csv("data_cleann.csv")
data_raw = df_clean.toPandas()
# Columns whose raw values get divided by their maximum in predict_one.
int_column = ['Width', 'Length', 'Bedrooms', 'Bathrooms']
# Per-column maximum of the cleaned data, truncated to int, in int_column order.
max_value = [int(data_raw[name].max()) for name in int_column]
print(max_value)


[100, 100, 30, 10]


In [10]:
def predict_one(item, n_neighbors=11, weights='uniform'):
    """Predict the price of a single listing with a kNN regressor.

    The four raw numeric fields are divided by the matching entries of the
    module-level ``max_value`` list. The ``Distribute`` feature is taken from
    the first row of the module-level ``data`` frame whose district and
    province match. A ``KNeighborsRegressor`` (``p=1``) is then fitted on the
    module-level ``X_train`` / ``y_train`` and asked for one prediction.

    Parameters
    ----------
    item : sequence
        ``[Width, Length, Bedrooms, Bathrooms, District, Province]``. The
        first four entries are raw numbers; the last two must equal values of
        the ``District`` and ``Province`` columns of ``data``.
    n_neighbors : int, default 11
        Number of neighbours used by the regressor.
    weights : str, default 'uniform'
        Weighting scheme passed to the regressor.

    Returns
    -------
    The array returned by ``KNeighborsRegressor.predict`` for the single
    query row. It is also printed, as before.

    Raises
    ------
    IndexError
        If ``item`` has fewer than six entries, or if no row of ``data``
        matches the given district and province.
    """
    # Scale the four numeric inputs by the per-column maxima.
    features = [item[i] / max_value[i] for i in range(4)]

    same_place = (data['District'] == item[4]) & (data['Province'] == item[5])
    distribute = data.loc[same_place, 'Distribute']
    if distribute.empty:
        # Same exception type as the original failing lookup, but with a
        # message that says what was not found.
        raise IndexError(
            "no 'Distribute' value for district %r in province %r"
            % (item[4], item[5])
        )
    features.append(distribute.iloc[0])

    # Query with the training column names so scikit-learn does not warn about
    # missing feature names (X_train is a DataFrame in this notebook).
    query = pd.DataFrame([features], columns=X_train.columns)

    knn = KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, p=1)
    prediction = knn.fit(X_train, y_train).predict(query)
    print(prediction)
    return prediction


In [11]:
# Example listing in the order predict_one reads it:
# [Width, Length, Bedrooms, Bathrooms, District, Province].
x = [30, 20, 5, 3, 'thu duc', 'tp ho chi minh']
predict_one(x)

[[9.80272727e+09]]


In [12]:
# Stop the SparkSession now that the notebook no longer needs it.
spark.stop()