In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import numpy as np
import json

# Create (or reuse) the SparkSession: YARN master, client deploy mode,
# Hive support enabled.
spark = (
    SparkSession.builder
    .appName("Normalize Data")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .enableHiveSupport()
    .getOrCreate()
)

# Load the lookup files as text and parse the 'value' column of the first
# row as JSON. Later cells index the result as
# data_json[province][district]['distribute'].
df = spark.read.text('/user/hadoopuser/distribute_people/*')
first_value = df.take(1)[0]['value']
data_json = json.loads(first_value)


In [2]:
# Explicit schema for the CSV input; every column is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Width", DoubleType()),
        ("Length", DoubleType()),
        ("Bedrooms", IntegerType()),
        ("Bathrooms", IntegerType()),
        ("District", StringType()),
        ("Province", StringType()),
        ("Price", LongType()),
    ]
])

# Read the CSV data from HDFS, treating the first line of each file as a
# header and applying the schema above instead of inferring one.
df1 = (
    spark.read.format("csv")
    .option("header", True)
    .schema(schema)
    .load("/output/*")
)


In [3]:
# Same seven columns as the CSV schema plus a string 'Distribute' column.
schema2 = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("Width", DoubleType()),
        ("Length", DoubleType()),
        ("Bedrooms", IntegerType()),
        ("Bathrooms", IntegerType()),
        ("District", StringType()),
        ("Province", StringType()),
        ("Price", LongType()),
        ("Distribute", StringType()),
    ]
])


def _append_distribute(row):
    """Return the seven input fields of `row` followed by its 'distribute' value.

    The value is looked up in `data_json` by province (field 5) and then
    district (field 4); a missing key raises KeyError.
    """
    province, district = row[5], row[4]
    looked_up = data_json[province][district]['distribute']
    return tuple(row[i] for i in range(7)) + (looked_up,)


rdd1 = df1.rdd.map(_append_distribute)
print(rdd1.count())
df4 = spark.createDataFrame(rdd1, schema2)
df4.show()

# df1.select("Province").filter()

65420
+-----+------+--------+---------+----------+---------------+-----------+----------+
|Width|Length|Bedrooms|Bathrooms|  District|       Province|      Price|Distribute|
+-----+------+--------+---------+----------+---------------+-----------+----------+
|  4.4|  68.0|       4|        6|   thu duc| tp ho chi minh| 6600000000|     4.792|
| 13.0|  23.0|       5|        6|    quan 1| tp ho chi minh|65000000000|    25.049|
|15.75|  19.0|       6|        6|    ba ria|ba ria-vung tau| 8000000000|     1.338|
| 12.2|  25.0|       6|        3|  tan binh| tp ho chi minh|45000000000|    20.511|
| 12.2|  25.0|       6|        3|  tan binh| tp ho chi minh|45000000000|    20.511|
| 18.0|  31.0|      25|        3|    quan 8| tp ho chi minh|35000000000|    22.522|
|  8.0|  20.0|       2|        2| luong son|       hoa binh|  780000000|       251|
|  8.0|  20.0|       2|        2| luong son|       hoa binh| 1500000000|       251|
|  8.0|  20.0|       6|        5|binh thanh| tp ho chi minh|2300000000

In [4]:
def convert_to_int(str):
    """Convert a dot-grouped numeric string such as "4.792" to an int (4792).

    Every "." in the input is removed before conversion.

    Args:
        str: The text to convert, or None. The parameter name shadows the
            builtin `str`; it is kept so existing keyword callers still work.

    Returns:
        The integer value, or None when the input is None or contains no
        characters other than dots and whitespace. Returning None lets a
        Spark UDF emit NULL instead of failing the whole job on a missing
        value.

    Raises:
        ValueError: If the remaining text is not a valid integer literal.
    """
    if str is None:
        return None
    digits = str.replace(".", "").strip()
    if not digits:
        return None
    return int(digits)

# Wrap the Python converter as a Spark UDF whose declared return type is integer.
convert_to_int_udf = udf(convert_to_int, IntegerType())

# Replace the string 'Distribute' column with its integer conversion, then
# print the resulting schema and a sample of rows.
distribute_as_int = convert_to_int_udf(col('Distribute'))
df4 = df4.withColumn('Distribute', distribute_as_int)
df4.printSchema()
df4.show()

root
 |-- Width: double (nullable = true)
 |-- Length: double (nullable = true)
 |-- Bedrooms: integer (nullable = true)
 |-- Bathrooms: integer (nullable = true)
 |-- District: string (nullable = true)
 |-- Province: string (nullable = true)
 |-- Price: long (nullable = true)
 |-- Distribute: integer (nullable = true)

+-----+------+--------+---------+----------+---------------+-----------+----------+
|Width|Length|Bedrooms|Bathrooms|  District|       Province|      Price|Distribute|
+-----+------+--------+---------+----------+---------------+-----------+----------+
|  4.4|  68.0|       4|        6|   thu duc| tp ho chi minh| 6600000000|      4792|
| 13.0|  23.0|       5|        6|    quan 1| tp ho chi minh|65000000000|     25049|
|15.75|  19.0|       6|        6|    ba ria|ba ria-vung tau| 8000000000|      1338|
| 12.2|  25.0|       6|        3|  tan binh| tp ho chi minh|45000000000|     20511|
| 12.2|  25.0|       6|        3|  tan binh| tp ho chi minh|45000000000|     20511|
| 18.0

In [5]:
# Compute all five column maxima in a single aggregation. Separate
# select().agg().collect() calls would each launch their own Spark job and
# re-evaluate df4's whole lineage.
max_row = df4.agg({
    "Width": "max",
    "Length": "max",
    "Bedrooms": "max",
    "Bathrooms": "max",
    "Distribute": "max",
}).collect()[0]

# Dict-style agg names its result columns 'max(<column>)'.
w_max = max_row['max(Width)']
l_max = max_row['max(Length)']
be_max = max_row['max(Bedrooms)']
ba_max = max_row['max(Bathrooms)']
d_max = max_row['max(Distribute)']

print(w_max)
print(l_max)
print(be_max)
print(ba_max)
print(d_max)

100.0
100.0
30
10
44863


In [6]:
import numpy as np
def div(val, max_val):
    """Divide `val` by `max_val` and round the quotient to 5 decimal places.

    Args:
        val: The numerator, or None.
        max_val: The divisor, or None.

    Returns:
        The rounded quotient as a Python float, or None when `val` is None
        or `max_val` is None or zero. Returning None lets a Spark UDF emit
        NULL rather than raising TypeError/ZeroDivisionError and failing
        the whole job.
    """
    # `not max_val` covers None, 0 and 0.0.
    if val is None or not max_val:
        return None
    # np.round returns a NumPy scalar; convert it to a plain Python float.
    return float(np.round((val / max_val), 5))

# Wrap `div` as a Spark UDF whose declared return type is double.
div_udf = udf(div, DoubleType())

# Replace each numeric feature with its value divided by the column maximum
# computed earlier. 'District', 'Province' and 'Price' are left untouched.
df5 = df4
for feature, feature_max in [
    ("Width", w_max),
    ("Length", l_max),
    ("Bedrooms", be_max),
    ("Bathrooms", ba_max),
    ("Distribute", d_max),
]:
    df5 = df5.withColumn(feature, div_udf(col(feature), lit(feature_max)))

df5.show()
print(df5.count())

# Repartition to one partition and write the result as comma-separated CSV
# with a header row, overwriting any existing output at the target path.
csv_writer = (
    df5.repartition(1)
    .write.format('csv')
    .option('header', True)
    .mode('overwrite')
    .option('sep', ',')
)
csv_writer.save('/user/hadoopuser/data_pre_train')

print('Done!!!')
spark.stop()

+------+------+--------+---------+----------+---------------+-----------+----------+
| Width|Length|Bedrooms|Bathrooms|  District|       Province|      Price|Distribute|
+------+------+--------+---------+----------+---------------+-----------+----------+
| 0.044|  0.68| 0.13333|      0.6|   thu duc| tp ho chi minh| 6600000000|   0.10681|
|  0.13|  0.23| 0.16667|      0.6|    quan 1| tp ho chi minh|65000000000|   0.55834|
|0.1575|  0.19|     0.2|      0.6|    ba ria|ba ria-vung tau| 8000000000|   0.02982|
| 0.122|  0.25|     0.2|      0.3|  tan binh| tp ho chi minh|45000000000|   0.45719|
| 0.122|  0.25|     0.2|      0.3|  tan binh| tp ho chi minh|45000000000|   0.45719|
|  0.18|  0.31| 0.83333|      0.3|    quan 8| tp ho chi minh|35000000000|   0.50202|
|  0.08|   0.2| 0.06667|      0.2| luong son|       hoa binh|  780000000|   0.00559|
|  0.08|   0.2| 0.06667|      0.2| luong son|       hoa binh| 1500000000|   0.00559|
|  0.08|   0.2|     0.2|      0.5|binh thanh| tp ho chi minh|2300