In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("taxi-fare-predict").getOrCreate()
spark

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/20 11:19:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
directory = "/home/ubuntu/working/datasource"
trip_files = "/trips/*"

trips_df = spark.read.csv(f"file:///{directory}/{trip_files}", inferSchema=True, header=True)
trips_df.printSchema()



root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



                                                                                

In [3]:
trips_df.createOrReplaceTempView("trips")

In [4]:
# 데이터 정제
query = """
SELECT
    t.trip_distance,
    t.total_amount

FROM trips t

WHERE t.total_amount < 200
  AND t.total_amount > 0
  AND t.passenger_count < 5
  AND TO_DATE(t.tpep_pickup_datetime) >= '2021-01-01'
  AND TO_DATE(t.tpep_pickup_datetime) < '2021-08-01'
  AND t.trip_distance < 10
  AND t.trip_distance > 0
"""

data_df = spark.sql(query)

In [5]:
data_df.show(5)

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          0.8|         8.8|
|          0.9|         8.8|
|          2.8|        13.8|
|          1.4|        12.3|
|          2.0|        12.3|
+-------------+------------+
only showing top 5 rows



# train / test 분할

In [6]:
train_sdf, test_sdf = data_df.randomSplit([0.8, 0.2], seed=42)

In [8]:
train_sdf.cache()

DataFrame[trip_distance: double, total_amount: double]

# 벡터 어셈블링
`feature`가 하나만 있어도 벡터 형식으로 데이터를 뭉쳐야 한다.

In [9]:
from pyspark.ml.feature import VectorAssembler

vec_assembler = VectorAssembler(
    inputCols=["trip_distance"],
    outputCol= "features"
)

In [10]:
vec_train_sdf = vec_assembler.transform(train_sdf)
vec_train_sdf.show(5)

[Stage 3:>                                                          (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|        1.75|  [0.01]|
|         0.01|         3.0|  [0.01]|
|         0.01|         3.0|  [0.01]|
|         0.01|         3.0|  [0.01]|
|         0.01|         3.0|  [0.01]|
+-------------+------------+--------+
only showing top 5 rows



                                                                                

# 모델 생성 및 훈련

In [11]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression(
    maxIter=50,
    featuresCol='features',
    labelCol='total_amount'
)

In [12]:
lr_model = lr.fit(vec_train_sdf)

23/11/20 11:32:03 WARN Instrumentation: [6497cbe7] regParam is zero, which might cause numerical instability and overfitting.
23/11/20 11:32:03 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/11/20 11:32:03 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/11/20 11:32:56 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

# 예측

In [13]:
# 테스트 데이터 변환
vec_test_sdf = vec_assembler.transform(test_sdf)
vec_test_sdf.show(5)

[Stage 8:>                                                          (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.0|  [0.01]|
|         0.01|         3.0|  [0.01]|
|         0.01|        3.26|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 5 rows



                                                                                

In [14]:
predictions = lr_model.transform(vec_test_sdf)
predictions.show(5)

[Stage 9:>                                                          (0 + 1) / 1]

+-------------+------------+--------+----------------+
|trip_distance|total_amount|features|      prediction|
+-------------+------------+--------+----------------+
|         0.01|         3.0|  [0.01]|7.96717487732717|
|         0.01|         3.0|  [0.01]|7.96717487732717|
|         0.01|        3.26|  [0.01]|7.96717487732717|
|         0.01|         3.3|  [0.01]|7.96717487732717|
|         0.01|         3.3|  [0.01]|7.96717487732717|
+-------------+------------+--------+----------------+
only showing top 5 rows



                                                                                

# 평가
lr_model.summary.rootMeanSquaredError

In [15]:
# RMSE 확인
lr_model.summary.rootMeanSquaredError

3.5112139987526794

In [16]:
# R^2 확인
lr_model.summary.r2

0.7630572156958163

In [17]:
spark.stop()