In [None]:
# 08_Regression_1.ipynb

In [30]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("text-fare-prediction").getOrCreate()

In [18]:
import os
cwd =os.getcwd()
trip_data_path = os.path.join(cwd, 'learning_spark_data','trips','*.csv')
trip_data_path


'/home/jovyan/work/learning_spark_data/trips/*.csv'

In [19]:
file_path = f"file:///{trip_data_path.replace(os.sep,'/') }"
file_path

'file:////home/jovyan/work/learning_spark_data/trips/*.csv'

In [22]:
trip_df = spark.read.csv(file_path, inferSchema=True, header=True)
trip_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [31]:
trip_df.createOrReplaceTempView('trips')

In [32]:
query = """
SELECT
    trip_distance,
    total_amount
FROM trips

WHERE total_amount < 5000
  AND total_amount > 0
  AND trip_distance > 0
  AND trip_distance < 500
  AND passenger_count < 4
  AND TO_DATE(tpep_pickup_datetime) >= "2021-01-01"
  AND TO_DATE(tpep_pickup_datetime) < "2021-08-01"
"""

In [33]:
trip_df = spark.sql(query)
trip_df.createOrReplaceTempView('data')

In [34]:
query = '''
SELECT * 
FROM DATA
LIMIT 5
'''

spark.sql(query).show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         16.5|       70.07|
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
+-------------+------------+



In [41]:
# train, test split 0.2 , seed =1
train_data, test_data = trip_df.randomSplit([0.8, 0.2], seed=1)

In [42]:
# VectorAssembler > feature : trip_distance, target: total_amount
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=["trip_distance"],  # feature 컬럼
    outputCol="features"          # 생성될 벡터 컬럼명
)

train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)


In [43]:
# LinearRegression 생성 maxIter=50 labelCol='total_amount' featuresCol= 'feature'
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=50, labelCol='total_amount', featuresCol='features')

In [44]:
# fit
lr_model = lr.fit(train_data)

In [47]:
# pred
pred = lr_model.transform(test_data)

In [48]:
pred.select("trip_distance", "total_amount", "prediction").show(5)

+-------------+------------+-----------------+
|trip_distance|total_amount|       prediction|
+-------------+------------+-----------------+
|         0.01|         3.3|9.430440745312902|
|         0.01|         3.3|9.430440745312902|
|         0.01|         3.3|9.430440745312902|
|         0.01|         3.3|9.430440745312902|
|         0.01|         3.3|9.430440745312902|
+-------------+------------+-----------------+
only showing top 5 rows



In [54]:
# 평가
lr_model.summary.rootMeanSquaredError #RMSE - 학습(train) 데이터에 대한 오차

6.30781413196623

In [55]:
# 평가
lr_model.summary.r2 # 학습 데이터 기준

0.7648633777017714

In [None]:
# 새로운 데이터로 예측하기

In [60]:
from pyspark.sql.types import DoubleType

In [63]:
new_distance_list = [1.1, 9.4, 10.2, 30.0]
distance_df = spark.createDataFrame(new_distance_list, DoubleType()).toDF('trip_distance')
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          9.4|
|         10.2|
|         30.0|
+-------------+



In [64]:
distance_df = assembler.transform(distance_df)
lr_model.transform(distance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.672809485363317|
|          9.4|   [9.4]| 37.36240631327014|
|         10.2|  [10.2]| 39.74212648945393|
|         30.0|  [30.0]| 98.64020085000274|
+-------------+--------+------------------+

