In [1]:
# spark session 생성 
from pyspark.sql import SparkSession

In [2]:
# 인스턴스 생성 (Out of memory 방지를 위해 MAX_MEMORY 설정)
MAX_MEMORY="5g"
spark = SparkSession.builder.appName("taxi-fare-prediciton")\
                .config("spark.executor.memory", MAX_MEMORY)\
                .config("spark.driver.memory", MAX_MEMORY)\
                .getOrCreate()

22/05/20 15:13:27 WARN Utils: Your hostname, gim-yelin-ui-iMac.local resolves to a loopback address: 127.0.0.1; using 192.168.219.101 instead (on interface en1)
22/05/20 15:13:27 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 15:13:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# 파일 불러오기
trip_files = "/Users/yello-ow/taxi/data/trips/*"
trips_df = spark.read.csv(f"file:///{trip_files}", inferSchema=True, header=True)

                                                                                

In [4]:
# 스키마 확인
trips_df.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: string (nullable = true)
 |-- tpep_dropoff_datetime: string (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [5]:
# SQL 사용을 위해 데이터를 TempView에 담기 
trips_df.createOrReplaceTempView("trips")

In [6]:
# 이전 분석을 토대로 데이터 전처리 (이상치 제거)
query = """
SELECT 
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

In [7]:
# 쿼리 적용시킨 데이터를 Tempview에 담기
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data")

In [8]:
# 쿼리 적용 데이터 확인 
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|          0.2|         4.3|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         0.73|       12.09|
|         1.17|       12.36|
|         0.78|        9.96|
|         1.66|        12.3|
|         0.93|         9.3|
|         1.16|       11.84|
+-------------+------------+
only showing top 20 rows



In [9]:
# 데이터 통계치 확인 
data_df.describe().show()

[Stage 3:>                                                          (0 + 8) / 8]

+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|           1174301|           1174301|
|   mean|2.6433179823571806|16.465600429524688|
| stddev|3.4295448542568203|11.700068144721914|
|    min|              0.01|              0.01|
|    max|             427.7|            2292.4|
+-------+------------------+------------------+



                                                                                

In [10]:
# train/test set split
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=5)

In [11]:
print(train_df.count())
print(test_df.count())

                                                                                

939563




234738


                                                                                

In [12]:
# spark에서 모델 학습을 위해 필요한 library
from pyspark.ml.feature import VectorAssembler

In [13]:
vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")

In [14]:
# train 데이터에 vectorassembler 적용 
vtrain_df = vassembler.transform(train_df)

In [15]:
vtrain_df.show()

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         3.8|  [0.01]|
|         0.01|         4.8|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



[Stage 12:>                                                         (0 + 1) / 1]                                                                                

In [16]:
# 선형회귀모델 
from pyspark.ml.regression import LinearRegression

In [17]:
# baseline 모델
lr = LinearRegression(
    maxIter=50,
    labelCol="total_amount",
    featuresCol="features"
)

In [18]:
# 모델 train
model = lr.fit(vtrain_df)

22/05/20 15:13:49 WARN Instrumentation: [3d7a66f6] regParam is zero, which might cause numerical instability and overfitting.
22/05/20 15:13:50 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/20 15:13:50 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/05/20 15:13:51 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [19]:
# 모델 test
vtest_df = vassembler.transform(test_df)

In [20]:
prediction = model.transform(vtest_df)

In [21]:
prediction.show()

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|8.035911836584692|
|         0.01|         5.8|  [0.01]|8.035911836584692|
|         0.01|         5.8|  [0.01]|8.035911836584692|
|         0.01|         6.3|  [0.01]|8.035911836584692|
|         0.01|         6.8|  [0.01]|8.035911836584692|
|         0.01|         7.3|  [0.01]|8.035911836584692|
|         0.01|         7.8|  [0.01]|8.035911836584692|
|         0.01|        14.8|  [0.01]|8.035911836584692|
|         0.01|        35.8|  [0.01]|8.035911836584692|
|         0.01|        52.8|  [0.01]|8.035911836584692|
|         0.01|        55.3|  [0.01]|8.035911836584692|
|         0.02|         3.3|  [0.02]|8.067935043612483|
|         0.02|         3.3|  [0.02]|8.067935043612483|
|         0.02|         3.3|  [0.02]|8.067935043612483|
|         0.02|         3.7|  [0.02]|8.067935043

In [22]:
# 모델 평가
model.summary.rootMeanSquaredError

4.093535599192618

In [23]:
model.summary.r2

0.8793536246241528

In [24]:
# 모델을 서비스에 적용하기 위한 단계 
from pyspark.sql.types import DoubleType
distance_list = [1.1, 5.5, 10.5, 30.0]
distance_df = spark.createDataFrame(distance_list, DoubleType()).toDF("trip_distance")

In [25]:
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



In [26]:
vdistance_df = vassembler.transform(distance_df)

In [27]:
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [28]:
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|11.526441402613882|
|          5.5|   [5.5]|25.616652494841805|
|         10.5|  [10.5]| 41.62825600873717|
|         30.0|  [30.0]| 104.0735097129291|
+-------------+--------+------------------+



In [29]:
spark.stop()