In [1]:
from pyspark.sql import SparkSession

In [2]:
# Memory budget applied to both the driver and the executors.
MAX_MEMORY = '25g'

# Build (or reuse) the Spark session for this notebook.
# The executor key must be spelled exactly "spark.executor.memory":
# a misspelled `spark.*` key is not a recognized property, so the
# intended memory limit would not be applied.
spark = (
    SparkSession.builder
    .appName('taxi-fare-predication')
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/12/26 01:55:01 WARN Utils: Your hostname, PSui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.7 instead (on interface en0)
22/12/26 01:55:01 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/26 01:55:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/26 01:55:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Relative path to the root of the data directory.
target_directory = '../../../../data'
# Glob pattern covering every file under the trips/ subdirectory.
trip_files = target_directory + "/trips/*"

In [5]:
# Load every trips CSV matched by the glob; the first row is the header and
# column types are inferred from the data.
trips_df = spark.read.csv(trip_files, header=True, inferSchema=True)

                                                                                

In [6]:
# Print the column names and inferred types of the loaded trips data.
trips_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [7]:
# Register the DataFrame as the temp view 'trips' so it can be queried via spark.sql.
trips_df.createOrReplaceTempView('trips')

# ML에 필요한 데이터만 추출
## 승객이 낸 돈의 총 금액을 예측함
## 터무니없는 금액은 필터링함.

In [8]:
# Keep only the two columns used for modelling (trip_distance is later the
# single feature, total_amount the label) and filter out implausible rows:
#   - total_amount must be in (0, 5000)
#   - trip_distance must be in (0, 500)
#   - passenger_count must be below 4
#   - pickup date must be on/after 2021-01-01 and before 2021-08-01
query = """
SELECT
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000      -- 최대 $5000 달러 미만
    AND total_amount > 0     -- 총 금액 0원 초과
    AND trip_distance > 0    -- 0마일 초과
    AND trip_distance < 500  -- 500마일 미만
    AND passenger_count < 4  -- 탑승자 4명 미만
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

# Run the query against the 'trips' temp view and register the result as the 'data' view.
data_df = spark.sql(query)
data_df.createOrReplaceTempView('data')

In [9]:
# Preview the first rows of the filtered data.
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|          0.2|         4.3|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         0.73|       12.09|
|         1.17|       12.36|
|         0.78|        9.96|
|         1.66|        12.3|
|         0.93|         9.3|
|         1.16|       11.84|
+-------------+------------+
only showing top 20 rows



In [10]:
# Summary statistics (count, mean, stddev, min, max) for both columns.
data_df.describe().show()



+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          13126271|          13126271|
|   mean|2.8820783305479263|17.973117241906895|
| stddev| 3.820284175387752|12.975829282992352|
|    min|              0.01|              0.01|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+



                                                                                

- 평균: 2.88마일, 약 18달러 (약 2만 원)
- 표준 편차: 3.8마일, 12.98달러
- 최대: 475.5마일, 4973.3달러

In [11]:
# Split the filtered data 80/20 into train and test sets, using a fixed seed.
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)

In [12]:
# Report the row count of each split: train first, then test.
for split_df in (train_df, test_df):
    print(split_df.count())

                                                                                

10500537




2625734


                                                                                

In [13]:
from pyspark.ml.feature import VectorAssembler

In [17]:
# Assemble the single input column 'trip_distance' into the vector column
# 'features', which the regression below reads as its feature column.
vassembler = VectorAssembler(
    inputCols=['trip_distance'],
    outputCol='features',
)

In [18]:
# Add the 'features' vector column to the training split.
v_train_df = vassembler.transform(train_df)

In [22]:
# Preview the assembled training data, including the new 'features' column.
v_train_df.show()

[Stage 13:>                                                         (0 + 1) / 1]

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



                                                                                

In [23]:
from pyspark.ml.regression import LinearRegression

In [24]:
# Linear regression estimator: predicts 'total_amount' from the assembled
# 'features' vector, capped at 50 optimizer iterations.
# NOTE(review): no regParam is set; Spark logs a "regParam is zero" warning
# when this estimator is fitted — confirm that no regularization is intended.
lr = LinearRegression(
    featuresCol='features',
    labelCol='total_amount',
    maxIter=50,
)

In [25]:
# Fit the linear regression on the assembled training data.
model = lr.fit(v_train_df)

22/12/26 02:06:46 WARN Instrumentation: [b7d06a7b] regParam is zero, which might cause numerical instability and overfitting.


[Stage 14:>                                                       (0 + 10) / 13]

22/12/26 02:06:48 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
22/12/26 02:06:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/26 02:06:48 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS


                                                                                

22/12/26 02:06:51 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK


                                                                                

In [26]:
# Apply the same assembler to the test split so it also has a 'features' column.
v_test_df = vassembler.transform(test_df)

In [27]:
# Score the test split; transform() appends a 'prediction' column.
predication = model.transform(v_test_df)

In [28]:
# Preview actual total_amount next to the model's prediction on the test split.
predication.show()

[Stage 18:>                                                         (0 + 1) / 1]

+-------------+------------+--------+----------------+
|trip_distance|total_amount|features|      prediction|
+-------------+------------+--------+----------------+
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.3|  [0.01]|9.42546886221043|
|         0.01|         3.8|  [0.01]|9.42546886221043|
|         

                                                                                

# LinearRegression 성능을 측정할 때 아래 두 가지 지표를 사용한다.
- model.summary.rootMeanSquaredError
- model.summary.r2

In [29]:
# Root mean squared error, in the units of total_amount.
# NOTE(review): model.summary presumably describes the data passed to fit(),
# not the held-out test split — confirm; model.evaluate(v_test_df) would give
# the test-set metric instead.
model.summary.rootMeanSquaredError

6.284656849365224

In [30]:
# Coefficient of determination (R^2) from the model summary.
# NOTE(review): presumably computed on the data passed to fit(), not on the
# test split — confirm.
model.summary.r2

0.7660322673703551

- 0.76의 의미
    - R²(결정 계수)가 0.76이라는 것은 total_amount 분산의 약 76%를 trip_distance로 설명할 수 있다는 뜻 (예측의 76%가 맞는다는 의미는 아님)

In [31]:
from pyspark.sql.types import DoubleType

# Sample trip distances to feed to the fitted model.
distance = [1.1, 5.5, 10.5, 30.0]

# One-column DataFrame of doubles, with the column renamed to 'trip_distance'
# so it matches the input column the assembler reads.
distance_df = (
    spark.createDataFrame(distance, DoubleType())
    .toDF('trip_distance')
)

In [32]:
# Show the sample distances.
distance_df.show()

[Stage 22:>                                                         (0 + 1) / 1]

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.5|
|         10.5|
|         30.0|
+-------------+



                                                                                

In [37]:
# Run the inputs through the vector assembler before passing them to the model.

vdistance_df = vassembler.transform(distance_df)

In [38]:
# Show the sample distances together with their assembled 'features' vectors.
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          5.5|   [5.5]|
|         10.5|  [10.5]|
|         30.0|  [30.0]|
+-------------+--------+



In [39]:
# Predict total_amount for each sample distance and display the result.
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.669472738013734|
|          5.5|   [5.5]|25.764534255017907|
|         10.5|  [10.5]| 40.64528597888628|
|         30.0|  [30.0]| 98.68021770197295|
+-------------+--------+------------------+



In [40]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()