In [1]:
from pyspark.sql import SparkSession

In [2]:
# Memory budget applied to both the driver and the executors.
MAX_MEMORY = '25g'

# Build (or reuse) the Spark session for this notebook.
# The executor setting must be spelled "spark.executor.memory"; a misspelled
# key is not a recognised setting, so the intended executor memory would
# silently not be applied.
spark = (
    SparkSession.builder.appName('taxi-fare-predication')
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/12/26 02:53:34 WARN Utils: Your hostname, PSui-MacBookPro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.7 instead (on interface en0)
22/12/26 02:53:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/26 02:53:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/26 02:53:34 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Relative path to the data root; the train/ and test/ parquet folders are read from under it.
data_directory = '../../../../data'

In [4]:
# Load the train and test splits from their parquet folders under data_directory.
train_path = f'{data_directory}/train/'
test_path = f'{data_directory}/test/'
train_df = spark.read.parquet(train_path)
test_df = spark.read.parquet(test_path)

                                                                                

In [6]:
# Draw a roughly 10% sample without replacement (fixed seed) to keep the
# hyperparameter search cheap, then confirm the column types.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)
toy_df.printSchema()

root
 |-- passenger_count: double (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Columns treated as categorical: each is indexed and then one-hot encoded.
cat_feats = [
    'pickup_location_id',
    'dropoff_location_id',
    'day_of_week',
]

# Pipeline stages are accumulated in this list; it is reset here so that
# re-running this cell starts from a clean list.
stages = []

for col_name in cat_feats:
    idx_col = col_name + '_idx'
    # handleInvalid='keep' puts values unseen at fit time into an extra index
    # instead of raising during transform.
    indexer = StringIndexer(inputCol=col_name, outputCol=idx_col, handleInvalid='keep')
    encoder = OneHotEncoder(inputCols=[idx_col], outputCols=[col_name + '_onehot'])
    stages.extend([indexer, encoder])

In [8]:
# Inspect the stages collected so far (one indexer/encoder pair per categorical column).
stages

[StringIndexer_88972bb9c2e6,
 OneHotEncoder_4387a7a8bc9a,
 StringIndexer_71bb39a7efc2,
 OneHotEncoder_3a856e4c5a96,
 StringIndexer_2c633fc2f52c,
 OneHotEncoder_d048548160b0]

- StringIndexer + OneHotEncoder 하나씩 세트가 됨.

In [9]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns whose magnitude is meaningful (an increase or decrease carries meaning).
num_feats = [
    'passenger_count',
    'trip_distance',
    'pickup_time',
]

# Each numeric column is wrapped in its own one-element vector and then scaled.
# Note: this appends to `stages`, so re-running this cell without first
# re-running the cell that resets `stages` adds duplicate stages.
for col_name in num_feats:
    vector_col = col_name + '_vector'
    to_vector = VectorAssembler(inputCols=[col_name], outputCol=vector_col)
    scaler = StandardScaler(inputCol=vector_col, outputCol=col_name + "_scaled")
    stages.extend([to_vector, scaler])

In [10]:
# Inspect the stages again: the numeric assembler/scaler pairs now follow the categorical pairs.
stages

[StringIndexer_88972bb9c2e6,
 OneHotEncoder_4387a7a8bc9a,
 StringIndexer_71bb39a7efc2,
 OneHotEncoder_3a856e4c5a96,
 StringIndexer_2c633fc2f52c,
 OneHotEncoder_d048548160b0,
 VectorAssembler_6b2d438f8f42,
 StandardScaler_69a33ba6ef5f,
 VectorAssembler_3e5c82a96657,
 StandardScaler_40ee98f28591,
 VectorAssembler_289438ca26f9,
 StandardScaler_a819fca2b600]

## 두 개를 합치는 작업은 VectorAssembler로 가능하다.

In [11]:
# The final feature vector is the one-hot categorical columns followed by the
# scaled numeric columns.
onehot_cols = [name + '_onehot' for name in cat_feats]
scaled_cols = [name + '_scaled' for name in num_feats]
assembler_inputs = onehot_cols + scaled_cols
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol='feature_vector')
stages.append(assembler)

In [12]:
# HyperParameter Tuning

In [13]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator

# Base regressor used during the search; regParam and elasticNetParam are
# supplied by the parameter grid.
lr = LinearRegression(
    featuresCol='feature_vector',
    labelCol='total_amount',
    solver='normal',
    maxIter=50,
)

# Stages for the cross-validation pipeline: the feature stages plus the
# regressor. A new list is built so `stages` itself stays feature-only.
cv_stages = [*stages, lr]

In [14]:
# Estimator handed to CrossValidator: the feature stages followed by the regressor.
cv_pipeline = Pipeline(stages=cv_stages)

In [15]:
# 5 x 5 grid (25 combinations) over the elastic-net mixing parameter and the
# regularisation strength of `lr`.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
grid_builder = grid_builder.addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
param_grid = grid_builder.build()

In [16]:
# 5-fold cross-validation over the parameter grid, scored with the default
# metric of RegressionEvaluator against the `total_amount` label.
# A fixed seed makes the fold assignment, and therefore the selected
# hyperparameters, reproducible across kernel restarts.
cross_val = CrossValidator(estimator=cv_pipeline,
                           estimatorParamMaps=param_grid,
                           evaluator=RegressionEvaluator(labelCol='total_amount'),
                           numFolds=5,
                           seed=1)


In [17]:
# Run the grid search on the 10% sample (toy_df) rather than on the full training set.
cv_model = cross_val.fit(toy_df)

[Stage 20:>                                                       (0 + 10) / 12]

22/12/26 03:07:13 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/12/26 03:07:13 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/12/26 03:07:13 WARN InstanceBuilder$JavaBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


                                                                                

22/12/26 03:07:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/26 03:07:14 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS


                                                                                

## 최적의 parameter를 알아냄

In [18]:
# The last stage of the best pipeline is the fitted regression model
# (cv_stages ends with `lr`). Read the winning hyperparameters through the
# model's public getters rather than the private `_java_obj` handle.
best_lr_model = cv_model.bestModel.stages[-1]
alpha = best_lr_model.getElasticNetParam()
reg_param = best_lr_model.getRegParam()

## Training

In [19]:
# Feature-only pipeline (no regressor), fitted on the full training set so the
# indexers and scalers learn their state from all training rows.
transform_stages = stages
pipeline = Pipeline().setStages(transform_stages)
fitted_transformer = pipeline.fit(train_df)


                                                                                

In [20]:
# Apply the fitted feature pipeline to the training data; this adds the
# *_idx, *_onehot, *_vector, *_scaled and feature_vector columns.
v_train_df = fitted_transformer.transform(train_df)

In [21]:
# Final regressor: same settings as the search estimator, plus the
# hyperparameters selected by cross-validation.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=reg_param,
    elasticNetParam=alpha,
)

In [23]:
# Check that the engineered columns, including feature_vector, are present.
v_train_df.printSchema()

root
 |-- passenger_count: double (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pickup_location_id_idx: double (nullable = false)
 |-- pickup_location_id_onehot: vector (nullable = true)
 |-- dropoff_location_id_idx: double (nullable = false)
 |-- dropoff_location_id_onehot: vector (nullable = true)
 |-- day_of_week_idx: double (nullable = false)
 |-- day_of_week_onehot: vector (nullable = true)
 |-- passenger_count_vector: vector (nullable = true)
 |-- passenger_count_scaled: vector (nullable = true)
 |-- trip_distance_vector: vector (nullable = true)
 |-- trip_distance_scaled: vector (nullable = true)
 |-- pickup_time_vector: vector (nullable = true)
 |-- pickup_time_scaled: vector (nullable = true)
 |-- feature_vector: vector (null

In [24]:
# Fit the final regressor on the full transformed training set.
model = lr.fit(v_train_df)

                                                                                

In [27]:
# Reuse the transformer fitted on train_df so the test rows get the same indexing and scaling.
v_test_df = fitted_transformer.transform(test_df)

In [28]:
# Score the transformed test rows; this adds a `prediction` column.
predictions = model.transform(v_test_df)

In [29]:
# Mark the predictions for caching so later actions on them can reuse the computed rows.
predictions.cache()

DataFrame[passenger_count: double, pickup_location_id: int, dropoff_location_id: int, trip_distance: double, pickup_time: int, day_of_week: string, total_amount: double, pickup_location_id_idx: double, pickup_location_id_onehot: vector, dropoff_location_id_idx: double, dropoff_location_id_onehot: vector, day_of_week_idx: double, day_of_week_onehot: vector, passenger_count_vector: vector, passenger_count_scaled: vector, trip_distance_vector: vector, trip_distance_scaled: vector, pickup_time_vector: vector, pickup_time_scaled: vector, feature_vector: vector, prediction: double]

In [30]:
# Show actual fare (total_amount) next to the predicted fare for a few test rows.
predictions.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()

[Stage 3046:>                                                       (0 + 1) / 1]

+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|          0.7|   Saturday|       12.35|12.382929491402097|
|          3.1|    Tuesday|        18.0|17.722364162327906|
|          2.1|   Saturday|       15.35|16.640029629127632|
|          1.7|   Saturday|        13.3|14.245353623831388|
|          4.1|     Friday|        21.3|20.838734893742604|
|          1.4|     Friday|         8.3|12.209108612619954|
|          7.3|    Tuesday|        29.3| 28.01987861394699|
|          0.7|  Wednesday|         5.8|  9.95413931143023|
|          5.0|  Wednesday|        24.3|21.253243158062972|
|          6.7|   Saturday|        29.8| 37.40742729747107|
|         16.8|     Friday|       82.37| 70.89368339633509|
|         29.3|     Monday|        80.8|102.88940194075326|
|          4.1|     Friday|        20.8|21.994316288800498|
|          0.1|  Wednesday|        55.3|

                                                                                

In [31]:
# RMSE from the model's training summary: `model` was fitted on v_train_df,
# so this is measured on the training data, not on test_df.
model.summary.rootMeanSquaredError

5.849215298909927

In [32]:
# R^2 from the model's training summary: `model` was fitted on v_train_df,
# so this is measured on the training data, not on test_df.
model.summary.r2

0.7969367132378183

In [33]:
# Location where the fitted regression model is persisted.
model_directory = '../../../../data/model'
# Use the writer's overwrite() so this cell can be re-run: a plain save()
# raises an error when the target path already exists.
model.write().overwrite().save(model_directory)


In [35]:
from pyspark.ml.regression import LinearRegressionModel
# `load` is a classmethod, so call it on the class directly instead of first
# constructing an empty, throwaway LinearRegressionModel instance.
lr_model = LinearRegressionModel.load(model_directory)

In [39]:
# Score the same transformed test rows with the reloaded model to check the save/load round trip.
m_predictions = lr_model.transform(v_test_df)

In [40]:
# Same columns as shown for the in-memory model, for a side-by-side comparison of the predictions.
m_predictions.select(["trip_distance", "day_of_week", "total_amount", "prediction"]).show()

+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|          0.7|   Saturday|       12.35|12.382929491402097|
|          3.1|    Tuesday|        18.0|17.722364162327906|
|          2.1|   Saturday|       15.35|16.640029629127632|
|          1.7|   Saturday|        13.3|14.245353623831388|
|          4.1|     Friday|        21.3|20.838734893742604|
|          1.4|     Friday|         8.3|12.209108612619954|
|          7.3|    Tuesday|        29.3| 28.01987861394699|
|          0.7|  Wednesday|         5.8|  9.95413931143023|
|          5.0|  Wednesday|        24.3|21.253243158062972|
|          6.7|   Saturday|        29.8| 37.40742729747107|
|         16.8|     Friday|       82.37| 70.89368339633509|
|         29.3|     Monday|        80.8|102.88940194075326|
|          4.1|     Friday|        20.8|21.994316288800498|
|          0.1|  Wednesday|        55.3|

In [41]:
# Show all columns, including the intermediate feature columns; the output is very wide.
m_predictions.show()

+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+-----------------------+--------------------------+---------------+------------------+----------------------+----------------------+--------------------+--------------------+------------------+--------------------+--------------------+------------------+
|passenger_count|pickup_location_id|dropoff_location_id|trip_distance|pickup_time|day_of_week|total_amount|pickup_location_id_idx|pickup_location_id_onehot|dropoff_location_id_idx|dropoff_location_id_onehot|day_of_week_idx|day_of_week_onehot|passenger_count_vector|passenger_count_scaled|trip_distance_vector|trip_distance_scaled|pickup_time_vector|  pickup_time_scaled|      feature_vector|        prediction|
+---------------+------------------+-------------------+-------------+-----------+-----------+------------+----------------------+-------------------------+----------------------

In [42]:
# Stop the Spark session now that the notebook is finished.
spark.stop()
