In [1]:
from pyspark.sql import SparkSession

In [2]:
# Memory setting applied to both the executors and the driver.
MAX_MEMORY = "5g"

# Create (or reuse) the Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediciton")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

22/05/20 20:56:44 WARN Utils: Your hostname, gim-yelin-ui-iMac.local resolves to a loopback address: 127.0.0.1; using 192.168.219.101 instead (on interface en1)
22/05/20 20:56:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/20 20:56:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Root directory of the prepared data; the train/ and test/ Parquet folders are read from it.
# NOTE(review): this is an absolute, machine-specific path, and it already ends with "/",
# so f"{data_dir}/train/" yields a doubled slash — consider making it configurable.
data_dir = "/Users/yello-ow/taxi/spark/data/"

In [4]:
# Load the train and test splits (Parquet folders under data_dir), in that order.
train_df, test_df = (
    spark.read.parquet(f"{data_dir}/{split}/") for split in ("train", "test")
)

                                                                                

In [5]:
# Small subset used only for hyperparameter tuning: a seeded sample of roughly
# 10% of the training rows, drawn without replacement.
toy_df = train_df.sample(withReplacement=False, fraction=0.1, seed=1)

In [6]:
# Show the column names and types of the tuning sample.
toy_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)



## Pre-Processing

In [7]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# Categorical columns: each one is label-indexed and then one-hot encoded.
cat_feats = ["pickup_location_id", "dropoff_location_id", "day_of_week"]

# Shared list of pipeline stages; the following cells append to it in place.
stages = []

for col_name in cat_feats:
    idx_col = col_name + "_idx"
    # handleInvalid="keep" so that invalid/unseen values do not raise at transform time.
    indexer = StringIndexer(inputCol=col_name, outputCol=idx_col, handleInvalid="keep")
    encoder = OneHotEncoder(inputCols=[idx_col], outputCols=[col_name + "_onehot"])
    stages.extend([indexer, encoder])

In [8]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Numeric columns: each one is wrapped into a single-element vector and then scaled.
num_feats = ["passenger_count", "trip_distance", "pickup_time"]

for col_name in num_feats:
    # NOTE(review): "_vecotr" is a misspelling of "_vector"; it is kept as-is because it
    # defines the intermediate column names that appear in the transformed DataFrame.
    vec_col = col_name + "_vecotr"
    to_vector = VectorAssembler(inputCols=[col_name], outputCol=vec_col)
    scaler = StandardScaler(inputCol=vec_col, outputCol=col_name + "_scaled")
    stages.extend([to_vector, scaler])

In [9]:
# Combine every one-hot and scaled column into the single vector the regressor consumes.
onehot_cols = [f"{name}_onehot" for name in cat_feats]
scaled_cols = [f"{name}_scaled" for name in num_feats]
assembler_inputs = onehot_cols + scaled_cols

assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="feature_vector")
stages.append(assembler)

## Hyperparameter Tuning

In [10]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder 
from pyspark.ml.evaluation import RegressionEvaluator

# Base estimator for the grid search; regParam and elasticNetParam are left at their
# defaults here and are varied through the parameter grid.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=30,
)

# Stages used for cross-validation: the preprocessing stages followed by the regressor.
# A new list is built, so `stages` itself remains preprocessing-only.
cv_stages = [*stages, lr]

In [11]:
# Build the cross-validation pipeline (preprocessing stages + regressor).
cv_pipeline = Pipeline(stages=cv_stages)

In [12]:
# Search grid: 5 elasticNetParam values x 5 regParam values = 25 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.elasticNetParam, [0.1, 0.2, 0.3, 0.4, 0.5])
    .addGrid(lr.regParam, [0.01, 0.02, 0.03, 0.04, 0.05])
    .build()
)

In [13]:
# 5-fold cross-validation over every combination in param_grid, scored with
# RegressionEvaluator (RMSE, its default metric, stated explicitly) against total_amount.
cross_val = CrossValidator(
    estimator=cv_pipeline,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(labelCol="total_amount", metricName="rmse"),
    numFolds=5,
    # Pin the random fold assignment so the selected hyperparameters are reproducible
    # across kernel restarts (same seed value as the toy_df sample).
    seed=1,
)

In [14]:
# Run the cross-validated grid search on the ~10% sample (toy_df), not the full training set.
cv_model = cross_val.fit(toy_df)

22/05/20 20:57:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
22/05/20 20:57:07 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
22/05/20 20:57:08 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/05/20 20:57:08 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [15]:
# The tuned regression model is the last stage of the best pipeline found by CV.
best_lr_model = cv_model.bestModel.stages[-1]

# Read the winning hyperparameters through the public param getters instead of the
# private py4j handle (`_java_obj`). The getters exist on fitted models in Spark 3.0+.
alpha = best_lr_model.getElasticNetParam()
reg_param = best_lr_model.getRegParam()

## Training

In [16]:
# `stages` holds only the preprocessing stages (the regressor was added to the separate
# cv_stages list), so this pipeline produces feature_vector without fitting a model.
transform_stages = stages
pipeline = Pipeline(stages=transform_stages)
# Fit the indexers/encoders/scalers on the full training set.
fitted_transformer = pipeline.fit(train_df)

                                                                                

In [17]:
# Apply the fitted preprocessing stages to the training data to add feature_vector.
vtrain_df = fitted_transformer.transform(train_df)

In [18]:

# Final estimator: same solver and label/feature columns as the tuning run, now using
# the cross-validated regularisation settings and a higher iteration cap.
# Note that this rebinds `lr`, replacing the estimator used for the grid search.
lr = LinearRegression(
    featuresCol="feature_vector",
    labelCol="total_amount",
    solver="normal",
    maxIter=50,
    regParam=reg_param,
    elasticNetParam=alpha,
)

In [19]:
# Inspect the columns added by the preprocessing pipeline.
vtrain_df.printSchema()

root
 |-- passenger_count: integer (nullable = true)
 |-- pickup_location_id: integer (nullable = true)
 |-- dropoff_location_id: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_time: integer (nullable = true)
 |-- day_of_week: string (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- pickup_location_id_idx: double (nullable = false)
 |-- pickup_location_id_onehot: vector (nullable = true)
 |-- dropoff_location_id_idx: double (nullable = false)
 |-- dropoff_location_id_onehot: vector (nullable = true)
 |-- day_of_week_idx: double (nullable = false)
 |-- day_of_week_onehot: vector (nullable = true)
 |-- passenger_count_vecotr: vector (nullable = true)
 |-- passenger_count_scaled: vector (nullable = true)
 |-- trip_distance_vecotr: vector (nullable = true)
 |-- trip_distance_scaled: vector (nullable = true)
 |-- pickup_time_vecotr: vector (nullable = true)
 |-- pickup_time_scaled: vector (nullable = true)
 |-- feature_vector: vector (nul

In [20]:
# Fit the final regression model on the full, preprocessed training set.
model = lr.fit(vtrain_df)

                                                                                

In [21]:
# Reuse the transformer fitted on train_df so the test features get the same
# encodings and scaling as the training features.
vtest_df = fitted_transformer.transform(test_df)

In [22]:
# Score the preprocessed test set; this adds a "prediction" column.
predictions = model.transform(vtest_df)

In [23]:
# Mark the predictions for caching before displaying them. cache() is lazy: nothing is
# computed until an action such as show() runs.
predictions.cache()

DataFrame[passenger_count: int, pickup_location_id: int, dropoff_location_id: int, trip_distance: double, pickup_time: int, day_of_week: string, total_amount: double, pickup_location_id_idx: double, pickup_location_id_onehot: vector, dropoff_location_id_idx: double, dropoff_location_id_onehot: vector, day_of_week_idx: double, day_of_week_onehot: vector, passenger_count_vecotr: vector, passenger_count_scaled: vector, trip_distance_vecotr: vector, trip_distance_scaled: vector, pickup_time_vecotr: vector, pickup_time_scaled: vector, feature_vector: vector, prediction: double]

In [24]:
# Compare actual fares with predicted fares for the first rows of the test set.
predictions.select("trip_distance", "day_of_week", "total_amount", "prediction").show()

[Stage 3046:>                                                       (0 + 1) / 1]

+-------------+-----------+------------+------------------+
|trip_distance|day_of_week|total_amount|        prediction|
+-------------+-----------+------------+------------------+
|          0.8|  Wednesday|         5.8|  7.69462101243899|
|          4.8|  Wednesday|        28.5|24.010347213535233|
|          2.1|    Tuesday|        15.8|15.583409150386142|
|          3.5|    Tuesday|       22.45|19.468292024222382|
|          6.1|     Monday|        26.3| 28.39942675137028|
|          2.1|     Sunday|        14.8|13.946336976972514|
|          0.4|    Tuesday|         5.8| 7.634021550500015|
|          1.0|     Friday|         9.8| 9.203781746421113|
|          1.3|   Saturday|         8.3| 9.174011850628828|
|          2.2|    Tuesday|       15.95|13.403969441237292|
|          5.2|     Monday|       26.15|23.890497962133416|
|         12.4|   Saturday|        51.6|48.483750631227565|
|          2.0|    Tuesday|       14.75|  13.4131209508338|
|          4.1|     Monday|        22.8|

                                                                                

In [25]:
# NOTE(review): model.summary is the training summary, so this RMSE is measured on
# vtrain_df, not on the test predictions computed above.
model.summary.rootMeanSquaredError

3.789833058355195

In [26]:
# NOTE(review): as with the RMSE, this R^2 comes from the training summary (vtrain_df),
# not from the held-out test set.
model.summary.r2

0.895397549211991

### model save

In [27]:
# Persist the fitted regression model.
model_dir = "/Users/yello-ow/taxi/spark/data/model"
# write().overwrite() keeps this cell re-runnable; a plain save() raises an error when
# model_dir already exists from a previous run.
model.write().overwrite().save(model_dir)

                                                                                

In [28]:
# Persist the preprocessing pipeline definition.
pipe_dir = "/Users/yello-ow/taxi/spark/data/pipeline"
# write().overwrite() keeps this cell re-runnable; a plain save() raises an error when
# pipe_dir already exists from a previous run.
# NOTE(review): `pipeline` is the unfitted Pipeline. The fitted preprocessing (learned
# index labels and scaler statistics) lives in `fitted_transformer`; confirm which of
# the two should be stored for inference on new data.
pipeline.write().overwrite().save(pipe_dir)

In [29]:
## Load the saved model

In [30]:
from pyspark.ml.regression import LinearRegressionModel

In [31]:
# load() is a classmethod, so call it on the class instead of first constructing an
# empty LinearRegressionModel instance.
lr_model = LinearRegressionModel.load(model_dir)

In [32]:
# Score the same preprocessed test set with the reloaded model (rebinds `predictions`).
predictions = lr_model.transform(vtest_df)

In [33]:
# Display predictions from the reloaded model for the first rows of the test set.
predictions.select('trip_distance', 'pickup_time', 'day_of_week', 'prediction').show()

+-------------+-----------+-----------+------------------+
|trip_distance|pickup_time|day_of_week|        prediction|
+-------------+-----------+-----------+------------------+
|          0.8|          8|  Wednesday|  7.69462101243899|
|          4.8|          7|  Wednesday|24.010347213535233|
|          2.1|         19|    Tuesday|15.583409150386142|
|          3.5|          7|    Tuesday|19.468292024222382|
|          6.1|         13|     Monday| 28.39942675137028|
|          2.1|         11|     Sunday|13.946336976972514|
|          0.4|         17|    Tuesday| 7.634021550500015|
|          1.0|          8|     Friday| 9.203781746421113|
|          1.3|         10|   Saturday| 9.174011850628828|
|          2.2|         17|    Tuesday|13.403969441237292|
|          5.2|          7|     Monday|23.890497962133416|
|         12.4|         10|   Saturday|48.483750631227565|
|          2.0|         10|    Tuesday|  13.4131209508338|
|          4.1|         16|     Monday| 20.2169299509963

In [34]:
# Stop the Spark session now that the notebook is finished.
spark.stop()