In [None]:
# BigQuery to GCS
# Reference SQL only: the query below is a bare string literal, so running this
# cell does not execute it (the notebook merely echoes the text).
# The statement creates `dsp_demo.natality` from 10000 randomly ordered rows of
# the public `bigquery-public-data.samples.natality` table.
# NOTE(review): the export of that table to GCS is not shown in this cell —
# presumably done separately in BigQuery; confirm.
"""
create table dsp_demo.natality as (
  select *
  from `bigquery-public-data.samples.natality`
  order by rand()
  limit 10000 
) 
"""

In [None]:
# Copy the local file dsdemo.json to S3 as secrets/gcp_credentials.json; a
# later cell reads this object back and uses it as the GCP key file.
!aws s3 cp dsdemo.json s3://spark_pipeline/secrets/gcp_credentials.json

# List the secrets/ prefix so the upload can be checked by eye.
!aws s3 ls  s3://spark_pipeline/secrets/

In [None]:
# Copy the GCP key file from S3 to a local path; the Hadoop GCS settings in the
# next cell are pointed at this local file.
creds_file = '/databricks/gcp_credentials.json'
creds = sc.textFile('s3://spark_pipeline/secrets/gcp_credentials.json')

# collect() returns every line of the (small) key file. A fixed take(100)
# would silently drop anything past line 100 and leave invalid JSON on disk.
with open(creds_file, 'w') as key_file:
    key_file.writelines(line + "\n" for line in creds.collect())

In [None]:
# Register the GCS filesystem implementation, the project id, and the key file
# location on the SparkContext's Hadoop configuration.
hadoop_conf = sc._jsc.hadoopConfiguration()

# (key, value) pairs, applied in this order.
gcs_settings = [
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.project.id", "my_project_id"),
    ("mapred.bq.auth.service.account.json.keyfile", creds_file),
    ("fs.gs.auth.service.account.json.keyfile", creds_file),
]

for setting_key, setting_value in gcs_settings:
    hadoop_conf.set(setting_key, setting_value)

In [None]:
# Load the natality sample (Avro files on GCS) and show the raw rows.
natality_path = "gs://dsp_model_store/natality/avro"
natality_raw_df = spark.read.format("avro").load(natality_path)
display(natality_raw_df)

# The SQL below refers to the raw data through this temp view name.
natality_raw_df.createOrReplaceTempView("natality_df")

# Fixed seed so the train/test assignment is repeatable across runs (for the
# same input files and partitioning); an unseeded rand() gives a different
# split on every execution.
SPLIT_SEED = 42

# Select the model columns, encode mother_married as 0/1, rename the label to
# `weight`, and flag roughly half of the rows as test data. Nulls become 0.
natality_df = spark.sql("""
SELECT year, plurality, apgar_5min,
       mother_age, father_age,
       gestation_weeks, ever_born
       ,case when mother_married = true
             then 1 else 0 end as mother_married
       ,weight_pounds as weight
       ,case when rand({seed}) < 0.5 then 1 else 0 end as test
from natality_df
""".format(seed=SPLIT_SEED)).fillna(0)

trainDF = natality_df.filter("test == 0")
testDF = natality_df.filter("test == 1")
display(natality_df)

In [None]:
from pyspark.ml.feature import VectorAssembler

# Create a vector representation of the input columns.
# Features are chosen by name (everything except the label and the split
# flag) rather than by position, so reordering or extending the upstream
# SELECT cannot leak `weight` or `test` into the feature vector.
label_col = 'weight'
non_feature_cols = {label_col, 'test'}
feature_cols = [name for name in trainDF.schema.names
                if name not in non_feature_cols]

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Keep only the label and the assembled vector for modelling.
trainVec = assembler.transform(trainDF).select('weight', 'features')
testVec = assembler.transform(testDF).select('weight', 'features')

In [None]:
# Model
from pyspark.ml.tuning import ParamGridBuilder 
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.tuning import CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator

# Hyper-parameter search settings.
folds = 3
rf_trees = [50, 100]
rf_depth = [4, 5]

# Fixed seed for both the forest and the fold assignment so that repeated
# runs select the same model; without it every run is randomised.
MODEL_SEED = 42

rf = RandomForestRegressor(featuresCol='features', labelCol='weight',
                           seed=MODEL_SEED)

# Every combination of tree count and depth is evaluated.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, rf_trees)
             .addGrid(rf.maxDepth, rf_depth)
             .build())

# The evaluator is left on its default metric, as before.
crossval = CrossValidator(estimator=rf,
                          estimatorParamMaps=paramGrid,
                          evaluator=RegressionEvaluator(labelCol='weight'),
                          numFolds=folds,
                          seed=MODEL_SEED)
rfModel = crossval.fit(trainVec)

# Score the held-out rows, keeping the actual and predicted weight.
predsDF = rfModel.transform(testVec).select("weight", "prediction")

In [None]:
import time

# Write the predictions as Avro to a GCS folder whose name carries the current
# time in milliseconds, then print the folder so it can be located.
# The .format(...) call is kept inside one expression: a line ending in a bare
# "." with the call on the next line is a SyntaxError.
out_path = "gs://spark_pipeline/natality/preds-{time}/".format(
    time=int(time.time() * 1000))
predsDF.write.mode('overwrite').format("avro").save(out_path)
print(out_path)