# Logistic Regression Cross Validation Script
__`MIDS w261: Machine Learning at Scale | UC Berkeley School of Information | Fall 2018`__




### Notebook Setup

In [1]:
# global variables
# GCP identifiers used by this notebook; the job-submission cell further down
# targets the same project, bucket and cluster.
PROJECT_ID = 'w262-245821' # fill in your GCP project id
BUCKET_NAME = 'w261_sj_data' # fill in the name of your GCP bucket
CLUSTER_NAME = 'w261-sj' # choose a cluster name, this should include only a-z, 0-9 & start with a letter
# Directory that holds the service-account key file: the submission cell
# passes `$HOME/w261.json` as --key_file.
# NOTE(review): absolute, user-specific path -- adjust it per machine.
HOME = '/Users/sid'

### Submission Script
The cell below will create a python script in the current working directory called `submit_job_to_cluster.py` -- this script will help you run your own spark jobs on the cluster. You can read more about it in the [w261-environment](https://github.com/UCB-w261/w261-environment/tree/master/gcp-files/dataproc) repo.

In [2]:
%%writefile submit_job_to_cluster.py
#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Sample command-line program for listing Google Dataproc Clusters"""

import argparse
import os

from google.cloud import storage
import googleapiclient.discovery

# Name of the PySpark script submitted when no --pyspark_file is given; it is
# looked up in the same directory as this script.
DEFAULT_FILENAME = 'pyspark_sort.py'


def get_default_pyspark_file():
    """Open the default PySpark script located beside this module.

    Returns:
        A ``(file_object, filename)`` tuple: the script opened in binary
        read mode, and ``DEFAULT_FILENAME``. The caller is responsible for
        closing the returned handle.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    default_path = os.path.join(script_dir, DEFAULT_FILENAME)
    return open(default_path, 'rb'), DEFAULT_FILENAME


def get_pyspark_file(filename):
    """Open a user-supplied PySpark script.

    Args:
        filename: Path to the script on the local filesystem.

    Returns:
        A ``(file_object, basename)`` tuple: the script opened in binary
        read mode, and the final path component of ``filename``. The caller
        is responsible for closing the returned handle.
    """
    handle = open(filename, 'rb')
    base_name = os.path.basename(filename)
    return handle, base_name


def upload_pyspark_file(project_id, bucket_name, filename, file):
    """Upload an open PySpark script to a Cloud Storage bucket.

    Args:
        project_id: GCP project used to build the storage client.
        bucket_name: Name of the destination bucket.
        filename: Object name to store the script under.
        file: Open file object whose contents are uploaded.
    """
    print('Uploading pyspark file to GCS')
    gcs = storage.Client(project=project_id)
    gcs.get_bucket(bucket_name).blob(filename).upload_from_file(file)


def download_output(project_id, cluster_id, output_bucket, job_id):
    """Fetch the first driver-output chunk of a job from Cloud Storage.

    Args:
        project_id: GCP project used to build the storage client.
        cluster_id: Cluster UUID, used in the object path.
        output_bucket: Bucket that holds the cluster's job metadata.
        job_id: ID of the job whose driver output is wanted.

    Returns:
        Whatever ``Blob.download_as_string()`` yields for the object
        ``.../jobs/<job_id>/driveroutput.000000000``.
    """
    print('Downloading output file')
    blob_path = (
        'google-cloud-dataproc-metainfo/{}/jobs/{}/driveroutput.000000000'
        .format(cluster_id, job_id))
    gcs = storage.Client(project=project_id)
    return gcs.get_bucket(output_bucket).blob(blob_path).download_as_string()


# [START create_cluster]
def create_cluster(dataproc, project, zone, region, cluster_name,
                   instance_type, master_nodes, worker_nodes):
    """Send a cluster-creation request to the Dataproc API.

    Args:
        dataproc: Dataproc API client (as returned by ``get_client``).
        project: GCP project ID.
        zone: Compute zone, embedded in the zone URI.
        region: Dataproc region the request is sent to.
        cluster_name: Name for the new cluster.
        instance_type: Machine type used for master, worker and secondary
            worker nodes.
        master_nodes: Number of master instances.
        worker_nodes: Number of primary worker instances.

    Returns:
        The response of the ``clusters().create(...)`` call.
    """
    print('Creating cluster...')
    zone_uri = (
        'https://www.googleapis.com/compute/v1/projects/{}/zones/{}'
        .format(project, zone))

    # Zone plus the metadata read by the conda initialization actions below.
    gce_config = {
        'zoneUri': zone_uri,
        "metadata": {
            "CONDA_PACKAGES": "\"numpy pandas\"",
            "MINICONDA_VARIANT": "2"
        }
    }
    software_config = {
        'properties': {
            'spark:spark.jars.packages': 'com.databricks:spark-xml_2.11:0.4.1,graphframes:graphframes:0.5.0-spark2.1-s_2.11,com.databricks:spark-avro_2.11:4.0.0'
        }
    }
    master_config = {
        'numInstances': master_nodes,
        'machineTypeUri': instance_type
    }
    worker_config = {
        'numInstances': worker_nodes,
        'machineTypeUri': instance_type
    }
    # Fixed pool of two preemptible secondary workers.
    secondary_worker_config = {
        'numInstances': "2",
        'machineTypeUri': instance_type,
        "isPreemptible": "True"
    }
    init_actions = [
        {"executableFile": "gs://dataproc-initialization-actions/conda/bootstrap-conda.sh"},
        {"executableFile": "gs://dataproc-initialization-actions/conda/install-conda-env.sh"},
    ]

    request_body = {
        'projectId': project,
        'clusterName': cluster_name,
        'config': {
            'gceClusterConfig': gce_config,
            "softwareConfig": software_config,
            'masterConfig': master_config,
            'workerConfig': worker_config,
            'secondaryWorkerConfig': secondary_worker_config,
            "initializationActions": init_actions,
        },
    }
    request = dataproc.projects().regions().clusters().create(
        projectId=project,
        region=region,
        body=request_body)
    return request.execute()
# [END create_cluster]


def wait_for_cluster_creation(dataproc, project_id, region, cluster_name,
                              poll_interval=1.0):
    """Block until the named cluster reports the RUNNING state.

    Args:
        dataproc: Dataproc API client.
        project_id: GCP project ID.
        region: Dataproc region to list clusters in.
        cluster_name: Name of the cluster being created.
        poll_interval: Seconds to sleep between list requests, so the loop
            does not hammer the API.

    Raises:
        LookupError: The cluster is not present in the list response.
        Exception: The cluster entered the ERROR state; the message carries
            the status detail reported for that cluster when available.
    """
    import time  # local import: the module header only imports argparse/os

    print('Waiting for cluster creation...')

    while True:
        result = dataproc.projects().regions().clusters().list(
            projectId=project_id,
            region=region).execute()
        # The 'clusters' key may be absent when nothing is listed.
        cluster = next(
            (c for c in result.get('clusters', [])
             if c['clusterName'] == cluster_name),
            None)
        if cluster is None:
            raise LookupError(
                'Cluster {} not found in project {} (region {})'.format(
                    cluster_name, project_id, region))

        status = cluster['status']
        if status['state'] == 'ERROR':
            # Read the message from the cluster's own status; the list
            # response itself has no 'status' entry.
            raise Exception(
                status.get('detail')
                or status.get('details')
                or 'Cluster {} entered the ERROR state'.format(cluster_name))
        if status['state'] == 'RUNNING':
            print("Cluster created.")
            break
        time.sleep(poll_interval)


# [START list_clusters_with_detail]
def list_clusters_with_details(dataproc, project, region):
    """Print ``<name> - <state>`` for every cluster in a project/region.

    Args:
        dataproc: Dataproc API client.
        project: GCP project ID.
        region: Dataproc region to list clusters in.

    Returns:
        The full response of the ``clusters().list(...)`` call.
    """
    request = dataproc.projects().regions().clusters().list(
        projectId=project,
        region=region)
    response = request.execute()
    for entry in response['clusters']:
        name = entry['clusterName']
        state = entry['status']['state']
        print("{} - {}".format(name, state))
    return response
# [END list_clusters_with_detail]


def get_cluster_id_by_name(cluster_list, cluster_name):
    """Look up a cluster by name in a list of cluster descriptions.

    Args:
        cluster_list: Iterable of cluster dicts from a list response.
        cluster_name: Name of the cluster to find.

    Returns:
        A ``(cluster_uuid, config_bucket)`` tuple for the first match.

    Raises:
        IndexError: No cluster in ``cluster_list`` has that name.
    """
    matches = list(
        filter(lambda entry: entry['clusterName'] == cluster_name,
               cluster_list))
    first = matches[0]
    return first['clusterUuid'], first['config']['configBucket']


# [START submit_pyspark_job]
def submit_pyspark_job(dataproc, project, region,
                       cluster_name, bucket_name, filename):
    """Submit a PySpark job whose main file already lives in Cloud Storage.

    Args:
        dataproc: Dataproc API client.
        project: GCP project ID.
        region: Dataproc region the job is submitted to.
        cluster_name: Cluster the job is placed on.
        bucket_name: Bucket holding the uploaded script.
        filename: Object name of the script inside ``bucket_name``.

    Returns:
        The job ID taken from the submit response.
    """
    main_uri = 'gs://{}/{}'.format(bucket_name, filename)
    job_spec = {
        'placement': {'clusterName': cluster_name},
        'pysparkJob': {'mainPythonFileUri': main_uri},
    }
    request = dataproc.projects().regions().jobs().submit(
        projectId=project,
        region=region,
        body={'projectId': project, 'job': job_spec})
    response = request.execute()
    submitted_id = response['reference']['jobId']
    print('Submitted job ID {}'.format(submitted_id))
    return submitted_id
# [END submit_pyspark_job]


# [START delete]
def delete_cluster(dataproc, project, region, cluster):
    """Send a delete request for a cluster.

    Args:
        dataproc: Dataproc API client.
        project: GCP project ID.
        region: Dataproc region the cluster lives in.
        cluster: Name of the cluster to delete.

    Returns:
        The response of the ``clusters().delete(...)`` call.
    """
    print('Tearing down cluster')
    request = dataproc.projects().regions().clusters().delete(
        projectId=project,
        region=region,
        clusterName=cluster)
    return request.execute()
# [END delete]


# [START wait]
def wait_for_job(dataproc, project, region, job_id, poll_interval=1.0):
    """Block until a Dataproc job reaches a terminal state.

    Args:
        dataproc: Dataproc API client.
        project: GCP project ID.
        region: Dataproc region the job was submitted to.
        job_id: ID of the job to wait for.
        poll_interval: Seconds to sleep between status requests, so the loop
            does not hammer the API.

    Returns:
        The final ``jobs().get(...)`` response once the state is DONE.

    Raises:
        Exception: The job ended in ERROR (message is the reported details)
            or was CANCELLED.
    """
    import time  # local import: the module header only imports argparse/os

    print('Waiting for job to finish...')
    while True:
        result = dataproc.projects().regions().jobs().get(
            projectId=project,
            region=region,
            jobId=job_id).execute()
        status = result['status']
        state = status['state']
        if state == 'ERROR':
            raise Exception(
                status.get('details', 'Job {} failed'.format(job_id)))
        if state == 'CANCELLED':
            # A cancelled job never becomes DONE or ERROR; stop waiting.
            raise Exception(
                'Job {} was cancelled (state CANCELLED)'.format(job_id))
        if state == 'DONE':
            print('Job finished.')
            return result
        time.sleep(poll_interval)
# [END wait]


# [START get_client]
def get_client():
    """Build a Dataproc v1 API client through googleapiclient discovery.

    No credentials are passed explicitly here; the command-line entry point
    sets GOOGLE_APPLICATION_CREDENTIALS when --key_file is supplied.

    Returns:
        The client object produced by ``googleapiclient.discovery.build``.
    """
    return googleapiclient.discovery.build('dataproc', 'v1')
# [END get_client]


def main(project_id, zone, cluster_name, bucket_name,
         instance_type, master_nodes, worker_nodes,
         pyspark_file=None, create_new_cluster=True):
    """Run a PySpark script on Dataproc and return its driver output.

    Optionally creates the cluster first, uploads the script to Cloud
    Storage, submits it as a job, waits for completion and downloads the
    driver output. A cluster created by this call is deleted afterwards,
    whether or not the job succeeded.

    Args:
        project_id: GCP project ID.
        zone: Compute zone for a newly created cluster.
        cluster_name: Cluster to create and/or submit the job to.
        bucket_name: Bucket the script is uploaded to.
        instance_type: Machine type for a newly created cluster.
        master_nodes: Number of master nodes for a newly created cluster.
        worker_nodes: Number of worker nodes for a newly created cluster.
        pyspark_file: Path to the script; the default script is used when
            this is falsy.
        create_new_cluster: Create the cluster before submitting and delete
            it afterwards.

    Returns:
        The downloaded driver output of the job.
    """
    dataproc = get_client()
    region = 'global'
    # Track what was actually acquired so cleanup never touches a resource
    # that does not exist (which would mask the original exception).
    spark_file = None
    cluster_created = False
    try:
        if pyspark_file:
            spark_file, spark_filename = get_pyspark_file(pyspark_file)
        else:
            spark_file, spark_filename = get_default_pyspark_file()

        if create_new_cluster:
            create_cluster(
                dataproc, project_id, zone, region, cluster_name,
                instance_type, master_nodes, worker_nodes)
            # From here on the cluster must be torn down, even if waiting
            # for it to become RUNNING fails.
            cluster_created = True
            wait_for_cluster_creation(
                dataproc, project_id, region, cluster_name)

        upload_pyspark_file(
            project_id, bucket_name, spark_filename, spark_file)

        cluster_list = list_clusters_with_details(
            dataproc, project_id, region)['clusters']

        (cluster_id, output_bucket) = (
            get_cluster_id_by_name(cluster_list, cluster_name))

        # [START call_submit_pyspark_job]
        job_id = submit_pyspark_job(
            dataproc, project_id, region,
            cluster_name, bucket_name, spark_filename)
        # [END call_submit_pyspark_job]
        wait_for_job(dataproc, project_id, region, job_id)

        output = download_output(project_id, cluster_id, output_bucket, job_id)
        print('Received job output {}'.format(output))
        return output
    finally:
        try:
            if cluster_created:
                delete_cluster(dataproc, project_id, region, cluster_name)
        finally:
            # Close the script handle even if the cluster teardown raised.
            if spark_file is not None:
                spark_file.close()


if __name__ == '__main__':
    # Command-line entry point: parse the options, optionally point the
    # Google client libraries at a service-account key file, then run main().
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    parser.add_argument(
        '--project_id',
        help='Project ID you want to access.',
        required=True
    )
    parser.add_argument(
        '--zone',
        help='Zone to create clusters in/connect to.',
        required=True
    )
    parser.add_argument(
        '--cluster_name',
        help='Name of the cluster to create/connect to',
        required=True
    )
    parser.add_argument(
        '--gcs_bucket',
        help='Bucket to upload Pyspark file to',
        required=True
    )
    parser.add_argument(
        '--pyspark_file',
        help='Pyspark filename. Defaults to pyspark_sort.py'
    )
    parser.add_argument(
        '--create_new_cluster',
        action='store_true',
        help='States if the cluster should be created'
    )
    parser.add_argument(
        '--key_file',
        help='Location of your key file for service account'
    )
    parser.add_argument(
        '--instance_type',
        help='Instance types used for this cluster',
        default='n1-standard-8'
    )
    # type=int so values given on the command line reach the cluster request
    # as numbers (like the defaults) and non-numeric input is rejected early.
    parser.add_argument(
        '--master_nodes',
        help='Number of master nodes',
        type=int,
        default=1
    )
    parser.add_argument(
        '--worker_nodes',
        help='Number of worker nodes',
        type=int,
        default=2
    )

    args = parser.parse_args()

    if args.key_file is not None:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = args.key_file

    main(
        args.project_id, args.zone, args.cluster_name,
        args.gcs_bucket, args.instance_type, args.master_nodes, args.worker_nodes,
        args.pyspark_file, args.create_new_cluster)

Writing submit_job_to_cluster.py


In [3]:
# Make the generated script executable for all users. The submission cell
# runs it through `python3`, so this only matters when invoking it directly.
!chmod a+x submit_job_to_cluster.py

### Example: How to create and run a Spark job on a cluster using GCP
Run the cell below to create a file called `LR.py` in the current directory. Then run the bash cell to submit this job to GCP & spin up a cluster. (__`Note:`__ _make sure you have all the global variables set up first, including the name of the Spark job if you change it._)

In [12]:
%%writefile LR.py
#!/usr/bin/env python
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Sample pyspark script to be uploaded to Cloud Storage and run on
Cloud Dataproc.

Note this file is not intended to be run directly, but run inside a PySpark
environment.
"""

# [START pyspark]
# imports
import re
import numpy as np
import pandas as pd
from os import path
import time

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import (FeatureHasher, Imputer, SQLTransformer,
                                StandardScaler, VectorAssembler)
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

# create Spark Session
# No .master(...) call: a master set in code takes precedence over the one
# supplied by the cluster's job submission, and "local[*]" would keep the
# whole job on the driver node instead of using the cluster's workers.
app_name = "final_project"
spark = SparkSession\
        .builder\
        .appName(app_name)\
        .getOrCreate()
sc = spark.sparkContext

# Load Parquet file into a dataframe
df = spark.read.parquet('gs://w261_sj_data/data/train')

# Define categorical and numerical columns.
# "_1" is the label; "_2".."_14" are treated as numerical features and
# "_15".."_40" as categorical features.
categoricalCols = []
numericalCols = []
numericalColsImputed = []
numericalColsLog = []
for c in range(2, 41):
    col = "_" + str(c)
    colImp = str(c) + "_imp"
    colLog = str(c) + "_log"
    if (c < 15):
        numericalCols.append(col)
        numericalColsImputed.append(colImp)
        numericalColsLog.append(colLog)
    else:
        categoricalCols.append(col)

# Cast numerical columns to double and turn negative values into nulls so
# the Imputer fills them in (and log(x + 1) below stays defined).
for col in numericalCols:
    as_double = df[col].cast("double")
    df = df.withColumn(col, when(as_double < 0, None).otherwise(as_double))
# Label column: cast via int and then double, as the estimator expects a
# numeric label.
df = df.withColumn("_1", df["_1"].cast("int").cast("double"))

stages = []  # stages in our Pipeline

# Impute missing numerical values. The imputer is fitted once, as part of
# the pipeline fit below.
imputer = Imputer(inputCols=numericalCols, outputCols=numericalColsImputed)
stages += [imputer]

# Compute log transforms: one "log(<n>_imp+1) AS <n>_log" expression per
# numerical column, generated from the column lists defined above.
logExprs = ", ".join(
    "log({}+1) AS {}".format(impCol, logCol)
    for impCol, logCol in zip(numericalColsImputed, numericalColsLog))
sqlTrans = SQLTransformer(
    statement="SELECT *, {} FROM __THIS__".format(logExprs))
stages += [sqlTrans]

numericalAssembler = VectorAssembler(inputCols=numericalColsLog, outputCol="log_numerical_feature_vec")
stages += [numericalAssembler]

# Now normalize these numerical features
scaler = StandardScaler(inputCol="log_numerical_feature_vec", outputCol="scaled_features")
stages += [scaler]

# Next transform Categorical Features with FeatureHasher
hasher = FeatureHasher(numFeatures=256, inputCols=categoricalCols, outputCol="categorical_features")
stages += [hasher]

# Now create vector with numerical and categorical features
finalAssembler = VectorAssembler(inputCols=["scaled_features", "categorical_features"], outputCol="features")
stages += [finalAssembler]

start = time.time()
# Finally create a pipeline and verify
partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)
print(f"\n...Pipelines Completed in {time.time() - start} seconds")

# Keep relevant columns
selectedcols = ["_1", "features"]
dataset = preppedDataDF.select(selectedcols)

# Persist the engineered features. Overwrite mode lets the job be re-run;
# the default mode raises once the output path exists.
dataset.repartition(16).write.mode("overwrite").parquet("gs://w261_sj_data/data/feature_set")

start = time.time()
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())
print(f"\n...Split Completed in {time.time() - start} seconds")

# Run logistic regression

# Create initial LogisticRegression model
lr = LogisticRegression(labelCol="_1", featuresCol="features", maxIter=10)

start = time.time()

# Train model with Training Data
lrModel = lr.fit(trainingData)

predictions = lrModel.transform(testData)
print(f"\n...Initial Model Completed in {time.time() - start} seconds")

# Grid-search the regularisation settings with 5-fold cross validation.
paramGrid = ParamGridBuilder() \
    .addGrid(lr.elasticNetParam, [0.0, 1.0]) \
    .addGrid(lr.regParam, [0.1, 0.05, 0.01]) \
    .build()

# One evaluator serves both model selection and the final test-set score.
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="_1")

pipeline = Pipeline(stages=[lr])
crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)  # use 3+ folds in practice
start = time.time()
cvModel = crossval.fit(trainingData)
cvprediction = cvModel.transform(testData)
# Report the score: as a submitted script, an unprinted value is lost.
cvMetric = evaluator.evaluate(cvprediction)
print(f"\n...Cross-validated model test {evaluator.getMetricName()}: {cvMetric}")
print(f"\n... Completed in {time.time() - start} seconds")


# [End pyspark]

Overwriting LR.py


In [13]:
# Submit LR.py to Dataproc. {PROJECT_ID}, {CLUSTER_NAME} and {BUCKET_NAME}
# are filled in by IPython from the notebook's global variables, so the
# values only need to be edited in the setup cell.
!python3 submit_job_to_cluster.py \
    --project_id='{PROJECT_ID}' \
    --zone=us-central1-b \
    --cluster_name='{CLUSTER_NAME}' \
    --gcs_bucket='{BUCKET_NAME}' \
    --key_file=$HOME/w261.json \
    --create_new_cluster \
    --pyspark_file=LR.py

Creating cluster...
Waiting for cluster creation...
Cluster created.
Uploading pyspark file to GCS
w261-sj - RUNNING
Submitted job ID 805c03f5-13b6-46c0-ac0a-e513f4fa719c
Waiting for job to finish...
Job finished.
Downloading output file
Received job output b"Ivy Default Cache set to: /root/.ivy2/cache\nThe jars for the packages stored in: /root/.ivy2/jars\n:: loading settings :: url = jar:file:/usr/lib/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\ncom.databricks#spark-xml_2.11 added as a dependency\ngraphframes#graphframes added as a dependency\ncom.databricks#spark-avro_2.11 added as a dependency\n:: resolving dependencies :: org.apache.spark#spark-submit-parent-3a1f70c4-cebd-4918-a3d9-b93d90710d12;1.0\n\tconfs: [default]\n\tfound com.databricks#spark-xml_2.11;0.4.1 in central\n\tfound graphframes#graphframes;0.5.0-spark2.1-s_2.11 in spark-packages\n\tfound com.typesafe.scala-logging#scala-logging-api_2.11;2.1.2 in central\n\tfound com.typesafe.scala-logging

In [6]:
# global variables
# Same names and values as the setup cell at the top of the notebook,
# declared again here.
PROJECT_ID = 'w262-245821' # fill in your GCP project id
BUCKET_NAME = 'w261_sj_data' # fill in the name of your GCP bucket
CLUSTER_NAME = 'w261-sj' # choose a cluster name, this should include only a-z, 0-9 & start with a letter
# NOTE(review): absolute, user-specific path -- adjust it per machine.
HOME = '/Users/sid'