# PySpark notebook
## Run from JupyterLab, opened through the Google Cloud Dataproc cluster's web interface.

<img src="dataproc_jupyterLab_interface.png" width="800" />

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, ArrayType, FloatType, TimestampType

In [None]:
# Create the Spark session for this notebook, or reuse one that already exists
spark = (
    SparkSession.builder
    .appName("JsonToBigQuery")
    .getOrCreate()
)

In [4]:
# Display the session object to confirm it was created.
spark

In [5]:
# Define the JSON schema bottom-up: each nested object gets its own named
# schema, and the top-level document is assembled from them at the end.

# Schema of the top-level `parameters` object
parameters_schema = StructType([
    StructField("dataset", StringType()),
    StructField("rows", IntegerType()),
    StructField("start", IntegerType()),
    StructField("format", StringType()),
    StructField("timezone", StringType()),
])

# Schema of the `fields` object carried by each record
record_fields_schema = StructType([
    StructField("nbvelosdispo", IntegerType()),
    StructField("nbplacesdispo", IntegerType()),
    StructField("libelle", StringType()),
    StructField("adresse", StringType()),
    StructField("nom", StringType()),
    StructField("etat", StringType()),
    StructField("commune", StringType()),
    StructField("etatconnexion", StringType()),
    StructField("type", StringType()),
    StructField("geo", ArrayType(FloatType())),
    StructField("localisation", ArrayType(FloatType())),
    StructField("datemiseajour", TimestampType()),
])

# Schema of the `geometry` object carried by each record
geometry_schema = StructType([
    StructField("type", StringType()),
    StructField("coordinates", ArrayType(FloatType())),
])

# Schema of one element of the `records` array
record_schema = StructType([
    StructField("datasetid", StringType()),
    StructField("recordid", StringType()),
    StructField("fields", record_fields_schema),
    StructField("geometry", geometry_schema),
    StructField("record_timestamp", TimestampType()),
])

# Schema of a whole JSON document
json_schema = StructType([
    StructField("nhits", IntegerType()),
    StructField("parameters", parameters_schema),
    StructField("records", ArrayType(record_schema)),
])

In [6]:
import subprocess

def list_files_in_bucket(bucket_name):
    """List the entries of a Cloud Storage bucket using ``gsutil ls``.

    Runs ``gsutil ls gs://<bucket_name>`` and returns the lines it prints.

    Args:
        bucket_name: Name of the bucket, without the ``gs://`` prefix.

    Returns:
        A list with one string per non-empty line of the ``gsutil ls``
        output. An empty list is returned when the listing is empty, when
        the command exits with a non-zero status, or when ``gsutil`` cannot
        be launched; in the two failure cases the error is printed.
    """
    # Pass the command as an argument list (no shell) so that a bucket name
    # containing shell metacharacters cannot inject extra commands.
    command = ["gsutil", "ls", f"gs://{bucket_name}"]
    try:
        # Run the command and capture its standard output as bytes
        output = subprocess.check_output(command)
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: gsutil returned a non-zero exit status.
        # OSError: gsutil could not be started (e.g. not installed). Under
        # a shell this surfaced as a non-zero exit status, so handle it the
        # same way to keep the "return [] on failure" contract.
        print(f"Error: {e}")
        return []

    # Decode the bytes and split into lines. Empty lines are dropped because
    # "".split("\n") returns [""], which would otherwise make an empty
    # listing look like one file with an empty path.
    listing = output.decode("utf-8").strip()
    return [line for line in listing.split("\n") if line]

In [10]:
# Bucket that holds the JSON files to load
bucket_name = "vlille_data_json"

# Collect the path of every file listed in the bucket
file_paths = list_files_in_bucket(bucket_name)

# Report the file count and the row count expected in BigQuery.
# NOTE(review): 289 is presumably the number of records per file - confirm.
rows_per_file = 289
n_files = len(file_paths)
print(n_files, 'files, to insert', n_files * rows_per_file, 'rows in bigquery')

91549 files, to insert 26457661 rows in bigquery


In [11]:
# Load every listed JSON file into one DataFrame, applying the explicit
# schema instead of letting Spark infer it
json_data = (
    spark.read
    .schema(json_schema)
    .json(file_paths)
)

                                                                                

In [12]:
# Show the DataFrame's column names and types.
json_data

DataFrame[nhits: int, parameters: struct<dataset:string,rows:int,start:int,format:string,timezone:string>, records: array<struct<datasetid:string,recordid:string,fields:struct<nbvelosdispo:int,nbplacesdispo:int,libelle:string,adresse:string,nom:string,etat:string,commune:string,etatconnexion:string,type:string,geo:array<float>,localisation:array<float>,datemiseajour:timestamp>,geometry:struct<type:string,coordinates:array<float>>,record_timestamp:timestamp>>]

In [13]:
# Flatten the nested JSON structure.
# `records` is an array of structs, so selecting `records.fields.<name>`
# directly would produce array-valued columns and a single row per input
# file. Exploding the array first turns every element of `records` into its
# own row, from which scalar columns can then be selected.
exploded_records = json_data.select(explode(col("records")).alias("record"))

flattened_data = exploded_records.select(
    col("record.fields.nbvelosdispo").alias("nb_available_bikes"),
    col("record.fields.nbplacesdispo").alias("nb_available_places"),
    col("record.fields.libelle").alias("station_id"),
    col("record.fields.etat").alias("operational_state"),
    col("record.fields.etatconnexion").alias("connexion"),
    col("record.fields.datemiseajour").alias("datemiseajour"),
    col("record.record_timestamp").alias("record_timestamp"),
)

# Write data to BigQuery.
# "overwrite" replaces any existing content of the destination table.
# NOTE(review): `temporaryGcsBucket` is presumably the bucket the connector
# stages data in before loading it - confirm against the connector docs.
(
    flattened_data.write
    .format("bigquery")
    .mode("overwrite")
    .option("temporaryGcsBucket", "dataproc_test_yzpt")
    .option("parentProject", "zapart-data-vlille")
    .option("table", "zapart-data-vlille.vlille_dataset.dataproc_test")
    .save()
)

                                                                                

In [None]:
# Duration: about 20 min, i.e. roughly 22k rows/s

In [None]:
# Stop the Spark session once the work above is done.
spark.stop()