## Reading/Writing data to Google Storage

- Authenticate to Google
- Install gcsfs and pyspark
- Install gcsfuse and mount the bucket as a file system
- Read data from bucket
- Write data to bucket


# Authenticate to Google

In [None]:
# Authenticate the notebook user with Google through Colab's auth helper.
from google.colab import auth
auth.authenticate_user()

# Select the project the gcloud CLI should operate on.
# IPython substitutes {project_id} with the Python variable before the
# shell command runs.
project_id = 'data-eng-dev-437916'
!gcloud config set project {project_id}

# Install gcsfs and pyspark

In [None]:
!pip install gcsfs
!pip install pyspark

# Install gcsfuse

In [None]:
# Register the apt repository that hosts the gcsfuse package.
# NOTE(review): "gcsfuse-bionic" pins the Ubuntu 18.04 repository codename —
# confirm it matches the release of the runtime image.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# Download the repository signing key and add it to apt's trusted keys.
# NOTE(review): apt-key is deprecated on newer Debian/Ubuntu releases —
# verify it still works on the runtime image.
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# Refresh the package index, then install gcsfuse (-qq reduces apt output).
!apt -qq update
!apt -qq install gcsfuse

# Create local folder and mount the bucket as a file system

In [None]:
!mkdir edit-data-eng-dev
!gcsfuse edit-data-eng-dev edit-data-eng-dev

# Create Spark Session

In [None]:
import os

from pyspark.sql import SparkSession

# Spark launches its JVM when the session is created, so JAVA_HOME has to be
# in the environment *before* getOrCreate() runs; setting it afterwards has no
# effect on the session that is already running.
# The directory check avoids pointing Spark at a JDK that is not installed.
java_home = "/usr/lib/jvm/java-11-openjdk-amd64"
if os.path.isdir(java_home):
    os.environ["JAVA_HOME"] = java_home

# The settings below look like the ones needed for direct gs:// access through
# the Hadoop GCS connector (presumably together with the connector jar, which
# is not configured here). They stay disabled because this notebook reads the
# bucket through the gcsfuse mount instead.
# .config("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
# .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem") \
# .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS") \

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("ColabGCS")
    .getOrCreate()
)

In [None]:

# Components of the table location inside the bucket.
bucket_name = "edit-data-eng-dev"
lake_path = "datalake/bronze"
table_path = "basic_pays"

# Object-store URI of the table, assembled from the components above.
final_path = "gs://" + "/".join([bucket_name, lake_path, table_path])

# Because the bucket is mounted as a file system, the table is addressed
# through the mount point:
#   "/content/edit-data-eng-dev/datalake/bronze/basic_pays"
# rather than through the URI:
#   "gs://edit-data-eng-dev/datalake/bronze/basic_pays"


# Read data from the bucket

In [None]:
# Load the Parquet table through the gcsfuse mount (a local path, not gs://)
# and print a preview of its rows.
df = (
    spark.read
    .format("parquet")
    .load("/content/edit-data-eng-dev/datalake/bronze/basic_pays")
)
df.show()

# Write data to the bucket

In [None]:
# Write the DataFrame back to the bucket as Parquet, again through the mount.
# NOTE: no save mode is given, so Spark's default applies — per the
# DataFrameWriter docs that is "errorifexists", so re-running this cell is
# expected to fail once the target folder exists.
df.write.parquet("/content/edit-data-eng-dev/datalake/bronze/basic_pays_new")

# Copying data with gsutil

In [None]:
# Copy the objects matched by datalake/bronze/basic_pays/* to datalake/bronze3/
# in the same bucket. This uses the gsutil CLI with gs:// URIs, not the
# gcsfuse mount.
!gsutil cp gs://edit-data-eng-dev/datalake/bronze/basic_pays/* gs://edit-data-eng-dev/datalake/bronze3/