Prerequisites:
- A GCP project with a linked billing account
- Cloud Shell open in that project
- pip installed (you can install it in Cloud Shell as follows):
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
sudo apt-get install python3 python3-setuptools
sudo python3 get-pip.py
export project="your-gcp-project-id" #change this to your project
export region="your-gcp-region" #for example us-central1
export bq_dataset="dataflow_example"
gsutil mb -p $project -c regional -l $region -b on gs://$project-df-template/
gsutil mb -p $project -c regional -l $region -b on gs://$project-df-files/
bq mk --location=us --dataset $project:$bq_dataset #the BigQuery dataset is created in the US multi-region
git clone https://github.com/thomas-vl/dataflow-gcs-cf.git
sudo pip3 install 'apache-beam[gcp]' #quotes keep the shell from expanding the brackets
cd ~/dataflow-gcs-cf/dataflow
python3 -m main --output $project:$bq_dataset.example --runner DataflowRunner --project $project \
--staging_location gs://$project-df-template/staging --temp_location gs://$project-df-template/temp \
--template_location gs://$project-df-template/templates/df-bq
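This stages the pipeline defined in dataflow/main.py as a reusable template in the template bucket. The real code lives in the cloned repo; as a rough idea of what such a pipeline can look like, here is a minimal sketch of a templated CSV-to-BigQuery job. The column names, the `--input` value provider, and the BigQuery schema are illustrative assumptions, not the repo's actual implementation:

```python
# Sketch of a templated CSV-to-BigQuery pipeline; not the repo's actual main.py.
import csv

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


class TemplateOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # --input is a value provider so it can be supplied when the template is launched
        parser.add_value_provider_argument('--input', type=str,
                                           help='GCS path of the CSV file to load')
        parser.add_argument('--output', type=str,
                            help='Target BigQuery table as project:dataset.table')


def parse_line(line):
    # Assumed columns -- the real parsing depends on how the repo handles titanic.csv
    name, age, survived = next(csv.reader([line]))
    return {'name': name, 'age': age, 'survived': survived}


def run():
    options = TemplateOptions()
    with beam.Pipeline(options=options) as p:
        (p
         | 'ReadCSV' >> beam.io.ReadFromText(options.input, skip_header_lines=1)
         | 'Parse' >> beam.Map(parse_line)
         | 'WriteToBQ' >> beam.io.WriteToBigQuery(
               options.output,
               schema='name:STRING,age:STRING,survived:STRING',
               create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
               write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))


if __name__ == '__main__':
    run()
```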
Validate that the template file was created:
gsutil ls gs://$project-df-template/templates/
gcloud services enable dataflow.googleapis.com --project $project
cd ~/dataflow-gcs-cf/cloud-functions
gcloud functions deploy start_dataflow --runtime python37 \
--trigger-resource $project-df-files --trigger-event google.storage.object.finalize \
--project $project --region $region
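The start_dataflow function fires on every object finalized in the files bucket and launches the staged template against the new file. The actual implementation is in dataflow-gcs-cf/cloud-functions; the sketch below only shows the general shape, and the project lookup, job name, and the "input" template parameter are assumptions:

```python
# Sketch of a GCS-triggered function that launches the staged Dataflow template.
# The real code is in dataflow-gcs-cf/cloud-functions; names and parameters
# below are illustrative assumptions.
import os

from googleapiclient.discovery import build


def start_dataflow(data, context):
    """Background function triggered by google.storage.object.finalize."""
    project = os.environ.get('GCP_PROJECT')          # assumption: project taken from the runtime env
    template = 'gs://{}-df-template/templates/df-bq'.format(project)
    input_file = 'gs://{}/{}'.format(data['bucket'], data['name'])

    dataflow = build('dataflow', 'v1b3', cache_discovery=False)
    dataflow.projects().templates().launch(
        projectId=project,
        gcsPath=template,
        body={
            'jobName': 'gcs-to-bq-' + data['name'].replace('.', '-').lower(),
            'parameters': {'input': input_file},     # assumption: the template exposes an --input parameter
        },
    ).execute()
```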
## Upload the file
cd ~/dataflow-gcs-cf/
gsutil cp titanic.csv gs://$project-df-files
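Once the upload triggers the function, a Dataflow job should appear and, after it finishes, the rows land in the BigQuery table the template writes to. You can check both from Cloud Shell (the job name and region depend on how the function launches the template):
gcloud dataflow jobs list --project $project
bq query --use_legacy_sql=false "SELECT COUNT(*) FROM \`$project.$bq_dataset.example\`"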