# Getting data from GCP Cloud Storage and BigQuery

In [1]:
from IPython.display import display, Markdown

# Render the setup instructions stored in setup.md (path is relative to the
# kernel's working directory) as formatted Markdown in the cell output.
# The encoding is passed explicitly so the file is decoded the same way on
# every platform instead of depending on the locale's default encoding
# (assumes setup.md is saved as UTF-8).
with open('setup.md', 'r', encoding='utf-8') as fh:
    content = fh.read()
display(Markdown(content))

Define the following environment variables before starting Jupyter Lab:  
`export DIR_PROJ=your_path_git_repository`  
`export PYTHONPATH=$DIR_PROJ`  
`export PATH_TENSORBOARD=your_path_tensorboard`  
`export PATH_DATASETS=your_path_datasets`  
`export PROJECT_ID=your_gcp_project_id`  
`export BUCKET_NAME=your_gcp_gs_bucket_name`  
`export REGION=your_region`  
  
Start Jupyter Lab:  
`jupyter lab`  
  
Choose the proper Anaconda python environment:  
`Python [conda env:env_tensorflow]`  
    

## Import packages

In [2]:
from google.cloud import storage
import os
import pandas as pd

In [3]:
# Display the installed pandas version (the last expression of a cell is
# echoed as the cell's output).
pd.__version__

'0.25.1'

## Getting data from Cloud Storage

In [4]:
# Create a Cloud Storage client for the GCP project named by the PROJECT_ID
# environment variable (a missing variable raises KeyError).
project_id = os.environ['PROJECT_ID']
client = storage.Client(project=project_id)



In [5]:
# Walk over the buckets returned by the storage client and print only the
# first ten characters of each bucket name, followed by '...'.
for bucket in client.list_buckets():
    print(bucket.name[:10], '...')

amld_2020 ...
artifacts. ...
dataflow-s ...
dataflow-s ...
dataflow-s ...
nlp-text-c ...
nlp-text-c ...
nlp-text-c ...
staging.nl ...
us.artifac ...


## Getting data from BigQuery  
https://googleapis.github.io/google-cloud-python/latest/bigquery/usage/pandas.html  
https://googleapis.github.io/google-cloud-python/latest/bigquery/index.html  

In [6]:
from google.cloud import bigquery

# Create a BigQuery client for the GCP project named by PROJECT_ID.
# NOTE: this rebinds `client`, which until this cell referred to the Cloud
# Storage client; from here on `client` is the BigQuery client.
project_id = os.environ['PROJECT_ID']
client = bigquery.Client(project=project_id)



In [7]:
# Standard SQL run against the public Stack Overflow questions table.
# For each creation year it returns the number of questions and the
# percentage of them with at least one answer (rounded to one decimal),
# keeping only the years 2009 through 2015, ordered by year.
query = """SELECT
  EXTRACT(YEAR FROM creation_date) AS Year,
  COUNT(*) AS Number_of_Questions,
  ROUND(100 * SUM(IF(answer_count > 0, 1, 0)) / COUNT(*), 1) AS Percent_Questions_with_Answers
FROM
  `bigquery-public-data.stackoverflow.posts_questions`
GROUP BY
  Year
HAVING
  Year > 2008 AND Year < 2016
ORDER BY
  Year
"""

In [8]:
# Submit the query and fetch the raw result rows.
query_job = client.query(query)
rows = query_job.result()  # Blocks until the query has finished

# Print every result row. The loop variable `row` still holds the last row
# after the loop ends and is read again by the following cell.
for row in rows:
    print(row)

Row((2009, 342861, 99.6), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2010, 692888, 99.0), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2011, 1197767, 97.2), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2012, 1641742, 94.6), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2013, 2054622, 91.7), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2014, 2157464, 88.6), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})
Row((2015, 2211624, 86.5), {'Year': 0, 'Number_of_Questions': 1, 'Percent_Questions_with_Answers': 2})


In [9]:
# Inspect `row`, the last result row left over from the loop in the previous
# cell: its column names and the matching values.
row_keys = tuple(row.keys())
row_values = row.values()
print("Keys: ", row_keys)
print("Values: ", row_values)

Keys:  ('Year', 'Number_of_Questions', 'Percent_Questions_with_Answers')
Values:  (2015, 2211624, 86.5)


In [10]:
# Run the same query again and load its result into a pandas DataFrame,
# then show the frame as the cell output.
df_job = client.query(query)
df = df_job.to_dataframe()
df

Unnamed: 0,Year,Number_of_Questions,Percent_Questions_with_Answers
0,2009,342861,99.6
1,2010,692888,99.0
2,2011,1197767,97.2
3,2012,1641742,94.6
4,2013,2054622,91.7
5,2014,2157464,88.6
6,2015,2211624,86.5
