In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from google.cloud import bigquery

# **BigQuery**

In [2]:
# Shared BigQuery client; later cells use it for list_rows() and query().
client = bigquery.Client()

# Version of the imported bigquery package (shown as this cell's output).
bigquery.__version__

Using Kaggle's public dataset BigQuery integration.


'2.2.0'

In [3]:
def about_dataset(dataset_id, project_id="bigquery-public-data", client=None):
    """
    Print the description of a BigQuery dataset.

    Parameters
    ----------
    dataset_id : str
        ID of the dataset, e.g. ``"openaq"``.
    project_id : str, optional
        Project that owns the dataset (default ``"bigquery-public-data"``).
    client : bigquery.Client, optional
        Client to use for the request. When omitted, a new client is
        created for this call, as before.

    Returns
    -------
    None
        The description is written to stdout.
    """
    if client is None:
        client = bigquery.Client()
    # Build the reference directly: Client.dataset() is deprecated in favour
    # of the DatasetReference constructor.
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    # API request - fetch the dataset.
    dataset = client.get_dataset(dataset_ref)
    print(dataset.description)
    

def list_table_id(dataset_id, project_id="bigquery-public-data", client=None):
    """
    Print the ID of every table in a BigQuery dataset.

    Parameters
    ----------
    dataset_id : str
        ID of the dataset, e.g. ``"openaq"``.
    project_id : str, optional
        Project that owns the dataset (default ``"bigquery-public-data"``).
    client : bigquery.Client, optional
        Client to use for the requests. When omitted, a new client is
        created for this call, as before.

    Returns
    -------
    None
        A header line followed by one line per table is written to stdout.
    """
    if client is None:
        client = bigquery.Client()
    # Build the reference directly: Client.dataset() is deprecated in favour
    # of the DatasetReference constructor.
    dataset_ref = bigquery.DatasetReference(project_id, dataset_id)
    # API request - fetch the dataset.
    dataset = client.get_dataset(dataset_ref)
    tables = client.list_tables(dataset)
    print(f"table_name_id in {dataset_id} dataset\n")
    for table in tables:
        print(f"table_name_id : {table.table_id}")

def about_table(table_id, dataset_id, project_id="bigquery-public-data", client=None):
    """
    Print the description of a table in a BigQuery dataset.

    Parameters
    ----------
    table_id : str
        ID of the table, e.g. ``"global_air_quality"``.
    dataset_id : str
        ID of the dataset that contains the table.
    project_id : str, optional
        Project that owns the dataset (default ``"bigquery-public-data"``).
    client : bigquery.Client, optional
        Client to use for the request. When omitted, a new client is
        created for this call, as before.

    Returns
    -------
    None
        The description is written to stdout.
    """
    if client is None:
        client = bigquery.Client()
    # The table reference is built locally from the IDs; the dataset itself
    # is never used here, so no separate get_dataset() request is made.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)
    # API request - fetch the table.
    table = client.get_table(table_ref)
    print(table.description)

def about_table_column(table_id, dataset_id, project_id="bigquery-public-data", client=None):
    """
    Return the schema (column definitions) of a table in a BigQuery dataset.

    Each `SchemaField` in the result describes one column (field) of the
    table: its name, type, mode and description.

    Parameters
    ----------
    table_id : str
        ID of the table, e.g. ``"global_air_quality"``.
    dataset_id : str
        ID of the dataset that contains the table.
    project_id : str, optional
        Project that owns the dataset (default ``"bigquery-public-data"``).
    client : bigquery.Client, optional
        Client to use for the request. When omitted, a new client is
        created for this call, as before.

    Returns
    -------
    The ``schema`` attribute of the fetched table (displayed in this
    notebook as a list of `SchemaField` objects).
    """
    if client is None:
        client = bigquery.Client()
    # The table reference is built locally from the IDs; the dataset itself
    # is never used here, so no separate get_dataset() request is made.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)
    # API request - fetch the table.
    table = client.get_table(table_ref)
    return table.schema
    

def fetch_table(table_id, dataset_id, project_id="bigquery-public-data", client=None):
    """
    Fetch a table of a BigQuery dataset.

    Parameters
    ----------
    table_id : str
        ID of the table, e.g. ``"global_air_quality"``.
    dataset_id : str
        ID of the dataset that contains the table.
    project_id : str, optional
        Project that owns the dataset (default ``"bigquery-public-data"``).
    client : bigquery.Client, optional
        Client to use for the request. When omitted, a new client is
        created for this call, as before.

    Returns
    -------
    The table object returned by ``client.get_table``; it can be passed to
    ``client.list_rows`` to read rows.
    """
    if client is None:
        client = bigquery.Client()
    # The table reference is built locally from the IDs; the dataset itself
    # is never used here, so no separate get_dataset() request is made.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)
    # API request - fetch the table.
    return client.get_table(table_ref)

In [4]:
# Print the dataset-level description of `openaq` from the default public-data project.
about_dataset("openaq")

Using Kaggle's public dataset BigQuery integration.
OpenAQ is an open-source project to surface live, real-time air quality data from around the world. Their “mission is to enable previously impossible science, impact policy and empower the public to fight air pollution.” The data includes air quality measurements from 5490 locations in 47 countries.

Scientists, researchers, developers, and citizens can use this data to understand the quality of air near them currently. The dataset only includes the most current measurement available for the location (no historical data). 

Dataset Source: openaq.org

Category: Science

Use: This dataset is publicly available for anyone to use under the following terms provided by the Dataset Source — https://openaq.org/#/about?_k=s3aspo — and is provided "AS IS" without any warranty, express or implied, from Google. Google disclaims all liability for any damages, direct or indirect, resulting from the use of the dataset. 

Update Frequency: Hourly


In [5]:
# Print the IDs of the tables available in the `openaq` dataset.
list_table_id("openaq")

Using Kaggle's public dataset BigQuery integration.
table_name_id in openaq dataset

table_name_id : global_air_quality


In [6]:
# Print the table-level description; the recorded run printed `None` for this table.
about_table("global_air_quality","openaq")

Using Kaggle's public dataset BigQuery integration.
None


In [7]:
# Display the table schema: one SchemaField (name, type, mode, ...) per column.
about_table_column("global_air_quality","openaq")

Using Kaggle's public dataset BigQuery integration.


[SchemaField('location', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('city', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('country', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('pollutant', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('value', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('timestamp', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('unit', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('source_name', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('latitude', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('longitude', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('averaged_over_in_hours', 'FLOAT', 'NULLABLE', None, (), None),
 SchemaField('location_geom', 'GEOGRAPHY', 'NULLABLE', None, (), None)]

In [8]:
# Table object returned by fetch_table(); rows are read separately via list_rows().
table = fetch_table("global_air_quality","openaq")

# first 500 rows of the "full" table (max_results caps how many rows are read):
df = client.list_rows(table,max_results=500).to_dataframe()

Using Kaggle's public dataset BigQuery integration.


In [9]:
# Preview the first rows of the 500-row sample loaded above.
df.head()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Borówiec, ul. Drapałka",Borówiec,PL,bc,0.85217,2022-04-28 07:00:00+00:00,µg/m³,GIOS,1.0,52.276794,17.074114,POINT(52.276794 1)
1,"Kraków, ul. Bulwarowa",Kraków,PL,bc,0.91284,2022-04-27 23:00:00+00:00,µg/m³,GIOS,1.0,50.069308,20.053492,POINT(50.069308 1)
2,"Płock, ul. Reja",Płock,PL,bc,1.41,2022-03-30 04:00:00+00:00,µg/m³,GIOS,1.0,52.550938,19.709791,POINT(52.550938 1)
3,"Elbląg, ul. Bażyńskiego",Elbląg,PL,bc,0.33607,2022-05-03 13:00:00+00:00,µg/m³,GIOS,1.0,54.167847,19.410942,POINT(54.167847 1)
4,"Piastów, ul. Pułaskiego",Piastów,PL,bc,0.51,2022-05-11 05:00:00+00:00,µg/m³,GIOS,1.0,52.191728,20.837489,POINT(52.191728 1)


In [10]:
# Read just 5 rows, restricted to the first schema field (`location`) via selected_fields.
client.list_rows(table, max_results=5,
                 selected_fields=table.schema[:1]).to_dataframe()

Unnamed: 0,location
0,"Borówiec, ul. Drapałka"
1,"Kraków, ul. Bulwarowa"
2,"Płock, ul. Reja"
3,"Elbląg, ul. Bażyńskiego"
4,"Piastów, ul. Pułaskiego"


In [11]:
# Number of sampled rows per city, most frequent first.
df["city"].value_counts()

Kraków                  21
Łódź                    18
Rzeszów                 17
Płock                   16
Zielonka                14
                        ..
Czerwionka-Leszczyny     3
Złoty Potok              3
Nakło nad Notecią        3
Kalisz                   3
Kościerzyna              3
Name: city, Length: 64, dtype: int64

# **SQL (Structured Query Language)**
### **1. Select, From & Where**

In [12]:
# Select the `city` column of every row whose country code is 'US'.
query = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'US'
        """

# Submit the query, then load the result into a DataFrame.
query_job = client.query(query)
us_cities = query_job.to_dataframe()

In [13]:
# Preview the first rows; city names repeat because the query has no DISTINCT.
us_cities.head()

Unnamed: 0,city
0,HOWARD
1,HOWARD
2,HOWARD
3,HOWARD
4,HOWARD


In [14]:
# Same single-column query as for the US, filtered to country code 'IN'.
query_1 = """
        SELECT city
        FROM `bigquery-public-data.openaq.global_air_quality`
        WHERE country = 'IN'
        """
# Submit the query, then load the result into a DataFrame.
query_1_job = client.query(query_1)
india_city = query_1_job.to_dataframe()

In [15]:
# Preview the first rows of the Indian city query result.
india_city.head()

Unnamed: 0,city
0,Tirupati
1,Indore
2,Bilaspur
3,Visakhapatnam
4,Darbhanga


In [16]:
# Rows per Indian city, five most frequent.
# DataFrame.value_counts() already returns counts in descending order,
# so no additional sort_values() pass is needed.
india_city.value_counts().head()

city     
Delhi        78586
Mumbai       42833
Bengaluru    19668
Chennai      19130
Ahmedabad    18897
dtype: int64

In [17]:
# Select several columns (city, country, value) for rows with country code "IN".
query_2 = """ 
          SELECT city, country, value
          FROM `bigquery-public-data.openaq.global_air_quality`
          WHERE country = "IN"
          
          """
# Submit the query, then load the result into a DataFrame.
query_2_job = client.query(query_2)
query_2_df = query_2_job.to_dataframe()

In [18]:
# Preview the first rows of the three-column result.
query_2_df.head()

Unnamed: 0,city,country,value
0,Ambala,IN,490.0
1,Srinagar,IN,0.0
2,Davanagere,IN,140.0
3,Purnia,IN,1180.0
4,Hisar,IN,710.0


In [19]:
# Select every column (*) for rows with country code "IN".
# NOTE(review): the query has no LIMIT, so to_dataframe() loads every matching
# row; confirm that is intended if only a preview is needed.
query_3 = """
          SELECT *
          FROM `bigquery-public-data.openaq.global_air_quality`
          WHERE country = "IN"
          """
# Submit the query, then load the result into a DataFrame.
query_3_job = client.query(query_3)
query_3_df = query_3_job.to_dataframe()

In [20]:
# Preview the first rows of the all-columns result.
query_3_df.head()

Unnamed: 0,location,city,country,pollutant,value,timestamp,unit,source_name,latitude,longitude,averaged_over_in_hours,location_geom
0,"Patti Mehar, Ambala - HSPCB",Ambala,IN,co,490.0,2022-05-15 01:45:00+00:00,µg/m³,caaqm,0.25,30.379589,76.778328,POINT(30.379589 0.25)
1,"Rajbagh, Srinagar - JKSPCB",Srinagar,IN,co,0.0,2022-05-04 15:15:00+00:00,µg/m³,caaqm,0.25,34.066206,74.81982,POINT(34.066206 0.25)
2,"Devaraj Urs Badavane, Davanagere - KSPCB",Davanagere,IN,co,140.0,2022-05-13 08:00:00+00:00,µg/m³,caaqm,0.25,14.4758,75.9052,POINT(14.4758 0.25)
3,"Mariam Nagar, Purnia - BSPCB",Purnia,IN,co,1180.0,2022-05-05 13:45:00+00:00,µg/m³,caaqm,0.25,25.366336,87.117468,POINT(25.366336 0.25)
4,"Urban Estate-II, Hisar - HSPCB",Hisar,IN,co,710.0,2022-05-15 22:45:00+00:00,µg/m³,caaqm,0.25,29.14056,75.744941,POINT(29.14056 0.25)
