In [7]:
from view_demo.utils import run_and_save, get_project_id

In [3]:
SRCROOT='../'

In [6]:
%%writefile {SRCROOT}/preprocess/create_dataset.py
import os
import datetime

import IPython
import IPython.display
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from google.cloud import bigquery
from google.cloud.exceptions import NotFound

import tempfile
import argparse
import sys
import os


# Command-line interface: the raw CSV location plus the BigQuery
# project/dataset/table that receive the preprocessed rows.
parser = argparse.ArgumentParser()
# There is no meaningful default input, so the flag is required; argparse then
# reports a clear usage error instead of pandas failing on a placeholder path.
parser.add_argument('--raw-dataset', dest='raw_dataset',
                    required=True, type=str, help='GCS Path to the raw dataset')
# Defaults to None: no project-lookup helper is imported by this script, so a
# computed default cannot be evaluated here.
parser.add_argument('--project-id', dest='project_id',
                    default=None, type=str, help='Project ID')
parser.add_argument('--target-dataset', dest='target_dataset',
                    default="view_dataset", type=str,
                    help='Name of the BQ Dataset where preprocessed data will be pushed')
# Adjacent string literals are concatenated verbatim, so the separating space
# has to be part of the first literal.
parser.add_argument('--target-table', dest='target_table',
                    default="weather_time_series", type=str,
                    help='Name of the table under '
                         'the BQ Dataset where preprocessed data will be pushed')
args = parser.parse_args()


# Unpack the parsed command-line arguments into the names used below.
csv_path = args.raw_dataset
dataset_id = args.target_dataset
table_id = args.target_table
project_id = args.project_id


# Read the dataset into a dataframe
df = pd.read_csv(csv_path)

# Convert to hourly dataset
# slice [start:stop:step], starting from index 5 take every 6th record.
# .copy() detaches the result from the frame returned by read_csv so the
# cleaning assignments below write to df itself, not to a temporary.
df = df[5::6].copy()


# Clean Data
# Replace the -9999.0 marker in both wind-velocity columns with 0.0.
# A single .loc assignment per column avoids chained indexing, which is not
# guaranteed to write through to the DataFrame.
for column in ('wv (m/s)', 'max. wv (m/s)'):
    df.loc[df[column] == -9999.0, column] = 0.0


# Rename columns to comply with BQ: each header containing spaces, parentheses,
# "%" or "/" is mapped to a name made only of letters, digits and underscores.
# NOTE(review): 'wv (m/s)' and 'max. wv (m/s)', which the cleaning step reads,
# have no entry in this mapping and keep their original names - confirm that
# this is intended.
bq_column_names = {
    "p (mbar)": "p__mbar",
    "T (degC)": "T__degC",
    "Tpot (K)": "Tpot__K",
    "Tdew (degC)": "Tdew__degC",
    "rh (%)": "rh__percent",
    "VPmax (mbar)": "VPmax__mbar",
    "VPact (mbar)": "VPact__mbar",
    "VPdef (mbar)": "VPdef__mbar",
    "sh (g/kg)": "sh__g_per_kg",
    "H2OC (mmol/mol)": "H2OC__mmol_per_mol",
    "rho (g/m**3)": "rho__gm_per_cubic_m",
    "max Wx": "max_Wx",
    "max Wy": "max_Wy",
    "Day sin": "Day_sin",
    "Day cos": "Day_cos",
    "Year sin": "Year_sin",
    "Year cos": "Year_cos",
}
# Keys that are not present among the columns are ignored by rename().
df.rename(columns=bq_column_names, inplace=True)

# Write to BQ
# One location is used for both the client and the load job.
bq_location = "us-central1"

# Honour --project-id; when it is None the client falls back to the project
# inferred from the environment.
client = bigquery.Client(project=project_id, location=bq_location)
print("Client created using project: {}".format(client.project))

# Reuse the target dataset if it exists, otherwise create it.
# The dataset's project defaults to the client's project if not specified.
try:
    dataset = client.get_dataset(dataset_id)  # Make an API request.
except NotFound:
    print("Dataset {} is not found, Creating..".format(dataset_id))
    dataset = client.create_dataset(dataset_id)
else:
    print("Dataset {} already exists".format(dataset_id))

table_ref = dataset.table(table_id)

# Overwrite the table if it already exists. The destination is not part of the
# load configuration; it is passed to the load call itself.
job_config = bigquery.LoadJobConfig(
    autodetect=True,
    write_disposition='WRITE_TRUNCATE',
)

# job_config must be handed to the load call, otherwise the write disposition
# configured above is never applied.
job = client.load_table_from_dataframe(
    df,
    table_ref,
    job_config=job_config,
    location=bq_location,
)
job.result()  # Waits for table load to complete.
print("Loaded dataframe to {}".format(table_ref.path))
