In [None]:
import vectice as vct

# Connect to Vectice using the settings in the local "token_i.json" config file.
# The returned object is used later to open the "Data Collection" phase.
# NOTE(review): presumably the config file holds the API token and target project — confirm.
vec_project = vct.connect(config="token_i.json")

### Read datasets:
 - PTY_ID_MAIN - From our BigQuery Dev environment
 - HIST_TRANS - From our BigQuery Dev environment
 - LuxAir_Accts - S3
 - OFAC_SDN - S3

In [None]:
# Connect to BigQuery Dev using Service Account
from google.cloud import bigquery
from google.oauth2 import service_account
# Build service-account credentials from the local key file, limited to the cloud-platform scope.
creds = service_account.Credentials.from_service_account_file(
    "bq_dev_sa.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# The client targets the project named in the service-account credentials.
bigquery_client = bigquery.Client(project=creds.project_id, credentials=creds)

Query PTY_ID_MAIN and HIST_TRANS from our Dev BigQuery environment. We retrieve the full tables here; as per compliance, non-US customers will need to be removed from the result sets.

In [None]:
# SQL selecting every column and row of the PTY_ID_MAIN table
qry_PTY_ID_MAIN = (
    "SELECT * FROM `solutions-engineering-363108.CUST_PTY_INFO.PTY_ID_MAIN`"
)
# Submit the query, then materialize its result as a pandas DataFrame
Query_Results = bigquery_client.query(qry_PTY_ID_MAIN)
df_PTY_ID_MAIN = Query_Results.to_dataframe()
# Preview the first rows of the result
df_PTY_ID_MAIN.head()

In [None]:
# SQL selecting every column and row of the HIST_TRANS table
qry_HIST_TRX = (
    "SELECT * FROM `solutions-engineering-363108.HIST_CUST_INFO.HIST_TRANS`"
)
# Submit the query, then materialize its result as a pandas DataFrame
Query_Results = bigquery_client.query(qry_HIST_TRX)
df_HIST_TRX = Query_Results.to_dataframe()
# Preview the first rows of the result
df_HIST_TRX.head()

Reading the two external files from our S3 bucket.

In [None]:
# Read the external files from S3
# Create connection
from boto3 import client
from botocore import UNSIGNED
from botocore.client import Config
import s3fs

# Unsigned (anonymous) S3 client for the us-west-1 region
s3_client = client(
    's3',
    region_name='us-west-1',
    config=Config(signature_version=UNSIGNED),
)


In [None]:
import pandas as pd
# Load the two external CSV files from the bucket into DataFrames,
# using an anonymous s3fs filesystem handle.
s3 = s3fs.S3FileSystem(anon=True)

with s3.open("vectice-examples/Samples Data/LuxAir_Accts.csv", mode="rb") as luxair_file:
    df_LuxAir_Accts = pd.read_csv(luxair_file)

with s3.open("vectice-examples/Samples Data/OFAC_SDN.csv", mode="rb") as ofac_file:
    df_OFAC_SDN = pd.read_csv(ofac_file)

Document my findings in Vectice

In [None]:
from vectice import Dataset, S3Resource
from vectice.models.resource import BigQueryResource

# Open the "Data Collection" phase, then request an iteration from it.
data_collection_phase = vec_project.phase("Data Collection")
iteration = data_collection_phase.iteration()

In [None]:
# BigQuery-backed resources: one per table queried earlier in the notebook.
vct_PTY_ID_MAIN = BigQueryResource(
    bq_client=bigquery_client,
    path="solutions-engineering-363108.CUST_PTY_INFO.PTY_ID_MAIN",
)
vct_HIST_TRX = BigQueryResource(
    bq_client=bigquery_client,
    path="solutions-engineering-363108.HIST_CUST_INFO.HIST_TRANS",
)

# S3-backed resources: one per external CSV file in the "vectice-examples" bucket.
vct_LuxAir_Accts = S3Resource(
    s3_client,
    bucket_name="vectice-examples",
    resource_path="Samples Data/LuxAir_Accts.csv",
)
vct_OFAC_SDN = S3Resource(
    s3_client,
    bucket_name="vectice-examples",
    resource_path="Samples Data/OFAC_SDN.csv",
)

In [None]:


# Documenting all four datasets used in the project.
# The first assignment starts the step's content; every later entry is appended with +=.
iteration.step_identify_datasets = Dataset.origin(
    name="PTY_ID_MAIN",
    resource=vct_PTY_ID_MAIN,
    dataframe=df_PTY_ID_MAIN,
    properties={"SQL": qry_PTY_ID_MAIN},
)
iteration.step_identify_datasets += Dataset.origin(
    name="HIST_TRANSACTIONS",
    resource=vct_HIST_TRX,
    dataframe=df_HIST_TRX,
    properties={"SQL": qry_HIST_TRX},
)
iteration.step_identify_datasets += Dataset.origin(
    name="LuxAir_Accts",
    resource=vct_LuxAir_Accts,
    dataframe=df_LuxAir_Accts,
)
iteration.step_identify_datasets += Dataset.origin(
    name="OFAC_SDN",
    resource=vct_OFAC_SDN,
    dataframe=df_OFAC_SDN,
)

# Append the comment with += as well: a plain "=" here would re-assign the step
# and risks replacing the four datasets logged just above.
# NOTE(review): relies on Vectice accepting strings with += on a step — confirm against the installed version.
iteration.step_identify_datasets += "We have identified the proper datasets for this project. \nTwo of the datasets (\"LuxAir_Accts\" and \"OFAC_SDN\") are coming from external sources and are dropped weekly on our S3 bucket. These files will need to be automated."

Capture data summary - Describe data, check for N/A, etc...

In [None]:
# Display descriptive statistics for the PTY_ID_MAIN DataFrame.
df_PTY_ID_MAIN.describe()

In [None]:
# Number of rows (observations) in PTY_ID_MAIN
len(df_PTY_ID_MAIN.index)


In [None]:
# Number of columns (features) in PTY_ID_MAIN
len(df_PTY_ID_MAIN.columns)

In [None]:
# Total count of missing values across every column of PTY_ID_MAIN
df_PTY_ID_MAIN.isna().sum().sum()

In [None]:
# Display descriptive statistics for the HIST_TRX DataFrame.
df_HIST_TRX.describe()

In [None]:
# Number of rows (observations) in HIST_TRX
len(df_HIST_TRX.index)

In [None]:
# Number of columns (features) in HIST_TRX
len(df_HIST_TRX.columns)

In [None]:
# Total count of missing values across every column of HIST_TRX
df_HIST_TRX.isna().sum().sum()

In [None]:
# Log insights in Vectice
# One summary line per dataset, all built from the same template so the
# separators (" - ") are consistent across rows, columns and null counts.
summary_lines = [
    "{}: Observations: {} - Features: {} - # of null values: {}".format(
        name, df.shape[0], df.shape[1], df.isnull().sum().sum()
    )
    for name, df in [
        ("PTY_ID_MAIN", df_PTY_ID_MAIN),
        ("HIST_TRX", df_HIST_TRX),
        ("LuxAir_Accts", df_LuxAir_Accts),
        ("OFAC_SDN", df_OFAC_SDN),
    ]
]
msg = "\nSize of Original Dataset:\n" + "\n".join(summary_lines)

# Record the review note together with the per-dataset size summary on the step.
iteration.step_describe_data = "The data properties have been reviewed for the datasets identified\n" + msg

Visualize the data

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sb

# Box plots of PTY_ID_MAIN, one subplot per column on a 2x2 grid; saved before
# plt.show() so the figure is still current when it is written to disk.
df_PTY_ID_MAIN.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.savefig("PTY_ID_MAIN_boxplot.jpg")
plt.show()

# Histograms of PTY_ID_MAIN. plt.savefig() returns None, so its result is
# deliberately not bound to a variable.
df_PTY_ID_MAIN.hist()
plt.savefig("PTY_ID_MAIN_histogram.jpg")
plt.show()

In [None]:
print("HIST_TRX Visualizations:")

# Box plots of HIST_TRX, one subplot per column on a 2x2 grid; saved before
# plt.show() so the figure is still current when it is written to disk.
df_HIST_TRX.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.savefig("HIST_TRX_boxplot.jpg")
plt.show()

# Histograms of HIST_TRX. plt.savefig() returns None, so its result is
# deliberately not bound to a variable.
df_HIST_TRX.hist()
plt.savefig("HIST_TRX_histogram.jpg")
plt.show()

In [None]:
print("LuxAir_Accts Visualizations:")

# Box plots of LuxAir_Accts, one subplot per column on a 2x2 grid; saved before
# plt.show() so the figure is still current when it is written to disk.
df_LuxAir_Accts.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.savefig("LuxAir_Accts_boxplot.jpg")
plt.show()

# Histograms of LuxAir_Accts. plt.savefig() returns None, so its result is
# deliberately not bound to a variable.
df_LuxAir_Accts.hist()
plt.savefig("LuxAir_Accts_histogram.jpg")
plt.show()

In [None]:
# Capture the visualizations in Vectice
# Only the files actually written by the plotting cells are logged: each dataset
# has a "<name>_boxplot.jpg" and a "<name>_histogram.jpg". No cell produces a
# "<name>_plot.jpg", so that name is not referenced here.
# NOTE(review): relies on Vectice accepting file-path strings with += on a step — confirm against the installed version.
iteration.step_explore_data = "PTY_ID_MAIN_boxplot.jpg"
iteration.step_explore_data += "PTY_ID_MAIN_histogram.jpg"

iteration.step_explore_data += "HIST_TRX_boxplot.jpg"
iteration.step_explore_data += "HIST_TRX_histogram.jpg"

iteration.step_explore_data += "LuxAir_Accts_boxplot.jpg"
iteration.step_explore_data += "LuxAir_Accts_histogram.jpg"



In [None]:
# Finish the "Data Collection" iteration now that every step has been logged.
# NOTE(review): presumably this marks the iteration as completed in Vectice — confirm.
iteration.complete()