In [None]:
from vectice import connect

# Connect to Vectice using the settings stored in token.json, then open a new
# iteration inside the "Modeling" phase; later cells attach their artifacts
# to `iteration`.
vec_project = connect(config="token.json")
modeling_phase = vec_project.phase("Modeling")
iteration = modeling_phase.create_iteration()

# Vectice auto-documents your model

### Read dataset for modeling:
 - modelingdata - Saved from DataPrep to BQ


In [None]:
# Authenticate against BigQuery Dev with a service-account key file
from google.cloud import bigquery
from google.oauth2 import service_account

creds = service_account.Credentials.from_service_account_file(
    "bq_dev_sa.json",
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

# The client is bound to the project id carried by the credentials.
bigquery_client = bigquery.Client(project=creds.project_id, credentials=creds)

Query PTY_ID_MAIN and HIST_TRX from our Dev BigQuery environment. Because we retrieve the full tables, we will need to remove non-US customers from the result sets, as required by compliance.

In [None]:
# Full read of the ModelingData table
qry_ModelingDS = (
    "SELECT * "
    "FROM `solutions-engineering-363108.FFBank_CoBranded_project.modelingdata`"
)
# Submit the query, then materialize its rows as a pandas DataFrame
Query_Results = bigquery_client.query(qry_ModelingDS)
df_qry_ModelingDS = Query_Results.to_dataframe()
# Preview the first few rows of the result
df_qry_ModelingDS.head()

Split the modeling dataset into training, testing and validation datasets

In [None]:
# import some essential math libraries
import pandas as pd; import matplotlib.pyplot as plt; import numpy as np
import plotly.offline as py; from matplotlib import pyplot as plt
import IPython.display
%matplotlib inline
py.init_notebook_mode(connected=True)

# load scikit-learn modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
import seaborn as sb

# specify how much of the dataset to set aside for testing
test_size = 0.40
# specify a seed value so we can always generate the same split
random_state = 42

# Generate X_train, X_test, y_train, y_test, which we will need for modeling
X = df_qry_ModelingDS.drop(['accepted_offer'], axis=1)
y = df_qry_ModelingDS["accepted_offer"]
X_train, X_test, y_train, y_test =\
train_test_split(X, y, test_size = test_size, random_state = random_state)

# save the 3 split datasets locally
X_train.to_csv("traindataset.csv")
X_test.to_csv("testdataset.csv")
y_test.to_csv("validatedataset.csv")

In [None]:
# Summarize the split (sizes, hold-out share, seed) and attach the text to the
# "generate test design" step of the current iteration.
msg = (
    "We split the dataset in a training, testing and validation datasets. "
    f"{test_size * 100}% of the data is set aside for testing.\n "
    f"- Training dataset size: {len(X_train)}\n "
    f"- Testing dataset size: {len(X_test)}\n "
    f"- Validation dataset size: {len(y_test)}\n"
    f"Our seed to generate repeatable datasets is {random_state}"
)
iteration.step_generate_test_design = msg

## Create our Linear Regression model

In [None]:
# create a linear regression model
model_linreg = LinearRegression()

# Coerce features and targets to numeric. Values that cannot be parsed become
# NaN (errors='coerce') and are then replaced by 0, so non-numeric columns end
# up as all zeros.
# NOTE(review): confirm that zero-filling is the intended treatment.
# Chained, non-inplace assignments keep this cell safe to re-run.
X_train = X_train.apply(pd.to_numeric, errors='coerce').fillna(0)
X_test = X_test.apply(pd.to_numeric, errors='coerce').fillna(0)
y_train = pd.to_numeric(y_train, errors='coerce').fillna(0)
# y_test gets the same treatment as y_train; otherwise a NaN or non-numeric
# label in the hold-out set would make the metric functions below raise.
y_test = pd.to_numeric(y_test, errors='coerce').fillna(0)

model_linreg.fit(X_train.values, y_train.values)

# evaluate: RMSE and MAE of the predictions on the held-out rows
pred = model_linreg.predict(X_test.values)
RMSE = np.sqrt(mean_squared_error(y_test.values, pred))
MAE = mean_absolute_error(y_test.values, pred)

# the metrics object holds our two key summary statistics
summary_stats = {"RMSE": RMSE, "MAE": MAE}

# finally, draw a plot of the first feature: training points in green and
# predictions on the test rows in black. Both layers use the same feature
# column so they share one x-axis; the test rows are sorted by that feature so
# the line runs left to right instead of zig-zagging. The figure is not
# written to disk here.
plt.ioff()
first_feature_test = X_test.iloc[:, 0].values
plot_order = np.argsort(first_feature_test)
plt.scatter(X_train.iloc[:, 0].values, y_train, color='g')
plt.plot(first_feature_test[plot_order], pred[plot_order], color='k')

# Vectice Documentation Pipeline

In [31]:
from vectice import FileResource, Dataset, Model

# Record the reason for the chosen technique on the matching iteration step
iteration.step_select_modeling_techniques = (
    "For this first iteration we are going to use a Linear Regression model "
    "to get a base model."
)

Added Comment to Step: Select Modeling Techniques

Link to Step: https://app.vectice.com/project/phase/iteration?w=1952&iterationId=6528



In [32]:
# Pair each CSV written by the split cell with its in-memory DataFrame
training_files = FileResource(paths="traindataset.csv", dataframes=X_train)
testing_files = FileResource(paths="testdataset.csv", dataframes=X_test)

# Register both resources as one modeling dataset and attach it to the
# "generate test design" step.
dataset = Dataset.modeling(
    name="Modeling dataset",
    training_resource=training_files,
    testing_resource=testing_files,
)
iteration.step_generate_test_design = dataset

File: traindataset.csv wrapped successfully.
File: testdataset.csv wrapped successfully.
New Version: 'Version 5' of Dataset: 'Modeling dataset' added to Step: Generate Test Design
Attachments: None
Link to Step: https://app.vectice.com/project/phase/iteration?w=1952&iterationId=6528



In [33]:
# Catalog the model: metrics, image attachments, the fitted estimator, and a
# lineage link to the dataset version it was derived from.
model_attachments = ["26_LiftChart_holdout.png", "Confusion_Matrix.png"]
model = Model(
    name="AcceptancePredictor",
    library="scikit-learn",
    technique="linear regression",
    metrics=summary_stats,
    attachments=model_attachments,
    predictor=model_linreg,
    derived_from=[dataset.latest_version_id],
)
iteration.step_build_model = model

# Mark the iteration as complete once the model has been attached.
iteration.complete()

Model LinearRegression successfully attached to Model(name='AcceptancePredictor', version='Version 16').
New Version: 'Version 16' of Model: 'AcceptancePredictor' added to Step: Build Model
Attachments: 26_LiftChart_holdout.png, Confusion_Matrix.png
Link to Step: https://app.vectice.com/project/phase/iteration?w=1952&iterationId=6528

Iteration with index 18 completed.

For quick access to the Iteration in the Vectice web app, visit:
https://app.vectice.com/project/phase/iteration?w=1952&iterationId=6528
