In [11]:
from vectice import connect
vec_project = connect(config="token.json")
iteration = vec_project.phase("Modeling").create_iteration()

VECTICE_API_ENDPOINT is deprecated and will be removed in 23.3.1.0, please use VECTICE_HOST instead.
Welcome, 'Eric Barre'. You`re now successfully connected to the project '_FFBank & LuxAir Co-Branded Credit Card Initial offer' in Vectice.

To access a specific phase, use [1mproject[0m.phase(Phase ID)
To get a list of phases you can access and their IDs, use [1mproject[0m.list_phases()

For quick access to the list of phases in the Vectice web app, visit:
https://app.vectice.com/browse/project/PRJ-189
Phase 'Modeling' successfully retrieved."

For quick access to the Phase in the Vectice web app, visit:
https://app.vectice.com/browse/phase/PHA-1074
New Iteration number 33 created.

For quick access to the Iteration in the Vectice web app, visit:
https://app.vectice.com/browse/iteration/ITR-2054


# Vectice auto-document your model

### Read dataset for modeling:
 - modelingdata - Saved from DataPrep to BQ


In [12]:
# Connect to BigQuery Dev using Service Account
from google.cloud import bigquery
from google.oauth2 import service_account
creds = service_account.Credentials.from_service_account_file("bq_dev_sa.json", scopes=["https://www.googleapis.com/auth/cloud-platform"])

bigquery_client = bigquery.Client(
    credentials= creds,
    project=creds.project_id
)

Query PTY_ID_MAIN and HIST_TRX from our Dev BigQUery env. Retrieving full tables, we will need to remove non US customers from the resultsets as per compliance.

In [13]:
# Query ModelingData table
qry_ModelingDS = "SELECT * FROM `solutions-engineering-363108.FFBank_CoBranded_project.modelingdata`"
#Run the query and write result to a pandas data frame
Query_Results = bigquery_client.query(qry_ModelingDS)
df_qry_ModelingDS = Query_Results.to_dataframe()
#View top few rows of result
df_qry_ModelingDS.head()

Unnamed: 0,Customer_PTY_Id,Customer_Surname,Customer_Given_Name,Customer_Email,Customer_Gender,Customer_DOB,Customer_POB,Customer_Street_Address,Customer_State_Address,Customer_Zip_Address,...,CUST_COUNTRY,CUST_CTRY_CD,CUST_PHONE,YTD_STATUS,YTD_MILES,YTD_SEGMENTS,ORI_AIRPORT,CUST_F_NAME,CUST_L_NAME,accepted_offer
0,28-5818843,Gebuhr,Isaiah,igebuhrlj@so-net.ne.jp,Male,1941-04-28,San Jose,70 2nd Park,CA,95173,...,Mexico,MX,155-451-3933,Platinum,5099270,31,CA-BC,Humfried,Minister,1
1,28-5818843,Gebuhr,Isaiah,igebuhrlj@so-net.ne.jp,Male,1941-04-28,San Jose,70 2nd Park,CA,95173,...,Mexico,MX,155-451-3933,Platinum,5099270,31,CA-BC,Humfried,Minister,1
2,28-5818843,Gebuhr,Isaiah,igebuhrlj@so-net.ne.jp,Male,1941-04-28,San Jose,70 2nd Park,CA,95173,...,Mexico,MX,155-451-3933,Platinum,5099270,31,CA-BC,Humfried,Minister,1
3,28-5818843,Gebuhr,Isaiah,igebuhrlj@so-net.ne.jp,Male,1941-04-28,San Jose,70 2nd Park,CA,95173,...,Mexico,MX,155-451-3933,Platinum,5099270,31,CA-BC,Humfried,Minister,1
4,28-5818843,Gebuhr,Isaiah,igebuhrlj@so-net.ne.jp,Male,1941-04-28,San Jose,70 2nd Park,CA,95173,...,Mexico,MX,155-451-3933,Platinum,5099270,31,CA-BC,Humfried,Minister,1


Split the modeling dataset into training, testing and validation datasets

In [14]:
# import some essential math libraries
import pandas as pd; import matplotlib.pyplot as plt; import numpy as np
import plotly.offline as py; from matplotlib import pyplot as plt
import IPython.display
%matplotlib inline
py.init_notebook_mode(connected=True)

# load scikit-learn modeling packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
import seaborn as sb

# specify how much of the dataset to set aside for testing
test_size = 0.42
# specify a seed value so we can always generate the same split
random_state = 58

# Generate X_train, X_test, y_train, y_test, which we will need for modeling
X = df_qry_ModelingDS.drop(['accepted_offer'], axis=1)
y = df_qry_ModelingDS["accepted_offer"]
X_train, X_test, y_train, y_test =\
train_test_split(X, y, test_size = test_size, random_state = random_state)

# save the 3 split datasets locally
X_train.to_csv("traindataset.csv")
X_test.to_csv("testdataset.csv")
y_test.to_csv("validatedataset.csv")

In [15]:
# Capture model dataset insights
msg = f"We split the dataset in a training, testing and validation datasets. "\
      f"{test_size * 100}% of the data is set aside for testing.\n "\
      f"- Training dataset size: {X_train.shape[0]}\n "\
      f"- Testing dataset size: {X_test.shape[0]}\n "\
      f"- Validation dataset size: {y_test.shape[0]}\n"\
      f"Our seed to generate repeatable datasets is {random_state}"
iteration.step_generate_test_design = msg

Added Comment to Step: Generate Test Design

Link to Step: https://app.vectice.com/browse/iteration/ITR-2054



## Create our Linear Regression model

In [16]:
# create a linear regression model
model_linreg = LinearRegression()

X_train = X_train.apply(pd.to_numeric, errors='coerce')
y_train = y_train.apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(pd.to_numeric, errors='coerce')

X_train.fillna(0, inplace=True)
y_train.fillna(0, inplace=True)
X_test.fillna(0, inplace=True)

model_linreg.fit(X_train.values, y_train.values)

# evaluate, define and save the RMSE and MAE summary statistics
pred = model_linreg.predict(X_test.values)    
RMSE = np.sqrt(mean_squared_error(y_test.values, pred))
MAE = mean_absolute_error(y_test.values, pred)

# the metrics object holds our two key summary statistics
summary_stats = {"RMSE": RMSE, "MAE": MAE}

# finally, generate and save a pretty plot
plt.ioff()
plt.scatter(X_train.iloc[:,0].values, y_train ,color='g')
plt.plot(X_test, pred,color='k')

[<matplotlib.lines.Line2D at 0x7f9618ca5c40>,
 <matplotlib.lines.Line2D at 0x7f9618ca5be0>,
 <matplotlib.lines.Line2D at 0x7f9618ca5850>,
 <matplotlib.lines.Line2D at 0x7f9618ca5ac0>,
 <matplotlib.lines.Line2D at 0x7f9618ca5880>,
 <matplotlib.lines.Line2D at 0x7f9618ca5eb0>,
 <matplotlib.lines.Line2D at 0x7f9618ca5d90>,
 <matplotlib.lines.Line2D at 0x7f9618ca55e0>,
 <matplotlib.lines.Line2D at 0x7f9618ca5a30>,
 <matplotlib.lines.Line2D at 0x7f9618ca5670>,
 <matplotlib.lines.Line2D at 0x7f9618ca58b0>,
 <matplotlib.lines.Line2D at 0x7f962092d100>,
 <matplotlib.lines.Line2D at 0x7f95e97bed00>,
 <matplotlib.lines.Line2D at 0x7f95e97becd0>,
 <matplotlib.lines.Line2D at 0x7f95e97bed90>,
 <matplotlib.lines.Line2D at 0x7f95f90d9070>,
 <matplotlib.lines.Line2D at 0x7f96208d7fd0>,
 <matplotlib.lines.Line2D at 0x7f96208d7ca0>,
 <matplotlib.lines.Line2D at 0x7f95f90961f0>,
 <matplotlib.lines.Line2D at 0x7f963a078dc0>,
 <matplotlib.lines.Line2D at 0x7f963a078d60>,
 <matplotlib.lines.Line2D at 0x7f9

# Vectice Documentation Pipeline

In [17]:
from vectice import FileResource, Dataset, Model
iteration.step_select_modeling_techniques = "For this first iteration we are going to use a Linear Regression model to get a base model."

Added Comment to Step: Select Modeling Techniques

Link to Step: https://app.vectice.com/browse/iteration/ITR-2054



In [18]:
dataset = Dataset.modeling(
    name="Modeling dataset", 
    training_resource=FileResource(paths="traindataset.csv", dataframes=X_train), 
    testing_resource=FileResource(paths="testdataset.csv", dataframes=X_test)
    )
iteration.step_generate_test_design = dataset

File: traindataset.csv wrapped successfully.
File: testdataset.csv wrapped successfully.
New Version: 'Version 10' of Dataset: 'Modeling dataset' added to Step: Generate Test Design
Attachments: None
Link to Step: https://app.vectice.com/browse/iteration/ITR-2054



In [19]:
# Catalog the model

model = Model(
    name="AcceptancePredictor", 
    library= "scikit-learn", 
    technique="linear regression", 
    metrics=summary_stats, 
    attachments=["26_LiftChart_holdout.png","Confusion_Matrix.png"], 
    predictor=model_linreg, 
    derived_from=[dataset.latest_version_id]
    )
iteration.step_build_model = model
iteration.complete()

Model LinearRegression successfully attached to Model(name='AcceptancePredictor', version='Version 30').
New Version: 'Version 30' of Model: 'AcceptancePredictor' added to Step: Build Model
Attachments: 26_LiftChart_holdout.png, Confusion_Matrix.png
Link to Step: https://app.vectice.com/browse/iteration/ITR-2054

Iteration with index 33 completed.

For quick access to the Iteration in the Vectice web app, visit:
https://app.vectice.com/browse/iteration/ITR-2054
