## Pay transparency data science round trip

In [1]:
from datetime import date, datetime
import numpy as np
from visier_api_data_out import QueryFilterDTO
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer

from visier_api import VisierApi
from model import Model

In [2]:
# Path of the CSV file used later in the notebook. The file name must be the
# analytic object name, and the file must be in CSV format.
FILE_PATH = './data/Employee.csv'
# End time used for the list query.
QUERY_END_TIME = date(year=2022, month=5, day=1)

# Client used for every Visier API call in this notebook.
visier_api: VisierApi = VisierApi()

# Prepare the input data

To explore what data is available in a tenant, we can list the properties of an analytic object with the `DataModelApi`.

In [3]:
# List the properties of the 'Employee' analytic object, then show only the
# entry whose id is "Employee.EmployeeID".
schema = visier_api.list_data_model_properties(analytic_object='Employee').properties
[prop for prop in schema if prop.id == "Employee.EmployeeID"]

[PropertyDTO(data_type='String', description='The unique identifier for the employee.', display_name='Employee ID', id='Employee.EmployeeID', parameters=None, primitive_data_type='String', tags=None)]

Then we can compose a list query execution DTO for prediction analysis.

We query the attributes that we are interested in as feature attributes:
- isFemale, Employee.Internal_Job_Experience, Highest_Education_Level_Achieved, Internal_Experience, External_Experience

and the training target attribute:
- Market_Direct_Compensation

In [4]:
# Fetch data from the public API.
# Maps each output column name (displayName) to the attribute it is read from.
column_attributes = {
    "EmployeeID": "EmployeeID",
    "IsWoman": "isFemale",
    "TimeInPosition": "Employee.Internal_Job_Experience",
    "Education": "Highest_Education_Level_Achieved",
    "OtherTenure": "Internal_Experience",
    "ExternalExperience": "External_Experience",
    "DirectCompensation": "Market_Direct_Compensation",
}

execution = visier_api.compose_list_query_execution_dto(
    columns=[
        {"displayName": display_name, "attribute": attribute}
        for display_name, attribute in column_attributes.items()
    ],
    end_time=QUERY_END_TIME.strftime("%Y-%m-%d"),
    # Restrict the query with the Currency_Code="USD" formula filter.
    filters=[QueryFilterDTO(formula='Currency_Code="USD"')],
)

# Download the query result into a dataframe and preview the first rows.
df = visier_api.download_data(execution)
df.head(5)

Unnamed: 0,EmployeeID,IsWoman,TimeInPosition,Education,OtherTenure,ExternalExperience,DirectCompensation
0,Employee-29191448,True,0,,0,93,53506.95
1,Employee-121619,False,51,Bachelor,36,242,192529.79
2,Employee-13488561,False,13,,12,78,162770.92
3,Employee-25314844,False,5,,0,119,93397.76
4,Employee-22613614,False,10,,0,132,64003.77


Slice the dataframe into the prediction model input (`X`) and output (`y`).

In [5]:
# Feature columns fed to the model, kept in an ordered, immutable tuple.
# A frozenset iterates in hash order, which changes between kernel restarts
# (string hash randomization), so the column order of X was not reproducible;
# pandas also rejects plain set indexers from 2.0 onwards.
PREDICTION_INPUT_COLUMNS = (
    'IsWoman',
    'Education',
    'TimeInPosition',
    'OtherTenure',
    'ExternalExperience',
)

# Selecting several columns requires a list: a tuple would be treated as a
# single column key.
X, y = df[list(PREDICTION_INPUT_COLUMNS)], df['DirectCompensation']

# Display the shapes as a quick sanity check.
X.shape, y.shape

((1860, 5), (1860,))

# Regression model
Define the regression model

In [6]:
## define feature processing pipeline
def _one_hot_branch(column):
    """Return a ColumnTransformer that one-hot encodes the single given column."""
    encoder = OneHotEncoder(handle_unknown="ignore", drop="first", dtype=np.float32)
    return ColumnTransformer([("one_hot", encoder, [column])])


def _scaled_branch(column):
    """Return a ColumnTransformer that standard-scales the single given column."""
    return ColumnTransformer([("scale", StandardScaler(), [column])])


# One branch per input column, each named after the column it transforms:
# one-hot encoding for the two categorical columns, scaling for the numeric ones.
features = FeatureUnion(
    [(column, _one_hot_branch(column)) for column in ("IsWoman", "Education")]
    + [
        (column, _scaled_branch(column))
        for column in ("TimeInPosition", "OtherTenure", "ExternalExperience")
    ]
)

model = Model(features)

# Model validation

Validate model and measure model performance

In [7]:
# Evaluate the model on X and y. The per-fold [CV] log lines and the dict of
# fit/score times and train/test metrics shown below are produced by this call.
model.evaluate(X,y)

[CV] START .....................................................................
[CV] END  explained_variance: (train=0.282, test=0.337) neg_mean_absolute_error: (train=-36866.416, test=-38208.443) neg_mean_squared_error: (train=-5488562886.118, test=-6180789174.319) r2: (train=0.268, test=0.324) total time=   0.0s
[CV] START .....................................................................
[CV] END  explained_variance: (train=0.341, test=0.253) neg_mean_absolute_error: (train=-36357.616, test=-38828.778) neg_mean_squared_error: (train=-6194645638.773, test=-4367417226.424) r2: (train=0.329, test=0.229) total time=   0.0s
[CV] START .....................................................................
[CV] END  explained_variance: (train=0.324, test=0.292) neg_mean_absolute_error: (train=-38070.202, test=-35562.473) neg_mean_squared_error: (train=-5123962993.144, test=-6691398035.895) r2: (train=0.308, test=0.282) total time=   0.0s


[Parallel(n_jobs=1)]: Done   1 tasks      | elapsed:    0.0s


{'fit_time': np.float64(0.009179910024007162),
 'score_time': np.float64(0.004883368810017903),
 'test_r2': np.float64(0.2784647984753941),
 'train_r2': np.float64(0.3017514582634106),
 'test_neg_mean_squared_error': np.float64(-5746534812.212598),
 'train_neg_mean_squared_error': np.float64(-5602390506.011748),
 'test_neg_mean_absolute_error': np.float64(-37533.23137003673),
 'train_neg_mean_absolute_error': np.float64(-37098.078018933076),
 'test_explained_variance': np.float64(0.29403552177286335),
 'train_explained_variance': np.float64(0.3156941644581215)}

# Model training & inference
Fit model and view coefficients

In [8]:
# Fit on the full dataset.
model.fit(X, y)

# Pull the fitted inner regressor out of the "regressor" step and display its
# coefficients as plain Python floats.
fitted_regressor = model["regressor"].regressor_
fitted_regressor.coef_.astype(float)

array([-0.01533487,  0.00240708,  0.15020224,  0.12551257,  0.2097119 ,
        0.19642017,  0.07297258,  0.11740238,  0.18639227])

Use the model for inference and assign the predictions to a new column in `df`

In [9]:
# Predict on the same X the model was fitted on and store the result as a new
# 'Predicted' column. This mutates df in place; the export cell further down
# reads this column.
df['Predicted'] = model.predict(X)

Save the model, so we can persist the coefficients and reproduce the results with the same input

In [10]:
# Persist the fitted model. The destination is decided inside Model.save();
# presumably a date-stamped pickle under data/, as in the example below --
# TODO confirm against model.py.
model.save()

# To load the model, simply:
# model2 = Model('data/model.2024-09-01.pkl')
#
# And to reproduce the result:
# model2.predict(X)

# Save Output

Get the project schema from the tenant

In [11]:
# Keep the names of the mandatory project-schema columns, plus any column
# whose name contains "Market_Direct_Compensation".
schema_columns = [
    column.name
    for column in visier_api.list_project_schema()
    if column.is_mandatory or "Market_Direct_Compensation" in column.name
]
schema_columns

['EmployeeID', 'Market_Direct_Compensation', 'EventDate']

Massage the prediction data so it matches the tenant's schema requirements.

In [None]:
# Convert the dataframe to the expected format
# CURRENT_TIME is not written to the output; it was only referenced by a
# disabled 'regressionRanDate' column.
CURRENT_TIME = datetime.now()

# Build a new frame with assign/rename instead of adding a column to (and
# renaming in place) a slice of df, which triggers SettingWithCopyWarning and
# makes the cell depend on pandas' copy-vs-view behaviour.
output = (
    df[['EmployeeID', 'Predicted']]
    # Every row gets the same event date, formatted as YYYY-MM-DD.
    .assign(EventDate=datetime(2022, 7, 1).strftime("%Y-%m-%d"))
    # The prediction is exported under the schema's compensation column name.
    .rename(columns={'Predicted': 'Market_Direct_Compensation'})
)

# The output must contain exactly the columns selected from the project schema.
assert set(output.columns) == set(schema_columns), (
    f"output columns {sorted(output.columns)} do not match "
    f"schema columns {sorted(schema_columns)}"
)

# Save the output
output.to_csv(FILE_PATH, index=False)

# Upload the result to the application

In [12]:
# Upload the CSV at FILE_PATH to the 'Employee' analytic object. The return
# value is kept in tx_id (presumably the upload transaction id -- confirm
# against VisierApi.upload_data).
tx_id = visier_api.upload_data(FILE_PATH, analytic_object_name='Employee')

Transaction status: SUCCEEDED, message: ReceivingDataJob scheduled
