In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

from superwise import Superwise
from superwise.models.task import Task
from superwise.models.version import Version
from superwise.resources.superwise_enums import DataTypesRoles


### Initialize the Superwise client

In [3]:
# Create the Superwise SDK client used by every `sw.*` call below.
# NOTE(review): no arguments are passed, so credentials are presumably read
# from the environment or SDK configuration -- confirm before running.
sw = Superwise()

### Create a model (task) in Superwise

In [5]:
# Register the task with Superwise. The object returned by `create` replaces
# the local description so that `diamond_task.id` can be used further down.
diamond_task = sw.task.create(
    Task(
        name="Diamond Model",
        description="Regression model which predict the diamond price",
    )
)
print(f"New task Created - {diamond_task.id}")

1


### Load data

In [6]:
# Source of the raw dataset (CSV export hosted on OpenML).
DIAMONDS_CSV_URL = 'https://www.openml.org/data/get_csv/21792853/dataset'

diamonds = pd.read_csv(DIAMONDS_CSV_URL)
diamonds.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


### Split Train-Test

In [7]:
# Separate the target column from the features, then hold out 30% of the rows
# for the "production" simulation later on. The fixed seed keeps the split
# reproducible across runs.
target_col = "price"
y = diamonds[target_col]
X = diamonds.drop(columns=target_col)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z
19497,1.21,Ideal,H,VVS2,61.3,57.0,6.92,6.87,4.23
31229,0.31,Ideal,E,VS2,62.0,56.0,4.38,4.36,2.71
22311,1.21,Ideal,E,VS1,62.4,57.0,6.75,6.83,4.24
278,0.81,Ideal,F,SI2,62.6,55.0,5.92,5.96,3.72
6646,0.79,Ideal,I,VVS2,61.7,56.0,5.94,5.95,3.67


### Pre-processing

In [8]:
# String-valued columns that must be one-hot encoded before the regression.
categorical_cols = ['cut', 'clarity', 'color']

# One-hot encode the categorical columns; every other column is passed
# through to the model unchanged.
one_hot_step = ('categorical', OneHotEncoder(), categorical_cols)
preprocessor = ColumnTransformer(
    transformers=[one_hot_step],
    remainder='passthrough',
)

### Train your model

In [9]:
# Chain the preprocessing and the estimator so both are fitted (and later
# applied) together on the raw feature frame.
diamond_price_model = LinearRegression()

pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', diamond_price_model),
]
my_pipeline = Pipeline(steps=pipeline_steps)

# `fit` is the last expression so the fitted pipeline is displayed.
my_pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('categorical',
                                                  OneHotEncoder(),
                                                  ['cut', 'clarity',
                                                   'color'])])),
                ('model', LinearRegression())])

### Predict

In [10]:
# Predictions on the training set; they become the `prediction` column of the
# baseline dataset below.
y_pred_train = my_pipeline.predict(X_train)

### Create Version to monitor in Superwise

- Prepare Baseline Dataset

In [11]:
# Baseline = training features + model prediction + timestamp + true price.
# The original row index is turned into an explicit `id` column.
baseline_data = (
    X_train
    .assign(prediction=y_pred_train, ts=pd.Timestamp.now(), price=y_train)
    .astype({"prediction": float})
    .reset_index()
    .rename(columns={"index": "id"})
)
baseline_data.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,prediction,ts,price
0,19497,1.21,Ideal,H,VVS2,61.3,57.0,6.92,6.87,4.23,8060.0,2021-12-04 18:27:20.213421,8131
1,31229,0.31,Ideal,E,VS2,62.0,56.0,4.38,4.36,2.71,613.0,2021-12-04 18:27:20.213421,756
2,22311,1.21,Ideal,E,VS1,62.4,57.0,6.75,6.83,4.24,8568.0,2021-12-04 18:27:20.213421,10351
3,278,0.81,Ideal,F,SI2,62.6,55.0,5.92,5.96,3.72,3037.5,2021-12-04 18:27:20.213421,2795
4,6646,0.79,Ideal,I,VVS2,61.7,56.0,5.94,5.95,3.67,3866.5,2021-12-04 18:27:20.213421,4092


- Infer schema

In [None]:
# Infer the schema (data entities) of the baseline frame. Columns listed in
# `specific_roles` are given an explicit role.
# NOTE(review): the remaining columns are presumably classified by Superwise
# itself -- confirm against the SDK documentation.
#
# The call goes through `sw`, the client created at the top of the notebook;
# the visible cells never define a `client` name.
entities_collection = sw.data_entity.summarise(
    data=baseline_data,
    specific_roles={
        'id': DataTypesRoles.ID,
        'ts': DataTypesRoles.TIMESTAMP,
        # The model is a regressor: `prediction` holds a predicted price,
        # not a class probability.
        'prediction': DataTypesRoles.PREDICTION_VALUE,
        'price': DataTypesRoles.LABEL,
    },
)

- Create Version

In [None]:
# Describe version 1.0.0 of the task, using the schema inferred from the
# baseline data. `task_id` must be the task's identifier, not the Task object.
new_version = Version(
    task_id=diamond_task.id,
    name="1.0.0",
    data_entities=entities_collection,
)

# Register the version with the same `sw` client used everywhere else, then
# activate it. The object returned by `create` carries the id needed for
# activation and for logging predictions later on.
new_version = sw.version.create(new_version)
sw.version.activate(new_version.id)

### Predict production data

- Predict

In [12]:
# Score the held-out rows; these stand in for production traffic.
y_test_pred = my_pipeline.predict(X_test)

- Log predictions in Superwise

In [13]:
# Production-like records: held-out features + prediction + timestamp, with
# the original row index exposed as the `id` column (no label yet).
ongoing_prediction = (
    X_test
    .assign(prediction=y_test_pred, ts=pd.Timestamp.now())
    .astype({"prediction": float})
    .reset_index()
    .rename(columns={"index": "id"})
)
ongoing_prediction.head()

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,prediction,ts
0,1388,0.24,Ideal,G,VVS1,62.1,56.0,3.97,4.0,2.47,718.0,2021-12-04 18:27:34.536187
1,50052,0.58,'Very Good',F,VVS2,60.0,57.0,5.44,5.42,3.26,3192.0,2021-12-04 18:27:34.536187
2,41645,0.4,Ideal,E,VVS2,62.1,55.0,4.76,4.74,2.95,1951.5,2021-12-04 18:27:34.536187
3,42377,0.43,Premium,E,VVS2,60.8,57.0,4.92,4.89,2.98,2083.0,2021-12-04 18:27:34.536187
4,17244,1.55,Ideal,E,SI2,62.3,55.0,7.44,7.37,4.61,9876.0,2021-12-04 18:27:34.536187


In [None]:
# Send the production predictions to Superwise.
# - `version_id` is the identifier of the created version, not the Version
#   object itself.
# - `orient="records"` yields one dict per row; the default `to_dict()`
#   returns a {column: {index: value}} mapping, which is not a list of records.
transaction_id = sw.transaction.log_batch(
    task_id=diamond_task.id,
    version_id=new_version.id,
    records=ongoing_prediction.to_dict(orient="records"),
)

print(transaction_id)

- Log ground truth

In [14]:
# Ground truth for the held-out rows: the original row index becomes the
# `record_id` column, next to the true `price`.
ongoing_label = y_test.rename_axis("record_id").reset_index()
ongoing_label.head()

Unnamed: 0,record_id,price
0,1388,559
1,50052,2201
2,41645,1238
3,42377,1304
4,17244,6901


In [None]:
# Send the ground-truth labels to Superwise, one dict per row.
# `orient="records"` is required: the default `to_dict()` returns a
# {column: {index: value}} mapping rather than a list of records.
# NOTE(review): `ongoing_label` keys rows by `record_id`, whereas the schema's
# ID column is named `id` -- confirm which key Superwise expects for labels.
transaction_id = sw.transaction.log_batch(
    task_id=diamond_task.id,
    records=ongoing_label.to_dict(orient="records"),
)

print(transaction_id)