In [2]:
from os.path import exists
import pandas as pd

In [3]:
# Load the sales dataset: use a local 'train.csv' when present, otherwise download
# the zipped copy from GitHub.
# NOTE: the remote path must use GitHub's `/raw/` endpoint. The `/blob/` URL serves
# the HTML file-viewer page, which pandas can neither unzip nor parse as CSV.
df_path = (
    'train.csv'
    if exists('train.csv')
    else 'https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip'
)

# Build the working frame in one chain so re-running the cell always yields the
# same result (no in-place mutation of a previously displayed frame):
#   * down-sample to 19,000 rows with a fixed seed for reproducibility,
#   * cast `store` / `item` to strings and parse `date` into datetimes,
#   * order rows chronologically and rebuild a clean 0..n-1 index.
df = (
    pd.read_csv(df_path)
    .sample(n=19_000, random_state=0)
    .assign(
        store=lambda frame: frame['store'].astype(str),
        item=lambda frame: frame['item'].astype(str),
        date=lambda frame: pd.to_datetime(frame['date']),
    )
    .sort_values('date')
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [4]:
# Chronological hold-out: rows dated before the cutoff are used for training,
# rows on or after it are kept for evaluation.
split_date = '2017-01-01'
train = df.loc[df['date'] < split_date]
test = df.loc[df['date'] >= split_date]

In [5]:
# Separate the prediction target (`sales`) from the remaining feature columns
# for both the training and the evaluation partitions.
target_col = 'sales'
train_features, train_target = train.drop(columns=[target_col]), train[target_col]
test_features, test_target = test.drop(columns=[target_col]), test[target_col]

Feature Engineering

In [6]:
# Feature enrichment: configure the enricher and run the feature search.
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

# `date` is the only column declared as a search key.
date_search_keys = {'date': SearchKey.DATE}

enricher = FeaturesEnricher(
    search_keys=date_search_keys,
    cv=CVType.time_series,  # presumably time-ordered cross-validation; confirm in upgini docs
)

# Fit on the training partition; the held-out partition is supplied as the
# single evaluation set.
enricher.fit(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
)

        


Detected task type: ModelTaskType.REGRESSION


Column name,Status,Description
date,All valid,All values in this column are good to go
target,All valid,All values in this column are good to go


Running search request with search_id=8b034663-ce4e-493a-a947-5d27fa07f91c
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
22 relevant feature(s) found with the search keys: ['date'][0m


Unnamed: 0,feature_name,shap_value,coverage %,type
0,item,0.48876,100.0,CHARACTER
1,store,0.172255,100.0,CHARACTER
2,f_weather_pca_0_94efd18d,0.105247,100.0,NUMERIC
3,f_week_sin1_a71d22f6,0.047612,100.0,NUMERIC
4,f_week_cos1_d3d56d7f,0.029368,100.0,NUMERIC
5,f_year_cos1_cd165f8c,0.023262,100.0,NUMERIC
6,f_dow_jones_89547e1d,0.008943,100.0,NUMERIC
7,f_silver_d4264cf9,0.004856,100.0,NUMERIC
8,f_silver_7d_to_1y_82c4ef86,0.004065,100.0,NUMERIC
9,f_finance_umap_3_424d51ca,0.003683,100.0,NUMERIC


In [9]:
# Define the model and compare its score with and without the enriched features.
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

# Quiet, seeded regressor that does not write training artefacts to disk.
model = CatBoostRegressor(
    verbose=False,
    allow_writing_files=False,
    random_state=0,
)

# Score the estimator on the training data and on the held-out evaluation set,
# using mean absolute percentage error as the metric.
enricher.calculate_metrics(
    train_features,
    train_target,
    eval_set=[(test_features, test_target)],
    estimator=model,
    scoring='mean_absolute_percentage_error',
)

Start calculating metrics
Done


Unnamed: 0,match_rate,baseline mean_absolute_percentage_error,enriched mean_absolute_percentage_error,uplift
,,,,
train,100.0,0.255844,0.167287,0.088557
eval 1,100.0,0.243877,0.130172,0.113705


In [10]:
# Build the enriched training and evaluation feature sets (training first, then
# evaluation), keeping the original input columns alongside the new ones.
enriched_train_features, enriched_test_features = (
    enricher.transform(features, keep_input=True)
    for features in (train_features, test_features)
)
enriched_train_features.head()

90.39637% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=520c849f-5635-468f-a5cc-5a995d1234b4
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Executing transform step
Done
90.36176% of the rows are fully duplicated


Column name,Status,Description
date,All valid,All values in this column are good to go


Running search request with search_id=2fa02951-b0d7-4a9d-88ee-efc3333ab95b
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Executing transform step
Done


Unnamed: 0,date,store,item,f_weather_pca_0_94efd18d,f_week_sin1_a71d22f6,f_week_cos1_d3d56d7f,f_year_cos1_cd165f8c,f_dow_jones_89547e1d,f_silver_d4264cf9,f_silver_7d_to_1y_82c4ef86,...,f_italy_game_cnt_9cfcfe65,f_silver_7d_to_7d_1y_shift_ccbd2abf,f_dow_jones_7d_to_7d_1y_shift_9628c89b,f_cbpol_umap_1_34dc2149,f_cbpol_pca_3_2e94b9bf,f_weather_umap_30_98fa4f7d,f_nasdaq_d309709a,f_payment_fraud_score_b6a9c12e,f_transaction_fraud_union_score_c1a2808b,f_weather_umap_11_c213a9d7
0,2013-01-01,7,5,28.661328,0.781831,0.62349,0.98522,13104.139648,30.173,0.963323,...,0,1.072025,1.065267,4.815701,-0.323471,3.547175,3019.51001,0.10595,0.057741,7.594507
1,2013-01-01,4,9,28.661328,0.781831,0.62349,0.98522,13104.139648,30.173,0.963323,...,0,1.072025,1.065267,4.815701,-0.323471,3.547175,3019.51001,0.10595,0.057741,7.594507
2,2013-01-01,1,33,28.661328,0.781831,0.62349,0.98522,13104.139648,30.173,0.963323,...,0,1.072025,1.065267,4.815701,-0.323471,3.547175,3019.51001,0.10595,0.057741,7.594507
3,2013-01-01,3,41,28.661328,0.781831,0.62349,0.98522,13104.139648,30.173,0.963323,...,0,1.072025,1.065267,4.815701,-0.323471,3.547175,3019.51001,0.10595,0.057741,7.594507
4,2013-01-01,5,24,28.661328,0.781831,0.62349,0.98522,13104.139648,30.173,0.963323,...,0,1.072025,1.065267,4.815701,-0.323471,3.547175,3019.51001,0.10595,0.057741,7.594507


In [13]:
# Baseline: fit on the original (non-enriched) features, predict the held-out
# period and report SMAPE.
pred = model.fit(train_features, train_target).predict(test_features)
eval_metric(label=test_target.values, approx=pred, metric='SMAPE')


[37.65141857448004]

In [14]:
# Enriched: refit the same estimator on the enriched features, predict the
# held-out period and report SMAPE.
enriched_pred = model.fit(enriched_train_features, train_target).predict(enriched_test_features)
eval_metric(label=test_target.values, approx=enriched_pred, metric='SMAPE')

[14.6691661535934]