In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer
from genetic import GeneticPrograming 
import copy
import matplotlib.pyplot as plt

In [2]:
# Read the training and test tables from the local ./data folder.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [3]:
# Show the first five rows of the raw training table.
train_df.head(5)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240


In [4]:
# Optional overview: num_sold over time, one line per country.
# The plot is switched off; change the condition to True to draw it.
if False:
    sns.lineplot(x="date", y="num_sold", hue="country", data=train_df)

In [5]:
# The plot shows a trend over time; to account for this, all data will be 'normalized'.

In [6]:
# Parse the 'YYYY-MM-DD' strings into datetimes, then expose the calendar
# components as separate columns named after the `.dt` attribute they come from.
train_df['date'] = pd.to_datetime(train_df['date'], format='%Y-%m-%d')
date_parts = train_df['date'].dt
for part in ('year', 'month', 'day', 'day_of_week', 'day_of_year'):
    train_df[part] = getattr(date_parts, part)

In [7]:
# Show the first five rows again, now including the derived calendar columns.
train_df.head(5)

Unnamed: 0,row_id,date,country,store,product,num_sold,year,month,day,day_of_week,day_of_year
0,0,2017-01-01,Belgium,KaggleMart,Kaggle Advanced Techniques,663,2017,1,1,6,1
1,1,2017-01-01,Belgium,KaggleMart,Kaggle Getting Started,615,2017,1,1,6,1
2,2,2017-01-01,Belgium,KaggleMart,Kaggle Recipe Book,480,2017,1,1,6,1
3,3,2017-01-01,Belgium,KaggleMart,Kaggle for Kids: One Smart Goose,710,2017,1,1,6,1
4,4,2017-01-01,Belgium,KaggleRama,Kaggle Advanced Techniques,240,2017,1,1,6,1


In [8]:
# Align the pre-2020 sales of every (country, store) pair with the 2020 sales
# distribution of the same store:
#   1. fit a quantile transformer on the pair's pre-2020 sales and map them to
#      normal scores;
#   2. map those scores back through the store's 2020 reference transformer.

# Reference transformers, one per store, fitted on that store's 2020 sales.
qtO = QuantileTransformer(n_quantiles=100, random_state=0,output_distribution='normal')
qtO.fit(train_df.num_sold.loc[(train_df.year == 2020) & (train_df.store == 'KaggleMart') ].values.reshape(-1, 1))

qtO2 = QuantileTransformer(n_quantiles=100, random_state=0,output_distribution='normal')
qtO2.fit(train_df.num_sold.loc[(train_df.year == 2020) & (train_df.store == 'KaggleRama') ].values.reshape(-1, 1))

# inverse_transform returns non-integer floats. Make the target column float
# up front so the .loc assignments below do not rely on pandas silently
# upcasting an integer column (deprecated in recent pandas releases).
train_df['num_sold'] = train_df['num_sold'].astype(float)

reference_by_store = {'KaggleMart': qtO, 'KaggleRama': qtO2}
is_history = train_df.year < 2020  # rows to remap; 2020 itself stays untouched

for country in train_df.country.unique():
    for store, reference in reference_by_store.items():
        # Build the row mask once and reuse it for both the read and the write.
        rows = is_history & (train_df.country == country) & (train_df.store == store)

        qt = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
        y_mod = qt.fit_transform(train_df.loc[rows, 'num_sold'].values.reshape(-1, 1))

        # ravel(): the transformer returns shape (n, 1); assign a flat vector
        # to the single selected column.
        train_df.loc[rows, 'num_sold'] = reference.inverse_transform(y_mod).ravel()

In [9]:
# Optional check of num_sold over time per country, using train_df as it
# stands at this point. Switched off; change the condition to True to draw it.
if False:
    sns.lineplot(x="date", y="num_sold", hue="country", data=train_df)

# boosting machine with GP

In [10]:
def smape(y,y_hat):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Each term is ``|y_hat - y| / ((|y| + |y_hat|) / 2)``; the terms are summed,
    divided by ``len(y)`` and multiplied by 100.

    Parameters
    ----------
    y : array-like
        Actual values.
    y_hat : array-like
        Predicted values; must broadcast against ``y``.

    Returns
    -------
    float
        SMAPE in the range [0, 200]. Terms where both the actual and the
        predicted value are zero count as zero error instead of producing
        NaN. An empty input yields NaN.
    """
    y = np.asarray(y, dtype=float)
    y_hat = np.asarray(y_hat, dtype=float)

    num = np.abs(y_hat - y)
    den = (np.abs(y) + np.abs(y_hat)) / 2

    # den == 0 only when y and y_hat are both 0, i.e. a perfect prediction.
    # Leave those terms at 0 rather than evaluating 0/0.
    ratio = np.divide(num, den, out=np.zeros_like(num), where=den != 0)

    return (np.sum(ratio) / len(y)) * 100

## generating input vars

In [None]:
# Columns used as model inputs, in the order they are collected later on.
nomes = ['day_of_year', 'day_of_week', 'day', 'month', 'store', 'product', 'country']

# Scale every calendar feature by a fixed divisor.
# NOTE: train_df is modified in place, so running this cell a second time
# divides the already-scaled columns again.
for col, divisor in (('day_of_year', 366), ('day_of_week', 6), ('day', 31), ('month', 12)):
    train_df[col] = train_df[col] / divisor

# Replace each categorical column by its integer category code.
# 'store' keeps the raw code; 'product' and 'country' are divided by 5 and 6.
for col, divisor in (('store', None), ('product', 5), ('country', 6)):
    codes = train_df[col].astype("category").cat.codes
    train_df[col] = codes if divisor is None else codes / divisor


# Hold-out split by calendar year: everything up to 2019 is used for fitting,
# 2020 is used for validation.
train_data = train_df.loc[train_df.year <= 2019]
test_data = train_df.loc[train_df.year == 2020]

# Best scores accepted so far (training / validation). Despite the `mse_`
# prefix, the boosting loop stores SMAPE values here; 1000 is just a large
# starting value.
mse_model = mse_model_val = 1000

# Learning rate applied to each boosting step.
LR = 1

# Target transform, fitted on the training years only:
# centre on the mean, shift to start at 0, scale to [0, 1], then log(1 + x).
sold_train = train_data.num_sold.values
mm = np.mean(sold_train)
centered = sold_train - mm
mmin = np.min(centered)
shifted = centered - mmin
mmax = np.max(shifted)
yt = np.log(shifted / mmax + 1)

# `yt` is overwritten with residuals during boosting; keep the untouched target.
orig_yt = copy.copy(yt)

# Apply the same transform (training statistics) to the validation target.
yv = np.log(((test_data.num_sold.values - mm) - mmin) / mmax + 1)
orig_yv = copy.copy(yv)

# NumPy functions handed to GeneticPrograming.
# Other candidates, currently left out: np.heaviside, np.ldexp, np.power, np.divide
func = [
    np.add, np.subtract, np.multiply,
    np.sin, np.hypot, np.arctan2,
    np.logaddexp, np.minimum, np.maximum,
]

# One array per input column, in `nomes` order: training rows and validation rows.
v = [train_data[var_nome].values for var_nome in nomes]
v_validation = [test_data[var_nome].values for var_nome in nomes]

# Numeric values and depth setting passed to GeneticPrograming
# (presumably the constant pool and maximum tree depth -- confirm in `genetic`).
c = [-2, -0.5, -0.1, 0.1, 0.5, 2, np.pi]
maxDeep = 10

# Running sum of the accepted boosting steps (training / validation).
y_hat_reg = 0
y_hat_val = 0

# Boosting: each round fits a new GP to the current residual `yt` and keeps its
# contribution only if SMAPE improves on BOTH the training and validation sets.
# (Disabled experiments: a random per-round subset of the input variables, and
# a decaying learning rate `LR = LR*0.99`.)
for n in range(50):
    gclass = GeneticPrograming(2, maxDeep, v, func, c, yt)
    gclass.run(50, 300)

    # Candidate that sorts first by `solEval`.
    best = np.argsort(gclass.solEval)[0]
    best_tree = gclass.candidates[best]

    y_hat = gclass.evaluateTree(best_tree)
    y_val = gclass.evaluateTreeInput(best_tree, v_validation)

    # Contribution of this round, scaled by the learning rate.
    step_train = y_hat * LR
    step_val = y_val * LR

    # Score of the ensemble if this round were accepted (SMAPE, in the
    # transformed target space).
    mse_boost = smape(orig_yt, y_hat_reg + step_train)
    mse_boost_val = smape(orig_yv, y_hat_val + step_val)

    improved = (mse_model > mse_boost) and (mse_model_val > mse_boost_val)
    if not improved:
        print('The model did not improve does nothing. ','Iteration :', n)
        print( mse_boost,'-', mse_model)
        print( mse_boost_val,'>', mse_model_val)
        continue

    print('The model improved update. ','Iteration :', n)

    # Accept the round on the training side: new best score, new residual,
    # new accumulated prediction.
    mse_model = copy.copy(mse_boost)
    yt = yt - step_train
    y_hat_reg = y_hat_reg + step_train
    print('MAPE boost: ',mse_boost )

    # Same bookkeeping for the validation side.
    mse_model_val = copy.copy(mse_boost_val)
    y_hat_val = y_hat_val + step_val
    print('MAPE boost val: ',mse_boost_val )

# Actual sales (solid) against the boosted prediction (dashed), first for the
# training years and then for the validation year. The prediction is mapped
# back to the original scale by inverting the target transform:
# exp(.) - 1, times mmax, plus mmin, plus mm.
for split_df, boosted in ((train_data, y_hat_reg), (test_data, y_hat_val)):
    restored = ((np.exp(boosted) - 1) * mmax + mmin) + mm
    plt.plot(split_df.num_sold.values)
    plt.ylabel('target')
    plt.plot(restored, '--')
    plt.show()

Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 0
MAPE boost:  32.59542905746964
MAPE boost val:  32.581474278779254
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 1
MAPE boost:  31.7763335804461
MAPE boost val:  31.163712967528795
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 2
MAPE boost:  25.313530281881853
MAPE boost val:  25.534012238963687
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 3
MAPE boost:  23.615025772505703
MAPE boost val:  23.796443312555084
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 4
MAPE boost:  22.512531188656933
MAPE boost val:  22.933409489240265
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 5
MAPE boost:  21.50774640151471
MAPE boost val:  22.100247371887438
Generation 0
Generation 100
Generation 200
The model improved update.  Iteration : 6
M

In [None]:
# Validation SMAPE in the original sales units.
# NOTE(review): the 1.03 factor is an unexplained multiplicative adjustment of
# the prediction -- document where it comes from.
val_pred_original_units = ((np.exp(y_hat_val)-1)*mmax+mmin) + mm
smape(test_data.num_sold.values, val_pred_original_units*1.03)

In [None]:
#11.573780118140295