In [190]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle


In [191]:
df_original = pd.read_csv("training_dataset_2023-09-24.csv")

In [192]:
df_original

Unnamed: 0,id,category,store,2023-07-30,2023-08-15,2023-08-28,2023-09-09,2023-09-24
0,70876,LAPTOP,Anhoch,10580.0,10580.0,10580.0,10580.0,10580.0
1,70877,LAPTOP,Anhoch,11980.0,11980.0,11980.0,11980.0,11980.0
2,70878,LAPTOP,Anhoch,11980.0,11980.0,11980.0,11980.0,11980.0
3,70879,LAPTOP,Anhoch,12580.0,12580.0,12580.0,12580.0,12580.0
4,70881,LAPTOP,Anhoch,12980.0,10480.0,10480.0,10480.0,10480.0
...,...,...,...,...,...,...,...,...
3528,75158,FREEZER,TehnoMarket,22999.0,22999.0,22999.0,22999.0,22999.0
3529,75159,FREEZER,TehnoMarket,23999.0,23999.0,23999.0,23999.0,23999.0
3530,75160,FREEZER,TehnoMarket,27999.0,27999.0,27999.0,27999.0,27999.0
3531,75161,FREEZER,TehnoMarket,28999.0,28999.0,28999.0,28999.0,28999.0


In [195]:
df=df_original.copy()

In [197]:
df.dropna(inplace=True)

In [199]:
df['2023-07-30']=df['2023-07-30'].astype(int)

In [200]:
df

Unnamed: 0,id,category,store,2023-07-30,2023-08-15,2023-08-28,2023-09-09,2023-09-24
0,70876,LAPTOP,Anhoch,10580,10580.0,10580.0,10580.0,10580.0
1,70877,LAPTOP,Anhoch,11980,11980.0,11980.0,11980.0,11980.0
2,70878,LAPTOP,Anhoch,11980,11980.0,11980.0,11980.0,11980.0
3,70879,LAPTOP,Anhoch,12580,12580.0,12580.0,12580.0,12580.0
4,70881,LAPTOP,Anhoch,12980,10480.0,10480.0,10480.0,10480.0
...,...,...,...,...,...,...,...,...
3528,75158,FREEZER,TehnoMarket,22999,22999.0,22999.0,22999.0,22999.0
3529,75159,FREEZER,TehnoMarket,23999,23999.0,23999.0,23999.0,23999.0
3530,75160,FREEZER,TehnoMarket,27999,27999.0,27999.0,27999.0,27999.0
3531,75161,FREEZER,TehnoMarket,28999,28999.0,28999.0,28999.0,28999.0


In [194]:
df.drop(columns=['id'],inplace=True)

In [93]:
category_encoder = OneHotEncoder(sparse_output=False)
store_encoder = OneHotEncoder(sparse_output=False)

category_encoder.fit(df[['category']])
store_encoder.fit(df[['store']])

# category_encoded = category_encoder.transform(df[['category']]).astype(int)
# store_encoded = store_encoder.transform(df[['store']]).astype(int)

In [90]:
with open('category_encoder.pkl', 'wb') as file:
    pickle.dump(category_encoder, file)
with open('store_encoder.pkl', 'wb') as file:
    pickle.dump(store_encoder, file)

In [94]:
def encode_dataset(df_original, number_of_prices=5):
    df=df_original.copy()
    df.drop(columns=['id'],inplace=True)
    category_encoded_df = pd.DataFrame(category_encoded, columns=category_encoder.get_feature_names_out(['category']))
    store_encoded_df = pd.DataFrame(store_encoded, columns=store_encoder.get_feature_names_out(['store']))
    df_encoded = pd.concat([df, category_encoded_df, store_encoded_df], axis=1)
    df_encoded = df_encoded.drop(['category', 'store'], axis=1)

    price_columns=df_encoded.columns[0:number_of_prices]
    for price in price_columns:
        df_encoded[price]=df_encoded[price].astype(int)
    new_price_columns=dict()
    for i, price in enumerate(price_columns):
        if i == number_of_prices-1:
            new_price_columns[price]=f"target"
        else:
            new_price_columns[price]=f"price{i+1}"

    df_encoded.rename(columns=new_price_columns, inplace=True)
    columns=df_encoded.columns
    desired_order = columns[number_of_prices:].tolist() +columns[0:number_of_prices-1].tolist() + [columns[number_of_prices-1]]
    df_encoded=df_encoded[desired_order]
    df_encoded['difference']=df_encoded['target'] - df_encoded[f'price{number_of_prices-1}']
    df_encoded['output'] = df_encoded['difference'] / df_encoded[f'price{number_of_prices - 1}']    
    df_encoded['output_label'] = df_encoded['output'].apply(lambda x: 'UP' if x >= 0.05 else 'DOWN' if x <= -0.05 else 'SAME')

    return df_encoded

In [48]:
def split_dataset(df_original):
    df=df_original.copy()
    train_data, test_validation_data = train_test_split(df, test_size=0.4, shuffle=True)
    test_data, validation_data = train_test_split(test_validation_data, test_size=0.5, shuffle=True)

    x=df_encoded.columns[0:-4]
    y=[df_encoded.columns[-4]]

    X_train = train_data[x]
    y_train = train_data[y]
    X_test = test_data[x]
    y_test = test_data[y]
    X_validation = validation_data[x]
    y_validation = validation_data[y]
    accuracy_test=validation_data[['output_label']]

    return train_data, test_data, validation_data, X_train, y_train, X_test, y_test, X_validation, y_validation, accuracy_test

In [95]:
df_encoded= encode_dataset(df_original)

In [96]:
df_encoded

Unnamed: 0,category_AC,category_CPU,category_FREEZER,category_FRIDGE,category_GPU,category_LAPTOP,category_PHONE,category_TV,store_Anhoch,store_DDStore,...,store_Setec,store_TehnoMarket,price1,price2,price3,price4,target,difference,output,output_label
0,0,0,0,0,0,1,0,0,1,0,...,0,0,10580,10580,10580,10580,10580,0,0.0,SAME
1,0,0,0,0,0,1,0,0,1,0,...,0,0,11980,11980,11980,11980,11980,0,0.0,SAME
2,0,0,0,0,0,1,0,0,1,0,...,0,0,11980,11980,11980,11980,11980,0,0.0,SAME
3,0,0,0,0,0,1,0,0,1,0,...,0,0,12580,12580,12580,12580,12580,0,0.0,SAME
4,0,0,0,0,0,1,0,0,1,0,...,0,0,12980,10480,10480,10480,10480,0,0.0,SAME
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,1,0,0,0,0,0,0,0,...,0,1,22999,22999,22999,22999,22999,0,0.0,SAME
3529,0,0,1,0,0,0,0,0,0,0,...,0,1,23999,23999,23999,23999,23999,0,0.0,SAME
3530,0,0,1,0,0,0,0,0,0,0,...,0,1,27999,27999,27999,27999,27999,0,0.0,SAME
3531,0,0,1,0,0,0,0,0,0,0,...,0,1,28999,28999,28999,28999,28999,0,0.0,SAME


In [51]:
train_data, test_data, validation_data, X_train, y_train, X_test, y_test, X_validation, y_validation, accuracy_test = split_dataset(df_encoded)

## Models

### Random Forest Tree Regressor

In [52]:
X_train_combined = pd.concat([X_train, X_validation], axis=0)
y_train_combined = np.concatenate([y_train, y_validation])

In [53]:
model = RandomForestRegressor()
model.fit(X_train_combined, y_train_combined)

  return fit_method(estimator, *args, **kwargs)


In [54]:
predictions = model.predict(X_test)

In [73]:
def process_prediction_output_pseudo(predictions, y_test, test_data, number_of_prices):
    predictions_df = pd.DataFrame(predictions, columns=['predicted_target'])
    predictions_df['predicted_target']=predictions_df['predicted_target'].astype(int)
    evaluation_validation=test_data[['target','output_label',f'price{number_of_prices-1}', 'output','difference']]
    evaluation_validation.reset_index(drop=True, inplace=True)
    predictions_df=pd.concat([predictions_df,evaluation_validation],axis=1)
    predictions_df['predicted_difference']=predictions_df['predicted_target'] - predictions_df[f'price{number_of_prices-1}']
    predictions_df['predicted_output'] = predictions_df['predicted_difference'] / predictions_df[f'price{number_of_prices - 1}']    
    predictions_df['predicted_output_label'] = predictions_df['predicted_output'].apply(lambda x: 'UP' if x >= 0.05 else 'DOWN' if x <= -0.05 else 'SAME')
    return predictions_df

In [74]:
predictions_df=process_prediction_output_pseudo(predictions, y_test, test_data, 5)

In [75]:
predictions_df

Unnamed: 0,predicted_target,target,output_label,price4,output,difference,predicted_difference,predicted_output,predicted_output_label
0,9184,8980,DOWN,9980,-0.10020,-1000,-796,-0.079760,DOWN
1,56989,55980,SAME,56980,-0.01755,-1000,9,0.000158,SAME
2,26049,25999,SAME,25999,0.00000,0,50,0.001923,SAME
3,35990,35980,SAME,35980,0.00000,0,10,0.000278,SAME
4,24034,23999,SAME,23999,0.00000,0,35,0.001458,SAME
...,...,...,...,...,...,...,...,...,...
702,9995,9999,SAME,9999,0.00000,0,-4,-0.000400,SAME
703,9095,9110,SAME,9110,0.00000,0,-15,-0.001647,SAME
704,21987,21999,SAME,21999,0.00000,0,-12,-0.000545,SAME
705,105068,104999,SAME,104999,0.00000,0,69,0.000657,SAME


In [77]:
all=predictions_df.shape[0]
correct=predictions_df[predictions_df['output_label'] == predictions_df['predicted_output_label']].shape[0]

In [80]:
correct/all*100

97.02970297029702

In [98]:
df_encoded

Unnamed: 0,category_AC,category_CPU,category_FREEZER,category_FRIDGE,category_GPU,category_LAPTOP,category_PHONE,category_TV,store_Anhoch,store_DDStore,...,store_Setec,store_TehnoMarket,price1,price2,price3,price4,target,difference,output,output_label
0,0,0,0,0,0,1,0,0,1,0,...,0,0,10580,10580,10580,10580,10580,0,0.0,SAME
1,0,0,0,0,0,1,0,0,1,0,...,0,0,11980,11980,11980,11980,11980,0,0.0,SAME
2,0,0,0,0,0,1,0,0,1,0,...,0,0,11980,11980,11980,11980,11980,0,0.0,SAME
3,0,0,0,0,0,1,0,0,1,0,...,0,0,12580,12580,12580,12580,12580,0,0.0,SAME
4,0,0,0,0,0,1,0,0,1,0,...,0,0,12980,10480,10480,10480,10480,0,0.0,SAME
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,1,0,0,0,0,0,0,0,...,0,1,22999,22999,22999,22999,22999,0,0.0,SAME
3529,0,0,1,0,0,0,0,0,0,0,...,0,1,23999,23999,23999,23999,23999,0,0.0,SAME
3530,0,0,1,0,0,0,0,0,0,0,...,0,1,27999,27999,27999,27999,27999,0,0.0,SAME
3531,0,0,1,0,0,0,0,0,0,0,...,0,1,28999,28999,28999,28999,28999,0,0.0,SAME


In [114]:
df_predictions = df_encoded.copy()

In [115]:
number_of_dates = 5

In [116]:
for i in range(1,number_of_dates-1):
    df_predictions[f'price{i}']=df_predictions[f'price{i+1}']
df_predictions[f'price{number_of_dates-1}']=df_predictions['target']
df_predictions.drop(columns=["target",'difference','output','output_label'], inplace = True)

In [117]:
df_predictions

Unnamed: 0,category_AC,category_CPU,category_FREEZER,category_FRIDGE,category_GPU,category_LAPTOP,category_PHONE,category_TV,store_Anhoch,store_DDStore,store_EKupi,store_Neptun,store_Setec,store_TehnoMarket,price1,price2,price3,price4
0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10580,10580,10580,10580
1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980
2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,12580,12580,12580,12580
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10480,10480,10480,10480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,1,0,0,0,0,0,0,0,0,0,0,1,22999,22999,22999,22999
3529,0,0,1,0,0,0,0,0,0,0,0,0,0,1,23999,23999,23999,23999
3530,0,0,1,0,0,0,0,0,0,0,0,0,0,1,27999,27999,27999,27999
3531,0,0,1,0,0,0,0,0,0,0,0,0,0,1,28999,28999,28999,28999


In [137]:
def _encode_dataset_for_making_predictions(df_original, number_of_prices):
    df = df_original.copy()
    category_encoded = category_encoder.transform(df[['category']]).astype(int)
    store_encoded = store_encoder.transform(df[['store']]).astype(int)
    category_encoded_df = pd.DataFrame(category_encoded, columns=category_encoder.get_feature_names_out(['category']))
    store_encoded_df = pd.DataFrame(store_encoded, columns=store_encoder.get_feature_names_out(['store']))
    df_encoded = pd.concat([df, category_encoded_df, store_encoded_df], axis=1)
    df_encoded = df_encoded.drop(['category', 'store'], axis=1)

    price_columns = df_encoded.columns[1:number_of_prices+1]
    for price in price_columns:
        df_encoded[price] = df_encoded[price].astype(int)
    new_price_columns = dict()
    for i, price in enumerate(price_columns):
        new_price_columns[price] = f"price{i + 1}"
            
    df_encoded.rename(columns=new_price_columns, inplace=True)
    columns = df_encoded.columns
    print(columns)
    desired_order = columns[number_of_prices+1:].tolist() + columns[1:number_of_prices +1].tolist() + [
        columns[0]]
    df_encoded = df_encoded[desired_order]

    return df_encoded

In [179]:
df_test_encoding= df_original.copy()
df_test_encoding.drop(columns=['2023-07-30'],inplace=True)
a=_encode_dataset_for_making_predictions(df_test_encoding,4)

Index(['id', 'price1', 'price2', 'price3', 'price4', 'category_AC',
       'category_CPU', 'category_FREEZER', 'category_FRIDGE', 'category_GPU',
       'category_LAPTOP', 'category_PHONE', 'category_TV', 'store_Anhoch',
       'store_DDStore', 'store_EKupi', 'store_Neptun', 'store_Setec',
       'store_TehnoMarket'],
      dtype='object')


In [169]:
a

Unnamed: 0,category_AC,category_CPU,category_FREEZER,category_FRIDGE,category_GPU,category_LAPTOP,category_PHONE,category_TV,store_Anhoch,store_DDStore,store_EKupi,store_Neptun,store_Setec,store_TehnoMarket,price1,price2,price3,price4,id
0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10580,10580,10580,10580,65675
1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980,65676
2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980,65677
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,12580,12580,12580,12580,65678
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10480,10480,10480,10480,65680
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,1,0,0,0,0,0,0,0,0,0,0,1,22999,22999,22999,22999,69957
3529,0,0,1,0,0,0,0,0,0,0,0,0,0,1,23999,23999,23999,23999,69958
3530,0,0,1,0,0,0,0,0,0,0,0,0,0,1,27999,27999,27999,27999,69959
3531,0,0,1,0,0,0,0,0,0,0,0,0,0,1,28999,28999,28999,28999,69960


In [181]:
a.iloc[:, :-1]

Unnamed: 0,category_AC,category_CPU,category_FREEZER,category_FRIDGE,category_GPU,category_LAPTOP,category_PHONE,category_TV,store_Anhoch,store_DDStore,store_EKupi,store_Neptun,store_Setec,store_TehnoMarket,price1,price2,price3,price4
0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10580,10580,10580,10580
1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980
2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,11980,11980,11980,11980
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,12580,12580,12580,12580
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,10480,10480,10480,10480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,1,0,0,0,0,0,0,0,0,0,0,1,22999,22999,22999,22999
3529,0,0,1,0,0,0,0,0,0,0,0,0,0,1,23999,23999,23999,23999
3530,0,0,1,0,0,0,0,0,0,0,0,0,0,1,27999,27999,27999,27999
3531,0,0,1,0,0,0,0,0,0,0,0,0,0,1,28999,28999,28999,28999


In [182]:
b=model.predict(a.iloc[:, :-1])

In [187]:
number_of_prices=5
a['predicted_target']=b.astype(int)


In [188]:
a['predicted_difference']=a['predicted_target'] - a[f'price{number_of_prices-1}']
a['predicted_output'] = a['predicted_difference'] / a[f'price{number_of_prices - 1}'] 
a['predicted_output_label'] = a['predicted_output'].apply(lambda x: 'UP' if x >= 0.05 else 'DOWN' if x <= -0.05 else 'SAME')

In [189]:
a[['id','price4','predicted_target','predicted_difference', 'predicted_output', 'predicted_output_label']]

Unnamed: 0,id,price4,predicted_target,predicted_difference,predicted_output,predicted_output_label
0,65675,10580,10595,15,0.001418,SAME
1,65676,11980,11802,-178,-0.014858,SAME
2,65677,11980,11802,-178,-0.014858,SAME
3,65678,12580,12577,-3,-0.000238,SAME
4,65680,10480,10464,-16,-0.001527,SAME
...,...,...,...,...,...,...
3528,69957,22999,22999,0,0.000000,SAME
3529,69958,23999,23999,0,0.000000,SAME
3530,69959,27999,27999,0,0.000000,SAME
3531,69960,28999,28999,0,0.000000,SAME


In [177]:
a.iloc[:,10:]

Unnamed: 0,store_EKupi,store_Neptun,store_Setec,store_TehnoMarket,price1,price2,price3,price4,id,predicted_target,predicted_difference,predicted_output,predicted_output_label
0,0,0,0,0,10580,10580,10580,10580,65675,10595.33,15.33,0.001449,SAME
1,0,0,0,0,11980,11980,11980,11980,65676,11802.42,-177.58,-0.014823,SAME
2,0,0,0,0,11980,11980,11980,11980,65677,11802.42,-177.58,-0.014823,SAME
3,0,0,0,0,12580,12580,12580,12580,65678,12577.65,-2.35,-0.000187,SAME
4,0,0,0,0,10480,10480,10480,10480,65680,10464.63,-15.37,-0.001467,SAME
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3528,0,0,0,1,22999,22999,22999,22999,69957,22999.00,0.00,0.000000,SAME
3529,0,0,0,1,23999,23999,23999,23999,69958,23999.00,0.00,0.000000,SAME
3530,0,0,0,1,27999,27999,27999,27999,69959,27999.00,0.00,0.000000,SAME
3531,0,0,0,1,28999,28999,28999,28999,69960,28999.00,0.00,0.000000,SAME
