In [11]:
import pandas as pd
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
import datetime as dt
import numpy as np
import plotly.express as px
from pmdarima import auto_arima

In [4]:
def add_features(data):
    dates = pd.to_datetime(data["Date"])
    data["Months"] = (dates.dt.month - 6)/12
    data["Days"] = (dates.dt.isocalendar().day - 15)/30
    data["Week"] = (dates.dt.isocalendar().week - 26)/52
    data["Day of week"] = (dates.dt.dayofweek - 3.5)/7
    # Number of days after 30 December 2020
    data["Index"] = (dates - dt.datetime(2020, 12, 30)).dt.days
    return data

def preprocess(initial_data, holiday, level):
    data = initial_data.groupby(['Date'])\
                       .sum()\
                       .dropna()\
                       .reset_index()

    data['Date'] = pd.to_datetime(data['Date'])
    final_data = data[['Date','Total']]
    final_data = add_features(final_data)

    if holiday is not None:
        final_data['Total'] *= np.mean(holiday)/level

    date = final_data['Date'].max()
    return final_data, date

def train_arima(train_data):
    model = auto_arima(train_data['Total'],
                       start_p=1, start_q=1,
                       max_p=5, max_q=5,
                       start_P=0, seasonal=False,
                       d=1, D=1, trace=True,
                       error_action='ignore',  
                       suppress_warnings=True)
    model.fit(train_data['Total'])
    return model

def forecast(model, n_periods):
    predictions = model.predict(n_periods=n_periods)
    return np.array(predictions)

def train_xgboost(train_data):    
    y = train_data['Total']
    X = train_data.drop(['Total','Date'], axis=1)
    
    model = GradientBoostingRegressor()
    model.fit(X,y)
    return model

def forecast_xgboost(model, date, n_periods):
    dates = pd.to_datetime([date + dt.timedelta(days=i)
                            for i in range(n_periods)])
    X = add_features(pd.DataFrame({"Date":dates}))
    X.drop('Date', axis=1, inplace=True)
    predictions = model.predict(X)
    return predictions

def concat(final_data, predictions_arima, predictions_xgboost):
    date = final_data['Date'].max()

    def  _convert_predictions(final_data, predictions, date, label='Predictions'):
        dates = pd.to_datetime([date + dt.timedelta(days=i)
                                for i in range(len(predictions))])
        final_data['Date'] = pd.to_datetime(final_data['Date'])
        final_data = final_data[['Date','Total']]
        predictions = pd.concat([pd.Series(dates, name="Date"),
                                 pd.Series(predictions, name=label)], axis=1)
        return final_data.merge(predictions, on="Date", how="outer")

    result_arima = _convert_predictions(final_data, predictions_arima, date, label='ARIMA')
    result_xgboost = _convert_predictions(final_data, predictions_xgboost, date, label='Xgboost')
    return result_arima.merge(result_xgboost, on=["Date", 'Total'], how="outer").sort_values(by='Date')


In [8]:
data = pd.read_csv('modified_supermarkt_sales_plus.csv')
data['Date'] = pd.to_datetime(data['Date'], format="%Y-%m-%d")
data['Month_Year'] = data['Date'].dt.to_period('M').dt.to_timestamp()

In [9]:
data

Unnamed: 0,Invoice_ID,Branch,City,Customer_type,Gender,Product_line,Unit_price,Quantity,Tax_5%,Total,Date,Time,Payment,cogs,gross_margin_percentage,gross_income,Rating,Latitude,Longitude,Month_Year
0,MCK-27-2THE,C,Luang Prabang,Normal,Male,Sports and travel,92.097991,9,41.444096,854.186630,2021-01-01,15:52,Cash,828.881919,4.761905,41.444096,4.532518,19.889271,102.133453,2021-01-01
1,GUK-86-3CQT,E,Chiang Mai,Normal,Male,Fashion accessories,74.612296,7,26.114304,552.450518,2021-01-01,10:28,Credit card,522.286073,4.761905,26.114304,4.693977,18.796143,98.979263,2021-01-01
2,EDS-29-SN03,A,Bangkok,Member,Female,Fashion accessories,28.097037,9,12.643667,266.949898,2021-01-01,14:15,Cash,252.873332,4.761905,12.643667,4.226045,13.736717,100.523186,2021-01-01
3,271-77-8740,C,Naypyitaw,Member,Female,Sports and travel,29.220000,6,8.766000,181.686822,2021-01-01,11:40,Ewallet,175.320000,4.761905,8.766000,5.000000,19.763300,96.078500,2021-01-01
4,770-42-8960,B,Mandalay,Normal,Male,Food and beverages,21.120000,8,8.448000,199.433878,2021-01-01,19:31,Cash,168.960000,4.761905,8.448000,6.300000,21.958800,96.089100,2021-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,YNW-00-F6PZ,E,Luang Prabang,Member,Female,Health and beauty,47.612592,5,11.903148,3275.393714,2021-12-31,13:22,Credit card,238.062958,4.761905,11.903148,8.740449,19.889271,102.133453,2021-12-01
5996,JMQ-95-64P5,C,Bangkok,Member,Male,Food and beverages,11.180501,9,5.031225,3082.655654,2021-12-31,12:52,Ewallet,100.624507,4.761905,5.031225,5.883139,13.736717,100.523186,2021-12-01
5997,EXJ-21-05TP,F,Vientiane,Member,Female,Home and lifestyle,78.538739,8,31.415496,3647.631424,2021-12-31,18:54,Ewallet,628.309915,4.761905,31.415496,5.256854,17.974855,102.630867,2021-12-01
5998,VMQ-54-QKLW,D,Chiang Mai,Member,Female,Electronic accessories,24.406809,3,3.661021,3081.850540,2021-12-31,12:55,Credit card,73.220428,4.761905,3.661021,1.196487,18.796143,98.979263,2021-12-01


In [13]:
def create_bar_figure(data, group_by):
    sales_over_time = data.groupby(group_by)['Total'].sum().reset_index()
    fig = px.bar(sales_over_time, x=group_by, y='Total', title=f'Sales Trends Over {group_by}', color='Total')
    return fig

def create_perc_fig(df, group_column):
    # Group, sum, and convert to percentage
    df = df.groupby(['Month_Year', group_column])['Total'].sum().unstack(fill_value=0)
    df = df.div(df.sum(axis=1), axis=0).reset_index().melt(id_vars='Month_Year', var_name=group_column, value_name='Percentage')
    df['Percentage'] = (df.loc[:, 'Percentage'].round(3) * 100)
    # Create and return the plot
    fig = px.bar(df, x='Month_Year', y='Percentage', color=group_column, title=f"Evolution of Sales by {group_column} over Time", labels={'Percentage': '% of Total'}, text_auto=True)
    return fig

fig_product_line = create_perc_fig(data, 'Product_line')
fig_city = create_perc_fig(data, 'City')
fig_gender = create_perc_fig(data, 'Gender')
fig_customer_type = create_perc_fig(data, 'Customer_type')


def on_change(state, var_name, var_value):
    if var_name in ['city', 'customer_type', 'gender']:
        data = state.data.loc[
            state.data["City"].isin(state.city)
            & state.data["Customer_type"].isin(state.customer_type)
            & state.data["Gender"].isin(state.gender), :
        ]

        state.fig_product_line = create_perc_fig(data, 'Product_line')
        state.fig_city = create_perc_fig(data, 'City')
        state.fig_gender = create_perc_fig(data, 'Gender')
        state.fig_customer_type = create_perc_fig(data, 'Customer_type')


def plot_total_sales_distribution(data):
    data['date'] = pd.to_datetime(data['Date'])
    data['year'] = data['date'].dt.year

    # Filter necessary columns for the plot
    filtered_data = data[['year', 'Total']]

    # Create the histogram plot
    fig = px.histogram(filtered_data, x='Total', color='year', barmode='overlay',
                       marginal='rug', histnorm='probability density',
                       labels={'Total': 'Total Sales', 'year': 'Year'})

    # Update layout for better visualization
    fig.update_layout(title='Distribution of Total Sales by Year',
                      xaxis_title='Total Sales',
                      yaxis_title='Frequency',
                      bargap=0.1)

    return fig

def plot_results_and_errors(initial_data, comparison_data, result):
    # Tracer les résultats avec Plotly Express
    fig = px.line(initial_data, x='Date', y='Total', title='Prévisions des ventes totales')
    fig.add_scatter(x=result['Date'], y=result['ARIMA'], mode='lines', name='Prévisions ARIMA', line=dict(color='red'))
    fig.add_scatter(x=result['Date'], y=result['Xgboost'], mode='lines', name='Prévisions XGBoost', line=dict(color='green'))

    # Comparer avec les données réelles
    comparison_data['Date'] = pd.to_datetime(comparison_data['Date'])
    comparison = pd.merge(result, comparison_data[['Date', 'Total']], on='Date', how='inner', suffixes=('', '_true'))

    # Calculer l'erreur de prédiction
    comparison['error_arima'] = comparison['Total'] - comparison['ARIMA']
    comparison['error_xgboost'] = comparison['Total'] - comparison['Xgboost']

    # Tracer les erreurs de prédiction avec Plotly Express
    fig_error = px.line(comparison, x='Date', y='error_arima', title='Erreur de prédiction au fil du temps')
    fig_error.add_scatter(x=comparison['Date'], y=comparison['error_xgboost'], mode='lines', name='Erreur XGBoost', line=dict(color='green'))
    return comparison
    #return fig, fig_error

# Charger les données
initial_data = pd.read_csv('modified_supermarkt_sales_plus.csv')
comparison_data = pd.read_csv('modified_supermarkt_sales_plus_four_years.csv')

# Préprocesser les données
final_data, last_date = preprocess(initial_data, holiday=None, level=None)

# Entraîner les modèles
arima_model = train_arima(final_data)
xgboost_model = train_xgboost(final_data)

# Faire des prévisions
predictions_arima = forecast(arima_model, n_periods=3*365)
predictions_xgboost = forecast_xgboost(xgboost_model, last_date, n_periods= 3*365)

# Concaténer les résultats
result = concat(final_data, predictions_arima, predictions_xgboost)

# Tracer les résultats et les erreurs de prédiction
fig, fig_error = plot_results_and_errors(initial_data, comparison_data, result)


customer_type = ["Normal", "Member"]
gender = ["Male", "Female"]
city = ["Bangkok", "Chiang Mai", "Vientiane", "Luang Prabang"]





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/

Performing stepwise search to minimize aic
 ARIMA(1,1,1)(0,0,0)[0] intercept   : AIC=7650.871, Time=0.27 sec
 ARIMA(0,1,0)(0,0,0)[0] intercept   : AIC=7849.566, Time=0.02 sec
 ARIMA(1,1,0)(0,0,0)[0] intercept   : AIC=7757.774, Time=0.03 sec
 ARIMA(0,1,1)(0,0,0)[0] intercept   : AIC=7648.912, Time=0.11 sec
 ARIMA(0,1,0)(0,0,0)[0]             : AIC=7847.576, Time=0.01 sec
 ARIMA(0,1,2)(0,0,0)[0] intercept   : AIC=7653.687, Time=0.07 sec
 ARIMA(1,1,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.46 sec
 ARIMA(0,1,1)(0,0,0)[0]             : AIC=7648.755, Time=0.04 sec
 ARIMA(1,1,1)(0,0,0)[0]             : AIC=7650.725, Time=0.05 sec
 ARIMA(0,1,2)(0,0,0)[0]             : AIC=7648.443, Time=0.06 sec
 ARIMA(1,1,2)(0,0,0)[0]             : AIC=7649.582, Time=0.11 sec
 ARIMA(0,1,3)(0,0,0)[0]             : AIC=7648.455, Time=0.08 sec
 ARIMA(1,1,3)(0,0,0)[0]             : AIC=7647.813, Time=0.29 sec
 ARIMA(2,1,3)(0,0,0)[0]             : AIC=inf, Time=0.81 sec
 ARIMA(1,1,4)(0,0,0)[0]             : AIC=7

In [14]:
predictions_arima

array([37824.42290595, 38065.42205324, 37713.00218137, ...,
       36547.5191076 , 36547.5191076 , 36547.5191076 ])

In [15]:
predictions_xgboost

array([29808.29008206, 28585.05007866, 31378.62892694, ...,
       29808.29008206, 29808.29008206, 32770.42778028])

In [16]:
plot_results_and_errors(initial_data, comparison_data, result)

(Figure({
     'data': [{'hovertemplate': 'Date=%{x}<br>Total=%{y}<extra></extra>',
               'legendgroup': '',
               'line': {'color': '#636efa', 'dash': 'solid'},
               'marker': {'symbol': 'circle'},
               'mode': 'lines',
               'name': '',
               'showlegend': False,
               'type': 'scattergl',
               'x': array([datetime.datetime(2021, 1, 1, 0, 0),
                           datetime.datetime(2021, 1, 1, 0, 0),
                           datetime.datetime(2021, 1, 1, 0, 0), ...,
                           datetime.datetime(2024, 9, 2, 0, 0),
                           datetime.datetime(2024, 5, 18, 0, 0),
                           datetime.datetime(2024, 2, 21, 0, 0)], dtype=object),
               'xaxis': 'x',
               'y': array([ 854.18663024,  552.45051805,  266.94989848, ..., 3310.82662253,
                           3126.28046797, 5530.57545581]),
               'yaxis': 'y'},
              {'line': {'

In [18]:
comparison = plot_results_and_errors(initial_data, comparison_data, result)
comparison['error_arima']

TypeError: tuple indices must be integers or slices, not str