In [2]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Show every row when a DataFrame is displayed (None removes the row limit).
pd.set_option("display.max_rows", None)

In [3]:
def loadCleanData(path:str):
    """Read the CSV at ``path`` and return an aggregated and a row-level frame.

    Parameters
    ----------
    path : str
        Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``cleanData``: ``Revenue`` and ``Bookings`` summed per
        (Market, TrainNumber, DepartureDateTime, CommercialClass), with the
        grouping keys restored as ordinary columns.
        ``data``: the selected columns of the file, with both timestamp
        columns converted to datetimes.
    """
    kept_columns = ['Market', 'TrainNumber', 'DepartureDateTime', 'ArrivalDateTime',
                    'Origin', 'Destination', 'Revenue', 'Bookings', 'CommercialClass']
    grouping_keys = ['Market', 'TrainNumber', 'DepartureDateTime', 'CommercialClass']

    # Keep only the columns of interest (any other column in the file is dropped).
    data = pd.read_csv(path)[kept_columns]

    # Parse both timestamp columns into datetime values.
    data = data.assign(
        DepartureDateTime=pd.to_datetime(data['DepartureDateTime']),
        ArrivalDateTime=pd.to_datetime(data['ArrivalDateTime']),
    )

    # One row per grouping key combination, with revenue and bookings summed.
    cleanData = data.groupby(by=grouping_keys)[['Revenue', 'Bookings']].sum().reset_index()

    return cleanData, data


def rule_80_20(df:pd.DataFrame, attr:str, index_1=0, index_2=0, value_index_1=0, value_index_2=0, percent = 80.0):
    """Find, per period, the top markets whose cumulative share of ``attr`` stays within ``percent``.

    Markets are ranked by their share of ``attr`` inside each
    (``index_1``, ``index_2``) group, largest first; a market is kept while
    the running sum of shares is ``<= percent``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Market``, ``attr`` and the two period columns.
    attr : str
        Column that is summed and converted to a percentage share.
    index_1, index_2 :
        Period columns to group by. ``0`` means "not provided" and falls back
        to ``'Year'`` / ``'MonthYear'`` respectively.
    value_index_1, value_index_2 :
        Optional values used to keep only one ``index_1`` / ``index_2``
        period. ``0`` means "no filter".
    percent : float
        Cumulative share threshold, in percent.

    Returns
    -------
    tuple
        ``(marketFor_80, numberMarketFor_80)``. ``marketFor_80`` has columns
        ``[index_1, index_2, 'Market', 'PercentRevenue']``; the last column
        holds the percentage share of ``attr`` (the name is fixed regardless
        of ``attr``). ``numberMarketFor_80`` is a frame with columns
        ``[index_1, index_2, 'NumMarket']``, except when both
        ``value_index_1`` and ``value_index_2`` are given: then it is the
        single market count for that period (``0`` if no market qualifies).
    """
    # Default each period column on its own, so passing only one of them
    # does not leave the literal 0 as a pivot key.
    if index_1 == 0:
        index_1 = 'Year'
    if index_2 == 0:
        index_2 = 'MonthYear'
    period = [index_1, index_2]

    # Total of `attr` per (period, market), largest markets first inside each period.
    totals = pd.pivot_table(df, index=period + ['Market'], values=attr, aggfunc='sum').sort_values(
        by=period + [attr], ascending=[True, True, False])
    # Share of each market within its period, in percent.
    share = totals.div(totals.groupby(period).transform('sum')) * 100

    # Running share inside each period; keep markets while it stays within the threshold.
    running_share = share.groupby(period).cumsum()
    within_threshold = running_share[attr] <= percent

    marketFor_80 = share[within_threshold].reset_index()
    marketFor_80.columns = [index_1, index_2, 'Market', 'PercentRevenue']
    numberMarketFor_80 = marketFor_80.groupby(period)['Market'].nunique().reset_index()
    numberMarketFor_80.columns = [index_1, index_2, 'NumMarket']

    if value_index_1 != 0:
        marketFor_80 = marketFor_80[marketFor_80[index_1]==value_index_1]
        numberMarketFor_80 = numberMarketFor_80[numberMarketFor_80[index_1]==value_index_1]

    if value_index_2 != 0:
        marketFor_80 = marketFor_80[marketFor_80[index_2]==value_index_2]
        numberMarketFor_80 = numberMarketFor_80[numberMarketFor_80[index_2]==value_index_2]

    if value_index_1 != 0 and value_index_2 != 0:
        # Positional access: the surviving row keeps its original index label,
        # so a label lookup only worked for the period that happened to be row 1.
        counts = numberMarketFor_80['NumMarket']
        return marketFor_80, (counts.iloc[0] if len(counts) else 0)

    return marketFor_80, numberMarketFor_80
    

def addTimeFeatures(df: pd.DataFrame, attr: str):
    """Add calendar columns derived from the datetime column ``attr``.

    The columns are written onto ``df`` itself (the input is modified) and
    the same frame is returned. Added columns, in order: ``DepatureDate``,
    ``DepatureTime``, ``HourDay``, ``DayWeek``, ``WeekYear``, ``MonthYear``
    and ``Year``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the datetime column.
    attr : str
        Name of the datetime column to derive the features from.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with the new columns attached.
    """
    # (new column name, how to derive it from the datetime series), in insertion order.
    # Column names are kept exactly as other code expects them, spelling included.
    feature_builders = (
        ('DepatureDate', lambda stamps: stamps.dt.date),
        ('DepatureTime', lambda stamps: stamps.dt.time),
        ('HourDay', lambda stamps: stamps.dt.hour),
        ('DayWeek', lambda stamps: stamps.dt.day_of_week),
        ('WeekYear', lambda stamps: stamps.dt.isocalendar().week),
        ('MonthYear', lambda stamps: stamps.dt.month),
        ('Year', lambda stamps: stamps.dt.year),
    )

    for column_name, build in feature_builders:
        df[column_name] = build(df[attr])

    return df


def listHistogram(df: pd.DataFrame, attr, markets:list, index='DepartureDateTime', n_cols:int=4, xmin:int=0, xmax:int=60, ymin:int=0, ymax:int=5000):
    """Show a grid of histograms of ``attr``, one subplot per market.

    ``attr`` is first summed per (``Market``, ``index``) group; each subplot
    is the histogram of those sums for one market. Subplots are filled row
    by row, ``n_cols`` per row, and all share the same axis ranges.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Market``, ``index`` and ``attr`` columns.
    attr :
        Column whose per-group sums are histogrammed.
    markets : list
        Market names; one subplot (titled with the name) per entry.
    index :
        Column grouped together with ``Market`` before summing.
    n_cols : int
        Number of subplot columns in the grid.
    xmin, xmax, ymin, ymax : int
        Axis ranges applied to every subplot.

    Raises
    ------
    ValueError
        If ``markets`` is empty (there is no grid to draw).
    """
    if len(markets) == 0:
        raise ValueError('markets must contain at least one market')

    totals = df.groupby(by=['Market', index])[attr].sum().reset_index()

    # Ceiling division: enough rows to hold every market.
    n_rows = -(-len(markets) // n_cols)

    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=list(markets))

    for position, market in enumerate(markets):
        # Row-major placement; subplot coordinates are 1-based.
        row_offset, col_offset = divmod(position, n_cols)
        row, col = row_offset + 1, col_offset + 1

        fig.add_trace(
            go.Histogram(
                x=totals[totals['Market']==market][attr],
                name='Market '+market,
                marker=dict(line=dict(color='black', width=1)),
            ),
            row=row,
            col=col,
        )

        # Fixed axis ranges act as a zoom and keep the subplots comparable.
        fig.update_xaxes(range=[xmin, xmax], row=row, col=col)
        fig.update_yaxes(range=[ymin, ymax], row=row, col=col)

    fig.update_layout(title={
        'text': attr + ' vs Markets ' + 'Histogram',
        'x': 0.5,
        'xanchor': 'center',
        'font': {'color': 'blue'},
    })

    fig.show()


def comparisonTwoHistogram(df:pd.DataFrame, markets:list, index, attr):
    """Overlay the histograms of ``attr`` for two markets in a single figure.

    ``attr`` is summed per (``index``, ``Market``) group and the resulting
    sums are histogrammed for ``markets[0]`` and ``markets[1]``; any further
    entries in ``markets`` are ignored.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Market``, ``index`` and ``attr`` columns.
    markets : list
        Market names; the first two are compared.
    index :
        Column grouped together with ``Market`` before summing.
    attr :
        Column whose per-group sums are histogrammed; also used as the
        x-axis title.
    """
    first_market, second_market = markets[0], markets[1]

    totals = df.groupby(by=[index, 'Market'])[attr].sum().reset_index()

    fig = go.Figure()

    # Add both histograms; semi-transparent fills keep the overlap readable.
    fig.add_trace(go.Histogram(
        x=totals[totals['Market'] == first_market][attr],
        name='Market '+first_market,
        marker=dict(color='rgba(255, 140, 0, 0.6)', line=dict(color='black', width=1)),
    ))
    fig.add_trace(go.Histogram(
        x=totals[totals['Market'] == second_market][attr],
        name='Market '+second_market,
        marker=dict(color='rgba(0, 134, 139, 0.3)', line=dict(color='black', width=1)),
    ))

    # Lay the two histograms on top of each other instead of side by side.
    fig.update_layout(
        title={
            'text': 'Histogram Comparison: ['+first_market+'] vs ['+second_market+']',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'color': 'rgba(0, 134, 139, 1)'},
        },
        # Label the x axis with the plotted attribute rather than a fixed name.
        xaxis_title=attr,
        yaxis_title='Count',
        bargap=0,  # no gap between bars
        barmode='overlay',  # superimpose the bars
    )

    fig.show()


def temporalSeries(df: pd.DataFrame, markets:list, index, attr):
    """Plot ``attr`` over time for several markets, one line per market.

    For each market, ``attr`` is summed per (``Year``, ``index``) group and
    drawn against a ``"<Year>-<index>"`` label.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``Market``, ``Year``, ``index`` and ``attr`` columns.
    markets : list
        Market names; one trace per entry.
    index :
        Time-part column grouped together with ``Year``; its name is also
        used in the figure and x-axis titles.
    attr :
        Column that is summed and plotted on the y axis.
    """
    fig = go.Figure()

    for market in markets:
        series = df[df['Market']==market].groupby(by=['Year',index])[attr].sum().reset_index()
        # Separator plus zero padding: the labels no longer look like plain
        # numbers, and their alphabetical order matches chronological order
        # (e.g. "2022-52" sorts before "2023-01").
        period_labels = series['Year'].astype(str) + '-' + series[index].astype(str).str.zfill(2)
        fig.add_trace(go.Scatter(x=period_labels, y=series[attr], name=attr+' for Market '+market))

    fig.update_layout(
        title={
            'text': attr + " by "+ index+"-Year",
            'x': 0.5,  # centre the title horizontally
            'xanchor': 'center',  # anchor the title at its centre
            'font': {'color': 'blue'}
        },
        xaxis=dict(
            title=index+" of the Year",
            # Treat the labels as categories so they are not auto-detected as
            # numbers or dates, and sort them so traces covering different
            # periods still share one chronological axis.
            type='category',
            categoryorder='category ascending',
            tickmode='linear',
            dtick=1
        ),
        yaxis=dict(
            title=attr  # y-axis label
        )
    )

    fig.show()


def hotMap(df:pd.DataFrame, index, colum, attr, market=''):
    """Show a heatmap of ``attr`` summed over ``index`` (rows) and ``colum`` (columns).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``index``, ``colum`` and ``attr`` columns, plus
        ``Market`` when ``market`` is given.
    index :
        Column whose values form the heatmap rows (y axis).
    colum :
        Column whose values form the heatmap columns (x axis).
    attr :
        Column that is summed in every cell.
    market : str
        When not empty, only rows of that market are used.
    """
    source = df if market == '' else df[df['Market']==market]
    pivot = pd.pivot_table(source, values=[attr], index=index, columns=colum, aggfunc={attr:'sum'})

    # The pivot columns are (attr, colum value) pairs; take the colum values in
    # the same order as the columns of pivot.values so x and z stay aligned.
    x_values = pivot.columns.get_level_values(1).tolist()

    # Build the heatmap
    heatmap = go.Heatmap(
        z=pivot.values,
        x=x_values,
        y=pivot.index,
        hoverongaps=False,
        colorscale='Portland',
    )

    # Create the figure and attach the heatmap
    fig = go.Figure(data=heatmap)

    # Configure the layout
    fig.update_layout(
        title='Heatmap of '+ attr +' by '+ index + ' and '+colum,
        xaxis=dict(
            title=colum,
            tickmode='array',
            # One tick per value actually present on the x axis, instead of a
            # fixed 0-23 range that only suits an hour-of-day column.
            tickvals=x_values,
            ticktext=[str(value) for value in x_values]
        ),
        yaxis=dict(
            title=index
        )
    )

    # Display the figure
    fig.show()


In [4]:
# Location of the anonymized historical bookings file on the author's machine.
DATA_PATH = '/home/wilmer/Documentos/Codes/WORKSHOP/data/anonymizedHistoricalData.csv'

cleanData, data = loadCleanData(DATA_PATH)

In [5]:
# Add the calendar columns (Year, MonthYear, ...) derived from the departure time
cleanData = addTimeFeatures(cleanData, 'DepartureDateTime')

# Number of distinct markets: overall and per (Year, MonthYear)
numTotalMarket = cleanData['Market'].nunique()
numMarketMonth = cleanData.groupby(by=['Year', 'MonthYear'])['Market'].nunique().reset_index()

# 80-20 rule per (Year, MonthYear), for demand and for revenue.
# Passing value_index_1=<year> and value_index_2=<month> restricts the result
# to a single period; the second return value is then a single count.
marketForDemand, numberMarketForDemand = rule_80_20(df=cleanData, attr='Bookings')
marketForRevenue, numberMarketForRevenue = rule_80_20(df=cleanData, attr='Revenue')

print('Totales de markets: {:,} \n'.format(numTotalMarket))

# Combine the three counts per period. Joining on the period keys (instead of
# gluing the frames side by side by row position) keeps every count on its own
# month even if some period has no market within the threshold.
periodKeys = ['Year', 'MonthYear']
indexData = (
    numMarketMonth
    .rename(columns={'Market': 'TotalNumMarket'})
    .merge(numberMarketForDemand.rename(columns={'NumMarket': 'NumMarketDemand_80-20'}),
           on=periodKeys, how='left')
    .merge(numberMarketForRevenue.rename(columns={'NumMarket': 'NumMarketRevenue_80-20'}),
           on=periodKeys, how='left')
)
indexData

Totales de markets: 2,162 



Unnamed: 0,Year,MonthYear,TotalNumMarket,NumMarketDemand_80-20,NumMarketRevenue_80-20
0,2022,12,1907,144,133
1,2023,1,1873,142,127
2,2023,2,1816,139,122
3,2023,3,1849,145,138
4,2023,4,1878,138,128
5,2023,5,1884,132,120
6,2023,6,1907,141,125
7,2023,7,1931,148,137
8,2023,8,1949,155,147
9,2023,9,1945,142,119


In [12]:
# Bookings histograms for two markets, side by side, zoomed to the chosen ranges.
listHistogram(
    df=cleanData,
    attr='Bookings',
    markets=['S46-S31', 'S46-S41'],
    n_cols=2,
    xmax=400,
    ymax=300,
)

In [13]:
# Overlay the weekly Bookings distributions of the two markets.
comparisonTwoHistogram(
    df=cleanData,
    markets=['S46-S31', 'S46-S41'],
    index='WeekYear',
    attr='Bookings',
)

In [16]:
# Bookings by day of week and year for both directions of the same route.
temporalSeries(
    df=cleanData,
    markets=['S46-S31', 'S31-S46'],
    index='DayWeek',
    attr='Bookings',
)

In [18]:
# Heatmap of total Bookings by day of week (rows) and hour of day (columns), all markets.
hotMap(
    df=cleanData,
    index='DayWeek',
    colum='HourDay',
    attr='Bookings',
)