# Dataset creation

In [1]:
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import coint

from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px

import numbers
import statistics as st

df = pd.read_csv('final_csv_task1.csv', usecols=['comp_date',
                                             'product_id',
                                             'seller_id',
                                             'min',
                                             'price',
                                             'max' ,
                                             'difference'])
enc = LabelEncoder()
df['enc_comp_date'] = enc.fit_transform(df['comp_date'])
df["comp_date"] =  pd.to_datetime(df["comp_date"], format = "%Y/%m/%d")
df["quarter"] = df.comp_date.dt.quarter

In [2]:
product_id_input = int(input("What's the product id you want to investigate? (Please provide eg. 110064) "))
quarter_input = int(input("Which quarter you want to investigate? (Please provide 1, 2, 3 or 4) "))

What's the product id you want to investigate? (Please provide eg. 110064) 110064
Which quarter you want to investigate? (Please provide 1, 2, 3 or 4) 1


In [3]:
def pivot(product_id, quarter):
    df_1 = df[(df.product_id == product_id) & (df.quarter == quarter)]
    table = pd.pivot_table(df_1, values='price', index=['comp_date'], columns=['seller_id'])
    table.fillna(method='ffill', inplace=True)
    return table

dfp = pivot(product_id_input, quarter_input)

# Pricing Automation Detection

### Identification of a constant deviation from another seller

In [4]:
dict_dataframe = dict()

# For each seller i
for col_1 in dfp.columns:
    
    # Create a dataframe 
    dataframe_temp = pd.DataFrame()
    
    # For each seller j
    for col_2 in dfp.columns:
        
        # If it is different from seller i
        if col_1 != col_2:
            
            # Compute the difference of their prices
            series = pd.Series(dfp[col_1] - dfp[col_2], name = col_2)
            
           # Store the differences in a dataframe 
            dataframe_temp = pd.concat([dataframe_temp, series], axis = 1)
            
            # Creation of a dictionary that has sellers i as key and the dataframe as values
            dict_dataframe[col_1] = dataframe_temp

dizionario = dict()

# For each seller i
for i in dict_dataframe:
    
    # Create a temporary dictionary
    diz_temp = dict()
    
    # For each seller j
    for j in dict_dataframe[i]:
        
        # Set sellers j as keys and normalized standard deviation of differences as values
        diz_temp[j] = dict_dataframe[i][j].std() / dfp[i].mean()
    
    # Set sellers j as keys and the temporary dictionaries as values
    dizionario[i] = diz_temp

del(dict_dataframe)

# Create a dataframe that has sellers present in the dictionary as columns and rows indices
d = pd.DataFrame(0, index=list(dizionario.keys()), columns=list(dizionario.keys()))

# Fill dataframe cells with standard deviations
for columns in d.columns:
    for x in dizionario:
        if columns != x:
            for y in dizionario[x]:
                d.loc[y, x] = dizionario[x][y]

dizionario_a_b = dict()

# Substitute standard deviations with A/B lables
for i in range(3, 7, 1):
    d_copy = d.copy(deep = True)
    for col in d.columns:
        d_copy[col] = np.where(d_copy[col] > i/100, 'B', 'A')
    dizionario_a_b[i] = d_copy

dizionario_a_b

# INTERPRETATION: the seller corresponding to the column index sets a price that deviates by a fixed percentage/value
# with respect to the seller corresponding to the row index if their intersection is labled with "A".

{3:     23  24  26  41  48  180 407 490
 23    A   B   B   A   B   B   B   A
 24    B   A   B   B   B   B   B   B
 26    B   B   A   B   B   B   B   B
 41    A   B   B   A   B   A   B   A
 48    B   B   B   B   A   B   B   B
 180   B   B   B   A   B   A   B   A
 407   B   B   B   B   B   B   A   A
 490   A   B   B   A   B   A   A   A,
 4:     23  24  26  41  48  180 407 490
 23    A   B   B   A   A   A   A   A
 24    B   A   B   B   B   B   B   B
 26    B   B   A   B   A   B   B   B
 41    A   B   B   A   B   A   A   A
 48    A   B   A   B   A   B   B   B
 180   A   B   B   A   B   A   A   A
 407   A   B   B   A   B   A   A   A
 490   A   B   B   A   B   A   A   A,
 5:     23  24  26  41  48  180 407 490
 23    A   A   A   A   A   A   A   A
 24    A   A   B   B   B   B   B   B
 26    A   B   A   B   A   B   B   B
 41    A   B   B   A   A   A   A   A
 48    A   B   A   A   A   B   B   A
 180   A   B   B   A   B   A   A   A
 407   A   B   B   A   A   A   A   A
 490   A   B   B   A   A   

### Identification of a constant time lag

In [5]:
# Pre-processing
dfp.reset_index('comp_date', drop = True, inplace = True)
dfp.fillna("null", inplace = True)

In [6]:
# Creation of a dictionary which stores, for each sellers, the days in which they changed the price 
dizionario = dict()

for sell in dfp:
    
    # Each seller is associated with a list
    lista_seller = []
    
    # Iterate over the dataset's lenght
    for i in range(1, len(dfp)):
        
        # If the price changed compared to the previous day and if its value is not null:
        if dfp.loc[i, sell] != dfp.loc[i-1, sell] and (dfp.loc[i, sell] and dfp.loc[i-1, sell]) != 'null':
            
            # Append the day the price changed to the list associated to the seller
            lista_seller.append(i)
            
    # Set sellers as dictionary keys and lists of days as dictionary values       
    dizionario[sell] = lista_seller

In [7]:
dict_all = dict()

# For each seller (i)
for i in dizionario:
    
    # If it changed the price at least once, create a dictionary
    if len(dizionario[i]) != 0:
        dict_diff = dict()
        
        # For each seller (j)
        for j in dizionario: 
            
            # If it is different from the first one
            if j != i:
                
                # And if it changed the price at least once, create a list 
                if len(dizionario[j]) != 0:
                    lista_diff = []
                    
                    # For each day in which seller i changed its price
                    for elem_1 in range(len(dizionario[i])):
                        
                        # Create a temporary list that only contains the days when seller j 
                        # changed price that are earlier than the considered day of seller i 
                        list_temp = [item for item in dizionario[j] if item <= dizionario[i][elem_1]]
                        
                        # If there are earlier days in which seller j changed its price
                        if len(list_temp) != 0:
                            
                            # Compute the difference between the day in which seller i changed its price and 
                            # the most recent day in which seller j changed its price and append it to lista_diff
                            diff = dizionario[i][elem_1] - max(list_temp)
                            lista_diff.append(diff)
                            
                        # If there are no earlier days in which seller j changed its price,
                        # append 'no recent date' to the list
                        else: lista_diff.append('no recent date')
                        
                        # Set sellers as dictionary keys and time lags as dictionary values
                        dict_diff[j] = lista_diff
        
        # Creation of a nested dictionary where sellers are keys and the dictionaries created above are values 
        dict_all[i] = dict_diff

        
        
# Creation of a dataframe in which sellers present in the dictionary are set as columns and rows index
df = pd.DataFrame(0, index=list(dict_all.keys()), columns=list(dict_all.keys()))

# Each cell is filled with the standard deviation of the list containing 
# the time lag between the price change of two sellers.
# The idea is that if the time interval is constant over time, its standard deviation should be low.

for col in df.columns:
    for key1,v1 in dict_all.items():
        if col != key1:
            for key2,v2 in v1.items():
                st_dv = st.stdev([x for x in v2 if isinstance(x, numbers.Number)])
                
                # If time lags' standard deviation is less or equal to 1, the cell is filled with lable "A"
                if st_dv <= 1:
                    st_dv = 'A'
                    
                    # Otherwhise, it is filled with lable "B"
                else:
                    st_dv = 'B'
                df.loc[key1, key2] = st_dv
                
# INTERPRETATION: the seller corresponding to the column index sets a new price after always the same time lag 
# with respect to the seller corresponding to the row index if their intersection is labled with "A".

In [8]:
df

Unnamed: 0,24,26,41,48,180,407
24,0,B,B,B,B,B
26,B,0,B,B,B,B
41,B,B,0,B,B,B
48,B,A,B,0,B,B
180,B,B,B,B,0,B
407,B,B,B,B,B,0
