# Dataset creation

In [12]:
from statsmodels.tsa.stattools import grangercausalitytests
from statsmodels.tsa.stattools import coint

from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px

import numbers
import statistics as st

# Columns of the price export that the rest of the notebook relies on.
RAW_COLUMNS = ['comp_date', 'product_id', 'seller_id', 'min', 'price', 'max', 'difference']

# Encoder kept at notebook level so the date <-> integer mapping stays available after loading.
enc = LabelEncoder()

df = (
    pd.read_csv('final_csv_task1.csv', usecols=RAW_COLUMNS)
    # Integer code for each distinct raw date string (computed before the dates are parsed).
    .assign(enc_comp_date=lambda raw: enc.fit_transform(raw['comp_date']))
    # Parse the date strings into datetimes, keeping the column in its original position.
    .assign(comp_date=lambda raw: pd.to_datetime(raw["comp_date"], format = "%Y/%m/%d"))
    # Calendar quarter (1-4) of each observation, used later to slice the data.
    .assign(quarter=lambda raw: raw["comp_date"].dt.quarter)
)

In [15]:
# Interactive selection of the slice to analyse; both answers are converted to int.
PRODUCT_PROMPT = "What's the product id you want to investigate? (Please provide eg. 110064) "
QUARTER_PROMPT = "Which quarter you want to investigate? (Please provide 1, 2, 3 or 4) "

product_id_input = int(input(PRODUCT_PROMPT))
quarter_input = int(input(QUARTER_PROMPT))

What's the product id you want to investigate? (Please provide eg. 110064) 110064
Which quarter you want to investigate? (Please provide 1, 2, 3 or 4) 1


In [16]:
def pivot(product_id, quarter, data=None):
    """Build a date-by-seller price table for one product in one quarter.

    Parameters
    ----------
    product_id
        Value matched against the ``product_id`` column.
    quarter
        Value matched against the ``quarter`` column.
    data : pandas.DataFrame, optional
        Frame to read from. It must provide the columns ``product_id``,
        ``quarter``, ``comp_date``, ``seller_id`` and ``price``. When omitted,
        the notebook-level ``df`` is used (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``comp_date`` with one column per ``seller_id``; values are
        prices (``pivot_table``'s default aggregation applies if a seller has
        several rows on the same date). Gaps are forward-filled with the
        seller's last known price; dates before a seller's first observation
        remain NaN.
    """
    source = df if data is None else data
    selected = source[(source.product_id == product_id) & (source.quarter == quarter)]
    table = pd.pivot_table(selected, values='price', index=['comp_date'], columns=['seller_id'])
    # ``fillna(method='ffill')`` is deprecated in pandas; ``ffill()`` is the
    # supported spelling and returns a new frame instead of mutating in place.
    return table.ffill()

# Price table (dates x sellers) for the product and quarter entered by the user.
dfp = pivot(product_id=product_id_input, quarter=quarter_input)

# Pricing Automation Detection

In [17]:
# Pre-processing
# Replace the date index with positional day numbers (0, 1, 2, ...) and mark
# the prices that are still missing after the forward fill with the string
# sentinel "null".
# The result is rebound instead of mutated in place so that this cell can be
# re-run safely: an in-place reset_index('comp_date') fails on a second run
# because the 'comp_date' index level no longer exists.
dfp = dfp.reset_index(drop=True).fillna("null")

In [18]:
# Creation of a dictionary which stores, for each seller, the days (row
# positions of dfp) on which the seller's price differs from the previous day.
dizionario = dict()

for sell in dfp:

    # Prices of this seller in row order; missing prices are the string 'null'.
    prices = dfp[sell].tolist()

    # A day counts as a price change only when BOTH the current and the
    # previous price are known and they differ.
    # The earlier form `(current and previous) != 'null'` did not test both
    # values: `and` returns one of its operands, so only a single value was
    # compared with 'null' (and a transition from 'null' to a price of 0 was
    # wrongly recorded as a change).
    dizionario[sell] = [
        day
        for day in range(1, len(prices))
        if prices[day] != 'null'
        and prices[day - 1] != 'null'
        and prices[day] != prices[day - 1]
    ]

In [10]:
# Nested mapping: dict_all[seller][other] is a list with one entry per price
# change of `seller`. Each entry is the number of days between that change and
# the latest price change of `other` that happened on or before it, or the
# string 'no recent date' when `other` had not changed its price yet.
dict_all = {}

# Sellers that never changed their price take no part in the comparison.
active_sellers = {
    seller: change_days
    for seller, change_days in dizionario.items()
    if len(change_days) != 0
}

for seller, seller_days in active_sellers.items():
    lags_by_other = {}

    for other, other_days in active_sellers.items():
        # A seller is never compared with itself.
        if other == seller:
            continue

        lags = []
        for day in seller_days:
            # Price changes of `other` that happened on or before this day.
            earlier_changes = [d for d in other_days if d <= day]
            lags.append(day - max(earlier_changes) if earlier_changes else 'no recent date')

        lags_by_other[other] = lags

    dict_all[seller] = lags_by_other

        
        
# Creation of a dataframe in which the sellers present in dict_all are used as
# both the row index and the columns.
# NOTE(review): this rebinds `df`, the name that holds the raw price data read
# from the CSV and that pivot() reads by default. After this cell has run,
# pivot() cannot be called again without reloading the data; consider giving
# this matrix its own name once it is confirmed that no later cell reads `df`.
sellers = list(dict_all.keys())

# dtype=object because the cells hold the integer placeholder 0 as well as the
# string labels assigned below.
df = pd.DataFrame(0, index=sellers, columns=sellers, dtype=object)

# Largest standard deviation of the time lags (in days) still labelled "A".
MAX_LAG_STDEV = 1

# Each cell is labelled according to the standard deviation of the list
# containing the time lags between the price changes of two sellers.
# The idea is that if the time interval is constant over time, its standard
# deviation should be low.
for key1, v1 in dict_all.items():
    for key2, v2 in v1.items():

        # Keep only real lags; 'no recent date' placeholders are skipped.
        numeric_lags = [x for x in v2 if isinstance(x, numbers.Number)]

        # statistics.stdev needs at least two values. With fewer lags the
        # regularity cannot be assessed, so the cell keeps its placeholder 0.
        if len(numeric_lags) < 2:
            continue

        # Label "A" if the standard deviation of the time lags is less than or
        # equal to the threshold, label "B" otherwise.
        df.loc[key1, key2] = 'A' if st.stdev(numeric_lags) <= MAX_LAG_STDEV else 'B'

# INTERPRETATION: a cell labelled "A" means that the seller in the ROW index
# changes its price after an (almost) constant time lag with respect to the
# most recent price change of the seller in the COLUMN index. Cells equal to 0
# are the diagonal and the pairs with fewer than two measurable lags.