# TASK 1 - ALKEMY PROJECT

In [205]:
#Importing libraries
from sklearn.preprocessing import LabelEncoder
from statsmodels.tsa.stattools import coint
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import pandas as pd

In [206]:
#Preparing the data
df = pd.read_csv('final_csv_task1.csv', usecols=['comp_date',
                                                 'product_id',
                                                 'seller_id',
                                                 'min',
                                                 'price',
                                                 'max' ,
                                                 'difference'])

enc = LabelEncoder()
df['enc_comp_date'] = enc.fit_transform(df['comp_date'])

df["comp_date"] =  pd.to_datetime(df["comp_date"], format = "%Y/%m/%d")
df["quarter"] = df.comp_date.dt.quarter

In [207]:
## Dataset creation 

#Create a function that accepts as parameters the product_id and the quarter
def pivot(product_id, quarter):
    df_1 = df[(df.product_id == product_id) & (df.quarter == quarter)]
    table = pd.pivot_table(df_1, values='price', index=['comp_date'], columns=['seller_id'])
    table.fillna(method='ffill', inplace=True)
    return table

price_data = pivot(107645, 1)

In [209]:
price_data

seller_id,23,24,26,41,48,180,188,407,490
comp_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-01-01,1999.0,3990.0,3990.0,1750.0,3990.0,1779.0,1990.0,2299.0,
2021-01-02,1999.0,3990.0,3990.0,1750.0,3990.0,1839.0,1990.0,2299.0,
2021-01-03,1999.0,3990.0,3990.0,1750.0,3990.0,1899.0,1990.0,2299.0,
2021-01-04,1999.0,3990.0,3990.0,1750.0,3990.0,1899.0,1990.0,2299.0,
2021-01-05,1999.0,1990.0,2990.0,1750.0,1990.0,1899.0,1990.0,2299.0,
...,...,...,...,...,...,...,...,...,...
2021-03-27,1999.0,1990.0,1658.0,1850.0,1692.0,1895.0,1990.0,1599.0,1990.0
2021-03-28,1999.0,1990.0,1658.0,1850.0,1692.0,1895.0,1990.0,1599.0,1990.0
2021-03-29,1999.0,1990.0,1990.0,1850.0,1990.0,1895.0,1990.0,1744.5,1990.0
2021-03-30,1999.0,1990.0,1990.0,1850.0,1990.0,1895.0,1990.0,1890.0,1990.0


## Pearson Correlation

In [195]:
# Shift the original dataset by one row
price_data_shifted = price_data.shift(-1)

In [197]:
Pearson_correlation = []
Leader = []
Follower = []

#For each pair of sellers, correlation between them is computed 
#if correlation is >0.7 a possible leader-follower pair is detected 
for i in price_data_shifted.columns:
    for j in price_data.columns:
        if (i!=j and price_data_shifted[i].corr(price_data[j])>0.7):
            Leader.append(j)
            Follower.append(i)
            Pearson_correlation.append(price_data_shifted[i].corr(price_data[j])) 

In [198]:
#Co-integration computation
df_followers = price_data.drop(index = '2021-01-01')
df_leaders = price_data.drop(index = '2021-03-31')

Cointegration = []

for i in df_followers.columns:
    for j in df_leaders.columns:
        if (i!=j and i in Follower and j in Leader):
            Cointegration.append(coint(df_leaders[j], df_followers[i])[1])            

In [199]:
#Trasforming the lists into a pandas DataFrame
output_pearson = pd.DataFrame({'Leader':Leader, 'Follower':Follower,
                       'Pearson Correlation':Pearson_correlation, 'Co-Integration p-value':Cointegration})
output_pearson

Unnamed: 0,Leader,Follower,Pearson Correlation,Co-Integration p-value
0,26,24,0.849637,6.74245e-07
1,48,24,0.875885,0.008102065
2,24,26,0.74271,0.0005973242
3,48,26,0.782319,1.548132e-08
4,24,48,0.710591,0.001651746
5,26,48,0.735388,9.017176e-08


In [201]:
#Plot the prices of all sellers that appear in the output dataset

price_data_shifted.columns = price_data_shifted.columns.astype(str)
price_data.columns = price_data.columns.astype(str)

list = []

for z in output_pearson.Leader.unique():
    list.append(z)

for x in output_pearson.Follower.unique():
    if x not in lista:
        list.append(x)
        
        
data = price_data[list]
fig = px.line(data, x=data.index, y = data.columns)
fig.show()

## Granger Causality, Leader/Follower Detection

In [208]:
leader = []
follower = []

#Iterate over columns of the original dataset
for i in price_data.columns:
    for j in price_data.columns:
        
        #If columns are not referred to the same seller and if they are not constant
        if ((i != j) & \
            (price_data[i].std() != 0) & \
            (price_data[j].std() != 0)):
        #Perform the Granger Causality test
            c = grangercausalitytests(price_data[[i, j]], maxlag=3, verbose=False) #case in which i causes j 
            lista1 = []
            for h in c:
                lista1.append(c[h][0]['ssr_ftest'][1]) #append F-test's p-values to a list
                
            d = grangercausalitytests(price_data[[j, i]], maxlag=3, verbose=False) #case in which j causes i 
            lista2 = []
            for l in d:
                lista2.append(d[l][0]['ssr_ftest'][1]) #append F-test's p-values to a list
            
            lista3 = []
            
            if (min(lista1) or min(lista2)) < 0.05: #Consider only lists which have at least one p-value<0.05
                for p in range(len(lista1)):
                    #Compare the two lists of p-value: the one with the lowest p-values will refer to 
                    #the seller who is most likely to be a leader as it causes the other's prices to change
                    if (lista1[p] - lista2[p] < lista2[p] - lista1[p]):
                        lista3.append(True) 
                    elif (lista1[p] - lista2[p] >= lista2[p] - lista1[p]):
                        lista3.append(False)
            
            if len(lista3) == 0: #If lista3 is empty, it means that there was no p-value<0.05 for the considered pair of sellers
                leader.append('nonsignificant')
                follower.append('nonsignificant')
            else: 
                if max(set(lista3), key = lista3.count) == True: #if the most frequent value in the list is True
                    leader.append(i) 
                    follower.append(j)
                else:
                    leader.append(j)
                    follower.append(i)

#Transform the lists in a pandas DataFrame                    
output_granger = pd.DataFrame({'Leader':leader, 'Follower':follower})
output_granger.drop_duplicates(inplace = True)
output_granger = output_granger.drop(output_granger[output_granger.Leader == 'nonsignificant'].index).reset_index(drop = True)

print(output_granger)


#Plot the prices of all sellers that appear in the output dataset

lista = []

for z in output_granger.Leader.unique():
    lista.append(z)

for x in output.Follower.unique():
    if x not in lista:
        lista.append(x)
        
data = price_data[lista]
fig = px.line(data, x=data.index, y = data.columns)
fig.show()

  Leader Follower
0     24       26
1     24       48
2     26       48
3     41      180
