# Data Quality Study - US

## Requirements

In [None]:
pip install scipy pandas matplotlib sklearn

In [None]:
#importing necessary packages
from math import *
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
from sklearn.metrics import pairwise_distances #jaccard diss.
from sklearn import manifold  # multidimensional scaling
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Shared 2-D metric MDS estimator. It is fed precomputed dissimilarity
# matrices and uses a fixed seed so that refitting the same matrix
# reproduces the same layout.
mds_model = manifold.MDS(
    n_components=2,
    metric=True,
    dissimilarity='precomputed',
    max_iter=1000,
    random_state=123,
)

## Daily Prices

### CSV reader

In [None]:
# Daily prices, full period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
total_full = pd.read_csv(r'Input_Data_Quality_Anonymized/DailyPrices/Full/total.csv')
ratio_full = pd.read_csv(
    r'Input_Data_Quality_Anonymized/DailyPrices/Full/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
ratio_malus_full = 1 - ratio_full

# Cluster view: remove C28, C40, C13 and C45 from both rows and columns.
rmf_cluster = ratio_malus_full.drop(
    index=["C28", "C40", "C13", "C45"],
    columns=["C28", "C40", "C13", "C45"],
)

In [None]:
# Daily prices, 21-Now period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
total_new = pd.read_csv(r'Input_Data_Quality_Anonymized/DailyPrices/21-Now/total.csv')
ratio_new = pd.read_csv(
    r'Input_Data_Quality_Anonymized/DailyPrices/21-Now/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
ratio_malus_new = 1 - ratio_new

# Cluster view: remove C28, C40, C13 and C45 from both rows and columns.
rmn_cluster = ratio_malus_new.drop(
    index=["C28", "C40", "C13", "C45"],
    columns=["C28", "C40", "C13", "C45"],
)

In [None]:
# Daily prices, 90-10 period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
total_old = pd.read_csv(r'Input_Data_Quality_Anonymized/DailyPrices/90-10/total.csv')
ratio_old = pd.read_csv(
    r'Input_Data_Quality_Anonymized/DailyPrices/90-10/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
ratio_malus_old = 1 - ratio_old

# Cluster view: only C28 and C13 are removed for this period
# (the C40 and C45 drops are disabled here).
rmo_cluster = ratio_malus_old.drop(
    index=["C28", "C13"],
    columns=["C28", "C13"],
)

### Multidimensional Scaling - Complete

#### Parameters

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_full = mds_model.fit(ratio_malus_full.to_numpy())
mds_coords_full = mds_fit_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_full = ratio_malus_full.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_full = [
    total_full.loc[total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_full
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_new = mds_model.fit(ratio_malus_new.to_numpy())
mds_coords_new = mds_fit_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_new = ratio_malus_new.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_new = [
    total_new.loc[total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_new
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_old = mds_model.fit(ratio_malus_old.to_numpy())
mds_coords_old = mds_fit_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_old = ratio_malus_old.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_old = [
    total_old.loc[total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_old
]

#### Visualization

In [None]:
# Annotated bubble chart of the complete daily-price embeddings: one scatter
# layer per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (mds_coords_full, total_ordered_full, 'lightblue', 'blue'),
    (mds_coords_new, total_ordered_new, 'lightgreen', 'green'),
    (mds_coords_old, total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

labels_full = source_names_full
labels_new = source_names_new
labels_old = source_names_old
for names, coords in (
    (labels_full, mds_coords_full),
    (labels_new, mds_coords_new),
    (labels_old, mds_coords_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('DailyPrices Full View')
plt.show()

In [None]:
# Same complete daily-price view without the per-point labels.
plt.figure(figsize=(15, 15))

layers = (
    (mds_coords_full, total_ordered_full, 'lightblue', 'blue'),
    (mds_coords_new, total_ordered_new, 'lightgreen', 'green'),
    (mds_coords_old, total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('DailyPrices Full View')
plt.show()

### Multidimensional Scaling - Cluster

#### Parameters

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_c_full = mds_model.fit(rmf_cluster.to_numpy())
mds_coords_c_full = mds_fit_c_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_c_full = rmf_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_c_full = [
    total_full.loc[total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_c_full
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_c_new = mds_model.fit(rmn_cluster.to_numpy())
mds_coords_c_new = mds_fit_c_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_c_new = rmn_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_c_new = [
    total_new.loc[total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_c_new
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
mds_fit_c_old = mds_model.fit(rmo_cluster.to_numpy())
mds_coords_c_old = mds_fit_c_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
source_names_c_old = rmo_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
total_ordered_c_old = [
    total_old.loc[total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in source_names_c_old
]

In [None]:
# Inspect the raw 2-D MDS coordinates of the 90-10 daily-price cluster embedding.
print(mds_coords_c_old)

#### Visualization

In [None]:
# Annotated bubble chart of the daily-price cluster embeddings: one scatter
# layer per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (mds_coords_c_full, total_ordered_c_full, 'lightblue', 'blue'),
    (mds_coords_c_new, total_ordered_c_new, 'lightgreen', 'green'),
    (mds_coords_c_old, total_ordered_c_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

labels_c_full = source_names_c_full
labels_c_new = source_names_c_new
labels_c_old = source_names_c_old
for names, coords in (
    (labels_c_full, mds_coords_c_full),
    (labels_c_new, mds_coords_c_new),
    (labels_c_old, mds_coords_c_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('DailyPrices Cluster View')
plt.show()

With only one annotation (for the post): C31

In [None]:
# Cluster view with a single provider highlighted in each period.
plt.figure(figsize=(13, 13))

# Background layers, semi-transparent so the highlighted points stand out.
plt.scatter(mds_coords_c_full[:, 0], mds_coords_c_full[:, 1], total_ordered_c_full, alpha=0.5,
            facecolors='lightblue', edgecolors='blue')

plt.scatter(mds_coords_c_new[:, 0], mds_coords_c_new[:, 1], total_ordered_c_new, alpha=0.5,
            facecolors='lightgreen', edgecolors='green')

plt.scatter(mds_coords_c_old[:, 0], mds_coords_c_old[:, 1], total_ordered_c_old, alpha=0.5,
            facecolors='yellow', edgecolors='orange')

# Look the provider up by name in each period's label list rather than
# relying on fixed positions: the position differs between periods and shifts
# whenever the set of dropped providers or the column order changes.
# list.index raises ValueError if the provider is absent from a period.
focus_provider = 'C31'

labels_c_full = source_names_c_full
i = labels_c_full.index(focus_provider)
plt.annotate(labels_c_full[i], (mds_coords_c_full[i, 0], mds_coords_c_full[i, 1]), xycoords='data')

labels_c_new = source_names_c_new
i_new = labels_c_new.index(focus_provider)
plt.annotate(labels_c_new[i_new], (mds_coords_c_new[i_new, 0], mds_coords_c_new[i_new, 1]), xycoords='data')

labels_c_old = source_names_c_old
i_old = labels_c_old.index(focus_provider)
plt.annotate(labels_c_old[i_old], (mds_coords_c_old[i_old, 0], mds_coords_c_old[i_old, 1]), xycoords='data')

# Redraw the focused provider opaque with a black edge on top of each layer.
plt.scatter(mds_coords_c_full[i, 0], mds_coords_c_full[i, 1], total_ordered_c_full[i],
            facecolors='lightblue', edgecolors='black')

plt.scatter(mds_coords_c_new[i_new, 0], mds_coords_c_new[i_new, 1], total_ordered_c_new[i_new],
            facecolors='lightgreen', edgecolors='black')

plt.scatter(mds_coords_c_old[i_old, 0], mds_coords_c_old[i_old, 1], total_ordered_c_old[i_old],
            facecolors='yellow', edgecolors='black')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Three labels only: they attach to the first three (background) layers.
plt.legend(["Full", "New", "Old"])
plt.title('DailyPrices Cluster View: focus on C31')
plt.show()

In [None]:
# Daily-price cluster view without the per-point labels.
plt.figure(figsize=(15, 15))

layers = (
    (mds_coords_c_full, total_ordered_c_full, 'lightblue', 'blue'),
    (mds_coords_c_new, total_ordered_c_new, 'lightgreen', 'green'),
    (mds_coords_c_old, total_ordered_c_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('DailyPrices Cluster View')
plt.show()

## Bars 

### CSV reader

In [None]:
# Bars, full period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
bars_total_full = pd.read_csv(r'Input_Data_Quality_Anonymized/Bars/Full/total.csv')
bars_ratio_full = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Bars/Full/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
bars_ratio_malus_full = 1 - bars_ratio_full

# Cluster view: remove C28, C40, C13 and C45 from both rows and columns.
bars_rmf_cluster = bars_ratio_malus_full.drop(
    index=["C28", "C40", "C13", "C45"],
    columns=["C28", "C40", "C13", "C45"],
)

In [None]:
# Bars, 21-Now period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
bars_total_new = pd.read_csv(r'Input_Data_Quality_Anonymized/Bars/21-Now/total.csv')
bars_ratio_new = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Bars/21-Now/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
bars_ratio_malus_new = 1 - bars_ratio_new

# Cluster view: remove C28, C40, C13 and C45 from both rows and columns.
bars_rmn_cluster = bars_ratio_malus_new.drop(
    index=["C28", "C40", "C13", "C45"],
    columns=["C28", "C40", "C13", "C45"],
)

In [None]:
# Bars, 90-10 period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
bars_total_old = pd.read_csv(r'Input_Data_Quality_Anonymized/Bars/90-10/total.csv')
bars_ratio_old = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Bars/90-10/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
bars_ratio_malus_old = 1 - bars_ratio_old

# Cluster view: only C28 and C13 are removed for this period
# (the C40 and C45 drops are disabled here).
bars_rmo_cluster = bars_ratio_malus_old.drop(
    index=["C28", "C13"],
    columns=["C28", "C13"],
)

### Multidimensional Scaling - Complete

#### Parameters

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_full = mds_model.fit(bars_ratio_malus_full.to_numpy())
bars_mds_coords_full = bars_mds_fit_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_full = bars_ratio_malus_full.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_full = [
    bars_total_full.loc[bars_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_full
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_new = mds_model.fit(bars_ratio_malus_new.to_numpy())
bars_mds_coords_new = bars_mds_fit_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_new = bars_ratio_malus_new.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_new = [
    bars_total_new.loc[bars_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_new
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_old = mds_model.fit(bars_ratio_malus_old.to_numpy())
bars_mds_coords_old = bars_mds_fit_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_old = bars_ratio_malus_old.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_old = [
    bars_total_old.loc[bars_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_old
]

#### Visualization

In [None]:
# Annotated bubble chart of the complete bars embeddings: one scatter layer
# per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (bars_mds_coords_full, bars_total_ordered_full, 'lightblue', 'blue'),
    (bars_mds_coords_new, bars_total_ordered_new, 'lightgreen', 'green'),
    (bars_mds_coords_old, bars_total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

bars_labels_full = bars_source_names_full
bars_labels_new = bars_source_names_new
bars_labels_old = bars_source_names_old
for names, coords in (
    (bars_labels_full, bars_mds_coords_full),
    (bars_labels_new, bars_mds_coords_new),
    (bars_labels_old, bars_mds_coords_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Bars Full View')
plt.show()

In [None]:
# Complete bars view without the per-point labels.
plt.figure(figsize=(15, 15))

layers = (
    (bars_mds_coords_full, bars_total_ordered_full, 'lightblue', 'blue'),
    (bars_mds_coords_new, bars_total_ordered_new, 'lightgreen', 'green'),
    (bars_mds_coords_old, bars_total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Bars Full View')
plt.show()

### Multidimensional Scaling - Cluster

#### Parameters

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_c_full = mds_model.fit(bars_rmf_cluster.to_numpy())
bars_mds_coords_c_full = bars_mds_fit_c_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_c_full = bars_rmf_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_c_full = [
    bars_total_full.loc[bars_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_c_full
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_c_new = mds_model.fit(bars_rmn_cluster.to_numpy())
bars_mds_coords_c_new = bars_mds_fit_c_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_c_new = bars_rmn_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_c_new = [
    bars_total_new.loc[bars_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_c_new
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
bars_mds_fit_c_old = mds_model.fit(bars_rmo_cluster.to_numpy())
bars_mds_coords_c_old = bars_mds_fit_c_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
bars_source_names_c_old = bars_rmo_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
bars_total_ordered_c_old = [
    bars_total_old.loc[bars_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in bars_source_names_c_old
]

In [None]:
# Inspect the raw 2-D MDS coordinates of the 90-10 bars cluster embedding.
print(bars_mds_coords_c_old)

#### Visualization

In [None]:
# Annotated bubble chart of the bars cluster embeddings: one scatter layer
# per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (bars_mds_coords_c_full, bars_total_ordered_c_full, 'lightblue', 'blue'),
    (bars_mds_coords_c_new, bars_total_ordered_c_new, 'lightgreen', 'green'),
    (bars_mds_coords_c_old, bars_total_ordered_c_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

bars_labels_c_full = bars_source_names_c_full
bars_labels_c_new = bars_source_names_c_new
bars_labels_c_old = bars_source_names_c_old
for names, coords in (
    (bars_labels_c_full, bars_mds_coords_c_full),
    (bars_labels_c_new, bars_mds_coords_c_new),
    (bars_labels_c_old, bars_mds_coords_c_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Bars Cluster View')
plt.show()

## Dividends

### CSV reader

In [None]:
# Dividends, full period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
div_total_full = pd.read_csv(r'Input_Data_Quality_Anonymized/Dividends/Full/total.csv')
div_ratio_full = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Dividends/Full/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
div_ratio_malus_full = 1 - div_ratio_full

# Cluster view: only C45 is removed for dividends
# (the C28, C40 and C13 drops are disabled here).
div_rmf_cluster = div_ratio_malus_full.drop(index='C45', columns='C45')

In [None]:
# Dividends, 21-Now period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
div_total_new = pd.read_csv(r'Input_Data_Quality_Anonymized/Dividends/21-Now/total.csv')
div_ratio_new = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Dividends/21-Now/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
div_ratio_malus_new = 1 - div_ratio_new

# Cluster view: only C45 is removed for dividends
# (the C28, C40 and C13 drops are disabled here).
div_rmn_cluster = div_ratio_malus_new.drop(index='C45', columns='C45')

In [None]:
# Dividends, 90-10 period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
div_total_old = pd.read_csv(r'Input_Data_Quality_Anonymized/Dividends/90-10/total.csv')
div_ratio_old = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Dividends/90-10/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
div_ratio_malus_old = 1 - div_ratio_old

# Cluster view: only C45 is removed for dividends
# (the C28, C40 and C13 drops are disabled here).
div_rmo_cluster = div_ratio_malus_old.drop(index='C45', columns='C45')

### Multidimensional Scaling - Complete

#### Parameters

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_full = mds_model.fit(div_ratio_malus_full.to_numpy())
div_mds_coords_full = div_mds_fit_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_full = div_ratio_malus_full.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_full = [
    div_total_full.loc[div_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_full
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_new = mds_model.fit(div_ratio_malus_new.to_numpy())
div_mds_coords_new = div_mds_fit_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_new = div_ratio_malus_new.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_new = [
    div_total_new.loc[div_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_new
]

In [None]:
# Embed the providers in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_old = mds_model.fit(div_ratio_malus_old.to_numpy())
div_mds_coords_old = div_mds_fit_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_old = div_ratio_malus_old.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_old = [
    div_total_old.loc[div_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_old
]

#### Visualization

In [None]:
# Annotated bubble chart of the complete dividends embeddings: one scatter
# layer per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (div_mds_coords_full, div_total_ordered_full, 'lightblue', 'blue'),
    (div_mds_coords_new, div_total_ordered_new, 'lightgreen', 'green'),
    (div_mds_coords_old, div_total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

div_labels_full = div_source_names_full
div_labels_new = div_source_names_new
div_labels_old = div_source_names_old
for names, coords in (
    (div_labels_full, div_mds_coords_full),
    (div_labels_new, div_mds_coords_new),
    (div_labels_old, div_mds_coords_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Dividends Full View')
plt.show()

In [None]:
# Complete dividends view without the per-point labels.
plt.figure(figsize=(15, 15))

layers = (
    (div_mds_coords_full, div_total_ordered_full, 'lightblue', 'blue'),
    (div_mds_coords_new, div_total_ordered_new, 'lightgreen', 'green'),
    (div_mds_coords_old, div_total_ordered_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Dividends Full View')
plt.show()

### Multidimensional Scaling - Cluster

#### Parameters

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_c_full = mds_model.fit(div_rmf_cluster.to_numpy())
div_mds_coords_c_full = div_mds_fit_c_full.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_c_full = div_rmf_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_c_full = [
    div_total_full.loc[div_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_c_full
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_c_new = mds_model.fit(div_rmn_cluster.to_numpy())
div_mds_coords_c_new = div_mds_fit_c_new.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_c_new = div_rmn_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_c_new = [
    div_total_new.loc[div_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_c_new
]

In [None]:
# Embed the cluster subset in 2-D from the precomputed dissimilarities.
# fit() returns the fitted estimator and stores the coordinates in
# embedding_, so the optimisation runs once; calling fit_transform()
# afterwards would repeat the identical (seeded) computation.
div_mds_fit_c_old = mds_model.fit(div_rmo_cluster.to_numpy())
div_mds_coords_c_old = div_mds_fit_c_old.embedding_

# Column labels name the providers; assumed to be in the same order as the
# matrix rows (and therefore the embedding rows).
div_source_names_c_old = div_rmo_cluster.columns.tolist()

# Marker sizes: for each provider, the matching 'Percentage' entries of the
# totals table scaled by 1000 (each element is the selected Series).
div_total_ordered_c_old = [
    div_total_old.loc[div_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in div_source_names_c_old
]

In [None]:
# Inspect the raw 2-D MDS coordinates of the 90-10 dividends cluster embedding.
print(div_mds_coords_c_old)

#### Visualization

In [None]:
# Annotated bubble chart of the dividends cluster embeddings: one scatter
# layer per period, marker sizes from the totals, every point labelled.
plt.figure(figsize=(15, 15))

layers = (
    (div_mds_coords_c_full, div_total_ordered_c_full, 'lightblue', 'blue'),
    (div_mds_coords_c_new, div_total_ordered_c_new, 'lightgreen', 'green'),
    (div_mds_coords_c_old, div_total_ordered_c_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

div_labels_c_full = div_source_names_c_full
div_labels_c_new = div_source_names_c_new
div_labels_c_old = div_source_names_c_old
for names, coords in (
    (div_labels_c_full, div_mds_coords_c_full),
    (div_labels_c_new, div_mds_coords_c_new),
    (div_labels_c_old, div_mds_coords_c_old),
):
    for label, x, y in zip(names, coords[:, 0], coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Dividends Cluster View')
plt.show()

In [None]:
# Dividends cluster view without the per-point labels.
plt.figure(figsize=(15, 15))

layers = (
    (div_mds_coords_c_full, div_total_ordered_c_full, 'lightblue', 'blue'),
    (div_mds_coords_c_new, div_total_ordered_c_new, 'lightgreen', 'green'),
    (div_mds_coords_c_old, div_total_ordered_c_old, 'yellow', 'orange'),
)
for coords, sizes, face, edge in layers:
    plt.scatter(coords[:, 0], coords[:, 1], sizes,
                facecolors=face, edgecolors=edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
# Legend entries follow the order in which the scatter layers were drawn.
plt.legend(["Full", "New", "Old"])
plt.title('Dividends Cluster View')
plt.show()

## Split

### CSV reader

In [None]:
# Splits, full period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
split_total_full = pd.read_csv(r'Input_Data_Quality_Anonymized/Splits/Full/total.csv')
split_ratio_full = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Splits/Full/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
split_ratio_malus_full = 1 - split_ratio_full

# Cluster view: C28 and C45 are removed from both rows and columns
# (the C40 and C13 drops are disabled here).
split_rmf_cluster = split_ratio_malus_full.drop(
    index=["C28", "C45"],
    columns=["C28", "C45"],
)

In [None]:
# Splits, 21-Now period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
split_total_new = pd.read_csv(r'Input_Data_Quality_Anonymized/Splits/21-Now/total.csv')
split_ratio_new = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Splits/21-Now/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
split_ratio_malus_new = 1 - split_ratio_new

# Cluster view: C28 and C45 are removed from both rows and columns
# (the C40 and C13 drops are disabled here).
split_rmn_cluster = split_ratio_malus_new.drop(
    index=["C28", "C45"],
    columns=["C28", "C45"],
)

In [None]:
# Splits, 90-10 period: the totals table and the square ratio matrix
# (first CSV column is used as the row index).
split_total_old = pd.read_csv(r'Input_Data_Quality_Anonymized/Splits/90-10/total.csv')
split_ratio_old = pd.read_csv(
    r'Input_Data_Quality_Anonymized/Splits/90-10/ratio.csv',
    index_col=0,
)

# Complement of the ratio; this is what gets passed to MDS as dissimilarity.
split_ratio_malus_old = 1 - split_ratio_old

# Cluster view: C28 and C45 are removed from both rows and columns
# (the C40 and C13 drops are disabled here).
split_rmo_cluster = split_ratio_malus_old.drop(
    index=["C28", "C45"],
    columns=["C28", "C45"],
)

### Multidimensional Scaling - Complete

#### Parameters

In [None]:
# Embed the complete split dissimilarity matrix with the shared MDS model.
# fit_transform() already fits the model, so the former separate fit() call
# only repeated the same optimisation; mds_model uses a fixed random_state,
# so the resulting coordinates are the same.
split_mds_coords_full = mds_model.fit_transform(split_ratio_malus_full.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_full = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_full = split_ratio_malus_full.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_full = [
    split_total_full.loc[split_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_full
]

In [None]:
# Embed the "new" split dissimilarity matrix with the shared MDS model.
# fit_transform() already fits the model, so the former separate fit() call
# only repeated the same optimisation; mds_model uses a fixed random_state,
# so the resulting coordinates are the same.
split_mds_coords_new = mds_model.fit_transform(split_ratio_malus_new.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_new = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_new = split_ratio_malus_new.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_new = [
    split_total_new.loc[split_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_new
]

In [None]:
# Embed the "old" split dissimilarity matrix with the shared MDS model.
# fit_transform() already fits the model, so the former separate fit() call
# only repeated the same optimisation; mds_model uses a fixed random_state,
# so the resulting coordinates are the same.
split_mds_coords_old = mds_model.fit_transform(split_ratio_malus_old.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_old = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_old = split_ratio_malus_old.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_old = [
    split_total_old.loc[split_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_old
]

#### Visualization

In [None]:
plt.figure(figsize=(15, 15))

# (coordinates, marker sizes, fill colour, outline colour) for each period,
# drawn in the order the legend below lists them: Full, New, Old.
_split_series = (
    (split_mds_coords_full, split_total_ordered_full, 'lightblue', 'blue'),
    (split_mds_coords_new, split_total_ordered_new, 'lightgreen', 'green'),
    (split_mds_coords_old, split_total_ordered_old, 'yellow', 'orange'),
)
for _coords, _sizes, _face, _edge in _split_series:
    plt.scatter(_coords[:, 0], _coords[:, 1], s=_sizes,
                facecolors=_face, edgecolors=_edge)

# Write each source name next to its point, one period after the other.
split_labels_full = split_source_names_full
split_labels_new = split_source_names_new
split_labels_old = split_source_names_old
_split_annotations = (
    (split_labels_full, split_mds_coords_full),
    (split_labels_new, split_mds_coords_new),
    (split_labels_old, split_mds_coords_old),
)
for _labels, _coords in _split_annotations:
    for label, x, y in zip(_labels, _coords[:, 0], _coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.legend(["Full", "New", "Old"])
plt.title('Splits Full View')
plt.show()

In [None]:
plt.figure(figsize=(15, 15))

# Same scatter as the annotated view, without the per-point name labels.
# (coordinates, marker sizes, fill colour, outline colour) for each period,
# drawn in the order the legend below lists them: Full, New, Old.
_split_series = (
    (split_mds_coords_full, split_total_ordered_full, 'lightblue', 'blue'),
    (split_mds_coords_new, split_total_ordered_new, 'lightgreen', 'green'),
    (split_mds_coords_old, split_total_ordered_old, 'yellow', 'orange'),
)
for _coords, _sizes, _face, _edge in _split_series:
    plt.scatter(_coords[:, 0], _coords[:, 1], s=_sizes,
                facecolors=_face, edgecolors=_edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.legend(["Full", "New", "Old"])
plt.title('Splits Full View')
plt.show()

### Multidimensional Scaling - Cluster

#### Parameters

In [None]:
# Embed the cluster subset of the full-period split dissimilarities with the
# shared MDS model. fit_transform() already fits the model, so the former
# separate fit() call only repeated the same optimisation; mds_model uses a
# fixed random_state, so the resulting coordinates are the same.
split_mds_coords_c_full = mds_model.fit_transform(split_rmf_cluster.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_c_full = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_c_full = split_rmf_cluster.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_c_full = [
    split_total_full.loc[split_total_full['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_c_full
]

In [None]:
# Embed the cluster subset of the "new" split dissimilarities with the
# shared MDS model. fit_transform() already fits the model, so the former
# separate fit() call only repeated the same optimisation; mds_model uses a
# fixed random_state, so the resulting coordinates are the same.
split_mds_coords_c_new = mds_model.fit_transform(split_rmn_cluster.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_c_new = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_c_new = split_rmn_cluster.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_c_new = [
    split_total_new.loc[split_total_new['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_c_new
]

In [None]:
# Embed the cluster subset of the "old" split dissimilarities with the
# shared MDS model. fit_transform() already fits the model, so the former
# separate fit() call only repeated the same optimisation; mds_model uses a
# fixed random_state, so the resulting coordinates are the same.
split_mds_coords_c_old = mds_model.fit_transform(split_rmo_cluster.to_numpy())
# fit() returns the estimator itself, so this name stays an alias of the
# shared mds_model (kept for any cell that still refers to it).
split_mds_fit_c_old = mds_model

# Source names in matrix column order (same order as the coordinate rows).
split_source_names_c_old = split_rmo_cluster.columns.tolist()

# One entry per source, in column order: the 'Percentage' selection of the
# totals table for that provider, scaled by 1000. Each entry is a pandas
# Series (the filtered rows), not a scalar.
split_total_ordered_c_old = [
    split_total_old.loc[split_total_old['Provider'] == ticker, 'Percentage'] * 1000
    for ticker in split_source_names_c_old
]

In [None]:
# Inspection only: dump the cluster-subset MDS coordinates of the "old" period.
print(split_mds_coords_c_old)

#### Visualization

In [None]:
plt.figure(figsize=(15, 15))

# (coordinates, marker sizes, fill colour, outline colour) for each period,
# drawn in the order the legend below lists them: Full, New, Old.
_split_c_series = (
    (split_mds_coords_c_full, split_total_ordered_c_full, 'lightblue', 'blue'),
    (split_mds_coords_c_new, split_total_ordered_c_new, 'lightgreen', 'green'),
    (split_mds_coords_c_old, split_total_ordered_c_old, 'yellow', 'orange'),
)
for _coords, _sizes, _face, _edge in _split_c_series:
    plt.scatter(_coords[:, 0], _coords[:, 1], s=_sizes,
                facecolors=_face, edgecolors=_edge)

# Write each source name next to its point, one period after the other.
split_labels_c_full = split_source_names_c_full
split_labels_c_new = split_source_names_c_new
split_labels_c_old = split_source_names_c_old
_split_c_annotations = (
    (split_labels_c_full, split_mds_coords_c_full),
    (split_labels_c_new, split_mds_coords_c_new),
    (split_labels_c_old, split_mds_coords_c_old),
)
for _labels, _coords in _split_c_annotations:
    for label, x, y in zip(_labels, _coords[:, 0], _coords[:, 1]):
        plt.annotate(label, (x, y), xycoords='data')

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.legend(["Full", "New", "Old"])
plt.title('Splits Cluster View')
plt.show()

In [None]:
plt.figure(figsize=(15, 15))

# Cluster-subset scatter without per-point name labels.
# (coordinates, marker sizes, fill colour, outline colour) for each period,
# drawn in the order the legend below lists them: Full, New, Old.
_split_c_series = (
    (split_mds_coords_c_full, split_total_ordered_c_full, 'lightblue', 'blue'),
    (split_mds_coords_c_new, split_total_ordered_c_new, 'lightgreen', 'green'),
    (split_mds_coords_c_old, split_total_ordered_c_old, 'yellow', 'orange'),
)
for _coords, _sizes, _face, _edge in _split_c_series:
    plt.scatter(_coords[:, 0], _coords[:, 1], s=_sizes,
                facecolors=_face, edgecolors=_edge)

plt.xlabel('First Dimension')
plt.ylabel('Second Dimension')
plt.legend(["Full", "New", "Old"])
# Title now reads "Splits", matching the other split plot titles
# (it previously read 'Split Cluster View').
plt.title('Splits Cluster View')
plt.show()

## Export

In [None]:
# Inspection only: first component of the second row of mds_coords_full
# (presumably the daily-prices embedding; it is defined in an earlier cell).
mds_coords_full[1][0]

In [None]:
# Inspection only: list the source names used for the dividends export below.
print(div_source_names_full)

### Daily Prices

In [None]:
# One scenario: Euclidean distance of each source's MDS point from the origin.
dailyprices_full = {
    name: sqrt(point[0]**2 + point[1]**2)
    for point, name in zip(mds_coords_full, source_names_full)
}

# One row per source name, single column named 'DP Full'.
dp_full = (
    pd.DataFrame.from_dict(dailyprices_full, orient='index')
    .rename(columns={0: 'DP Full'})
)

In [None]:
# Euclidean distance of each source's MDS point from the origin ("new" period).
dailyprices_new = {
    name: sqrt(point[0]**2 + point[1]**2)
    for point, name in zip(mds_coords_new, source_names_new)
}

# One row per source name, single column named 'DP New'.
dp_new = (
    pd.DataFrame.from_dict(dailyprices_new, orient='index')
    .rename(columns={0: 'DP New'})
)

In [None]:
# Euclidean distance of each source's MDS point from the origin ("old" period).
dailyprices_old = {
    name: sqrt(point[0]**2 + point[1]**2)
    for point, name in zip(mds_coords_old, source_names_old)
}

# One row per source name, single column named 'DP Old'.
dp_old = (
    pd.DataFrame.from_dict(dailyprices_old, orient='index')
    .rename(columns={0: 'DP Old'})
)

In [None]:
# Put the three daily-prices columns side by side, aligned on the source name index.
dp = pd.concat(
    [dp_full, dp_new, dp_old],
    axis='columns',
)

### Bars

In [None]:
# One scenario: Euclidean distance of each source's MDS point from the origin,
# stored as one row per source name in a single column named 'Bars Full'.
bars_full = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(bars_mds_coords_full, bars_source_names_full)
    },
    orient='index',
).rename(columns={0: 'Bars Full'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("new" period),
# stored as one row per source name in a single column named 'Bars New'.
bars_new = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(bars_mds_coords_new, bars_source_names_new)
    },
    orient='index',
).rename(columns={0: 'Bars New'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("old" period),
# stored as one row per source name in a single column named 'Bars Old'.
bars_old = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(bars_mds_coords_old, bars_source_names_old)
    },
    orient='index',
).rename(columns={0: 'Bars Old'})

In [None]:
# Put the three bars columns side by side, aligned on the source name index.
bars = pd.concat(
    [bars_full, bars_new, bars_old],
    axis='columns',
)

### Dividends

In [None]:
# One scenario: Euclidean distance of each source's MDS point from the origin,
# stored as one row per source name in a single column named 'Dividends Full'.
div_full = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(div_mds_coords_full, div_source_names_full)
    },
    orient='index',
).rename(columns={0: 'Dividends Full'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("new" period),
# stored as one row per source name in a single column named 'Dividends New'.
div_new = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(div_mds_coords_new, div_source_names_new)
    },
    orient='index',
).rename(columns={0: 'Dividends New'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("old" period),
# stored as one row per source name in a single column named 'Dividends Old'.
div_old = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(div_mds_coords_old, div_source_names_old)
    },
    orient='index',
).rename(columns={0: 'Dividends Old'})

In [None]:
# Put the three dividends columns side by side, aligned on the source name index.
div = pd.concat(
    [div_full, div_new, div_old],
    axis='columns',
)

### Splits

In [None]:
# One scenario: Euclidean distance of each source's MDS point from the origin,
# stored as one row per source name in a single column named 'Splits Full'.
split_full = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(split_mds_coords_full, split_source_names_full)
    },
    orient='index',
).rename(columns={0: 'Splits Full'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("new" period),
# stored as one row per source name in a single column named 'Splits New'.
split_new = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(split_mds_coords_new, split_source_names_new)
    },
    orient='index',
).rename(columns={0: 'Splits New'})

In [None]:
# Euclidean distance of each source's MDS point from the origin ("old" period),
# stored as one row per source name in a single column named 'Splits Old'.
split_old = pd.DataFrame.from_dict(
    {
        name: sqrt(point[0]**2 + point[1]**2)
        for point, name in zip(split_mds_coords_old, split_source_names_old)
    },
    orient='index',
).rename(columns={0: 'Splits Old'})

In [None]:
# Put the three splits columns side by side, aligned on the source name index.
split = pd.concat(
    [split_full, split_new, split_old],
    axis='columns',
)

### Merge

In [None]:
# Merge the per-category tables (daily prices, bars, dividends, splits) into
# one wide table, aligned on the source name index.
data_quality = pd.concat(
    [dp, bars, div, split],
    axis='columns',
)

In [None]:
# Bare expression: shows the merged table as the notebook cell's output.
data_quality

In [None]:
# Write the merged table to CSV; index=True keeps the row labels as the first column.
data_quality.to_csv('data_quality_export_anonymized.csv', index=True)