# Real world Data

## Data source 

Basically, the dataset tree is the following:

- Region
    - Country
        - Industry
            - Client 
                - Products
                
## Variable names

|  Variable name        |  Type   |
| --------------------- | ------- |
| ID                    |  object |
| Country_name          | object  |
| country_ref           | object  |
| Region                | object  |
| industry              | object  |
| A_reference           | float64 |
| A_market              | float64 |
| B_reference           | float64 |
| B_market              | float64 |
| C_reference           | float64 |
| C_market              | float64 |
| D_reference           | float64 |
| D_market              | float64 |
| E_reference           | float64 |
| E_market              | float64 |
| Total_reference       | float64 |
| Total_market          | float64 |

Note that the market figures include the reference figures.

In [8]:
import pandas as pd
#import plotly.express as px
import numpy as np

import seaborn as sns

# Green sequential colormap, passed as `cmap` to the `background_gradient` tables below.
cm = sns.light_palette("green", as_cmap=True)

In [7]:
# Read the customer-level dataset and preview its last five rows.
df_final = pd.read_csv("dataPandasClass.gz")
df_final.tail(n=5)

Unnamed: 0,ID,Country_name,country_ref,Region,industry,A_reference,A_market,B_reference,B_market,C_reference,C_market,D_reference,D_market,E_reference,E_market,Total_reference,Total_market
17838,Customer 26,France,FRA,Northern Europe,E,0.0,0.021079,0.0,0.022423,0.014117,0.011362,0.0,0.0,0.0,0.006941,0.014117,0.026173
17839,Customer 179,France,FRA,Northern Europe,F,0.0,0.030429,0.0,0.031626,0.019732,0.007305,0.0,0.0,0.0,0.01012,0.019732,0.036985
17840,Customer 466,France,FRA,Northern Europe,F,0.0,0.019698,0.0,0.019538,0.0,0.006372,0.009543,0.0,0.0,0.005355,0.009543,0.023379
17841,Customer 17,France,FRA,Northern Europe,G,0.0,0.035382,0.0,0.034993,0.0,0.011441,0.0,0.0,0.007481,0.009465,0.007481,0.041933
17842,Customer 60,France,FRA,Northern Europe,H,0.0,0.01936,0.0,0.019297,0.0,0.009319,0.0,0.0,0.004824,0.006501,0.004824,0.023175


## Objectives

In this notebook, you need to:

- Print the country name by region
- Compute the market size for the reference by:
    - Product
    - Industry
    - Region
- Industry Market size by product
- Region Market size by product
- Region Market size by industry

For each of the computations above, compute the market share of the `reference`.

- Remove Region-industry with no revenue
- Compute Euclidean Distance by Region-industry-product
- Filter the data by Region-industry-product

**Market share: revenue to potential**

$$\text{Market share} = \sum  \text{Revenue Reference} / \sum \text{Total Market} $$

## Parameters

In [13]:
# Column names of the per-product figures: one "<product>_reference" and one
# "<product>_market" column for each of the products A to E, in the same order.
reference = ["{}_reference".format(product) for product in "ABCDE"]
market = ["{}_market".format(product) for product in "ABCDE"]

In [15]:
# Preview the first rows of the reference columns only.
df_final[reference].head()

Unnamed: 0,A_reference,B_reference,C_reference,D_reference,E_reference
0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0


## List country by region

In [10]:
# One groupby pass collects the distinct countries of every region, instead of
# re-filtering the whole frame once per region. Regions come out sorted by name;
# rows without a region are left out of the listing.
countries_by_region = df_final.groupby('Region')['Country_name'].unique()

for c, countries in countries_by_region.items():
    print('\n Region name {} \n {}'.format(c, countries))


 Region name Americas 
 ['United States' 'Mexico' 'Venezuela' 'Brazil' 'Argentina' 'Canada' 'Peru'
 'Colombia' 'Uruguay' 'Chile' 'Ecuador' 'Paraguay' 'Trinidad and Tobago'
 'Bahamas' 'Jamaica' 'Panama' 'Armenia' 'Bolivia' 'Martinique' 'Guatemala'
 'Honduras']

 Region name Asia Pacific 
 ['Australia' 'new Zealand' 'Japan' 'China' 'India' 'Indonesia' 'Pakistan'
 'Singapore' 'Vietnam' 'Philippines' 'Hong Kong' 'Korea' 'Thailand'
 'Malaysia' 'Taiwan' 'Kazakhstan' 'Bangladesh' 'Macao' 'Sri Lanka'
 'Kiribati']

 Region name Central Europe 
 ['Germany' 'Czech' 'Hungary' 'Poland' 'Slovak Republic' 'Romania'
 'Lithuania' 'Estonia' 'Switzerland' 'Austria' 'The Russian Federation'
 'Slovenia' 'Bulgaria' 'Ukraine' 'Croatia' 'Latvia' 'Montenegro'
 'Federal Republic of Yugoslavia' 'Belarus' 'Georgia' 'Moldova']

 Region name Middle East & Africa 
 ['Ghana' 'South Africa' 'Tanzania' 'Kenya' 'Egypt' 'Mozambique' 'Congo'
 'Qatar' 'Lesotho' 'Saudi Arabia' 'Kuwait' 'Nigeria' 'Costa Rica' 'Mali'
 'Unite

# Top level

## Product Market size

In [29]:
# Grand total of every product/origin column (e.g. "A_reference" -> one number).
product_totals = (
    df_final[reference + market]
    .sum(axis=0)
    .reset_index(name='sum_product')
)

# Split "A_reference" into product "A" and origin "_reference", then place the
# two origins side by side for each product.
product_wide = (
    product_totals
    .assign(origin=lambda x: x['index'].str.extract(r"(_[^_]+$)"),
            product=lambda x: x['index'].str.extract(r"(^[^_]+(?=_))"))
    .drop(columns='index')
    .sort_values(by="product")
    .set_index(['product', 'origin'])
    .unstack(level=1)
)

# Second column divided by first column: reference total over market total.
reference_share = product_wide.iloc[:, 1] / product_wide.iloc[:, 0]

(
    product_wide
    .assign(market_size=reference_share)
    .sort_values(by='market_size')
    .style
    .bar(subset=['market_size'], color='#d65f5f')
    .format("{:.2%}", subset=['market_size'])
)

Unnamed: 0_level_0,sum_product,sum_product,market_size
origin,_market,_reference,Unnamed: 3_level_1
product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
B,199.103,2.15314,1.08%
E,100.21,3.46428,3.46%
C,140.024,6.42883,4.59%
A,303.405,29.2571,9.64%
D,116.118,58.3886,50.28%


## Industry Market size

In [30]:
# Reference revenue and total market summed per industry.
industry_revenue = (
    df_final
    .groupby('industry')[['Total_reference', 'Total_market']]
    .sum()
)

# Market share of the reference = revenue / potential.
industry_revenue['revenue_to_potential'] = (
    industry_revenue['Total_reference'] / industry_revenue['Total_market']
)

(
    industry_revenue
    .sort_values(by='revenue_to_potential')
    .style
    .bar(subset=['revenue_to_potential'], color='#d65f5f')
    .format("{:.2%}", subset=['revenue_to_potential'])
)

Unnamed: 0_level_0,Total_reference,Total_market,revenue_to_potential
industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P,0.0,0.039431,0.00%
R,0.143765,2.72027,5.28%
J,4.27868,28.1474,15.20%
D,1.8764,11.846,15.84%
Q,0.0476717,0.297216,16.04%
E,12.5538,75.4826,16.63%
B,4.35818,24.7767,17.59%
C,6.19059,34.6936,17.84%
I,1.98264,11.0331,17.97%
A,11.7772,64.3312,18.31%


## Region Market size

In [31]:
# Reference revenue and total market summed per region.
region_revenue = (
    df_final
    .groupby('Region')[['Total_reference', 'Total_market']]
    .sum()
)

# Market share of the reference = revenue / potential.
region_revenue['revenue_to_potential'] = (
    region_revenue['Total_reference'] / region_revenue['Total_market']
)

(
    region_revenue
    .sort_values(by='revenue_to_potential')
    .style
    .format({'revenue_to_potential': "{:.2%}"})
    .bar(subset=['revenue_to_potential'], color='#d65f5f')
)

Unnamed: 0_level_0,Total_reference,Total_market,revenue_to_potential
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Americas,0.637751,48.3736,1.32%
Asia Pacific,3.52503,68.3147,5.16%
Northern Europe,11.8137,99.4687,11.88%
Central Europe,18.3242,80.4991,22.76%
Middle East & Africa,10.7538,28.8115,37.32%
Southern Europe,22.4471,52.6215,42.66%


# Next level

## Industry Market size by product

In [None]:
# Sum every product column per industry. `GroupBy.sum` takes no `axis` argument
# (recent pandas raises TypeError when one is passed), so it is called bare.
industry_product_totals = (
    df_final[['industry'] + reference + market]
    .groupby('industry')
    .sum()
)

# Market share per product = reference revenue / total market. Each reference
# column is paired with its market column by name, so the ratio does not depend
# on row order. `reference` and `market` list the products in the same order.
industry_product_share = pd.DataFrame(
    {
        ref_col.split('_')[0]: (industry_product_totals[ref_col]
                                / industry_product_totals[mkt_col])
        for ref_col, mkt_col in zip(reference, market)
    }
).rename_axis(columns='product')

(
    industry_product_share
    # NOTE(review): a product is dropped entirely when its share is undefined
    # (0 / 0) for at least one industry -- confirm this is the intended display.
    .dropna(axis='columns')
    # Order the industries by their summed shares; the helper column is not shown.
    .assign(total_industry=lambda x: x.sum(axis=1))
    .sort_values(by='total_industry')
    .drop(columns='total_industry')
    .style
    .format("{:.2%}")
    .background_gradient(cmap=cm)
)

## Region Market size by industry

In [None]:
# Total market per industry, summed over all rows of that industry.
ttpotential_i = df_final.groupby('industry')[['Total_market']].sum()
ttpotential_i.head()

In [None]:
# Reference revenue summed per (Region, industry) pair.
revenue_ci = df_final.groupby(['Region', 'industry'])[['Total_reference']].sum()

In [None]:
# Distinct region names, in order of first appearance in the data.
region = df_final['Region'].unique()

In [None]:
# Reference revenue of each (Region, industry) pair divided by the industry's
# total market over all regions. Because the denominator is industry-wide, the
# regions of one row add up to that industry's overall share (`total_industry`).
region_industry_share = (
    revenue_ci.reset_index()
    .merge(ttpotential_i.reset_index(), on="industry", how="inner")
    .assign(
        revenue_to_potential=lambda x: x["Total_reference"] / x["Total_market"]
    )
    # `pivot` only reads the three named columns and rebuilds the ordering, so no
    # sorting or column dropping is needed beforehand.
    .pivot(index="industry", columns="Region", values="revenue_to_potential")
)

(
    region_industry_share
    .assign(total_industry=lambda x: x.sum(axis=1))
    .sort_values(by='total_industry')
    # Region/industry pairs absent from the data are shown as 0.
    .fillna(0)
    .style
    # Colour the region columns only; they are taken from the pivot itself.
    .background_gradient(cmap=cm,
                         subset=list(region_industry_share.columns))
    .bar(subset=['total_industry'], color='#d65f5f')
)

## Correlation

In [None]:
# Correlate the numeric columns only. The identifier and label columns hold
# strings, and `DataFrame.corr()` raises on them in pandas versions that no
# longer drop non-numeric columns silently.
df_final.select_dtypes(include='number').corr()

## Euclidean Distance

## Definition Euclidean Distance

According to the Euclidean distance formula, the distance between two points $\mathbf{p} = (p_1, \dots, p_n)$ and $\mathbf{q} = (q_1, \dots, q_n)$ is given by:

$$\begin{aligned} d(\mathbf{p}, \mathbf{q})=d(\mathbf{q}, \mathbf{p}) &=\sqrt{\left(q_{1}-p_{1}\right)^{2}+\left(q_{2}-p_{2}\right)^{2}+\cdots+\left(q_{n}-p_{n}\right)^{2}} \\ &=\sqrt{\sum_{i=1}^{n}\left(q_{i}-p_{i}\right)^{2}} \end{aligned}$$

### Example 

Calculate the distance between two arrays.

We want to subtract each column of the first array from the matching column of every row of the second array, square the differences, sum them and take the square root.

Take an example, we want to compute the Euclidean distance of this first array:
- [7.2, 3.6, 5.1, 2.5]: It has 4 columns and one row

We will compute the Euclidean distance to every row of the second array. The second array also has four columns. We subtract each column of the second array from the corresponding column of the first array.

Example with the first row of the second array:
- [5.1, 3.5, 1.4, 0.2]

The Euclidean distance is $$\sqrt{((7.2 - 5.1)^2  + (3.6 - 3.5)^2 + (5.1 -1.4)^2 + (2.5 - 0.2)^2)}$$

We repeat for all the rows

## Data standardization

The value of distance measures is intimately related to the scale on which measurements are made.

In [None]:
def euclideanDistance(vect_1, vect_2):
    """
    Return the Euclidean distance between two vectors.

    The element-wise differences are squared, summed, and the square root of
    that sum is returned.
    """
    delta = vect_1 - vect_2
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

Remove the region-industry pairs with no revenue: they would inflate the distance.

In [None]:
# Sum the reference revenue of every (Region, industry) pair and keep the pairs
# whose total is exactly zero.
revenue_by_pair = (
    df_final.groupby(['Region', 'industry'])[['Total_reference']].sum()
)
indu_norevenue = revenue_by_pair[revenue_by_pair['Total_reference'] == 0]
indu_norevenue

In [None]:
# Zero-revenue (Region, industry) pairs as a plain list of tuples; the second
# line shows the region of the first pair.
list_ = indu_norevenue.index.tolist()
list_[0][0]

In [None]:
# Collect the row labels of every customer belonging to a zero-revenue
# (Region, industry) pair.
index_no = []
for region_name, industry_code in list_:
    in_pair = ((df_final['Region'] == region_name)
               & (df_final['industry'] == industry_code))
    index_no.extend(df_final.index[in_pair])

In [None]:
# Keep only the rows whose (Region, industry) pair has some reference revenue.
# The pair total is broadcast back onto every row with `transform`, so this cell
# needs nothing but `df_final` and does not rescan the frame once per pair.
pair_revenue = (
    df_final.groupby(['Region', 'industry'])['Total_reference'].transform('sum')
)
X = df_final[pair_revenue != 0]

In [None]:
def compute_potential(df, normalized=True, begin=0, end=10):
    """
    Compute the Euclidean distance and the whitespace of every
    Region-industry pair, and plot the whitespace of a slice of them.

    Relies on the notebook-level names `reference`, `market` and
    `euclideanDistance`.

    Parameters
    ----------
    df : DataFrame
        Data holding `Region`, `industry` and every column listed in
        `reference` and `market`.
    normalized : bool, default True
        If True, the distance is computed on the z-scored sums (`norm_pot`,
        `norm_vod`); otherwise on the raw sums.
    begin, end : int
        Positional slice, in increasing-distance order, of the
        Region-industry pairs drawn in the bar chart.

    Returns
    -------
    region_indu_prod : DataFrame
        Indexed by (Region, industry, product). Columns: the two per-origin
        sums, `norm_pot`, `norm_vod` and `distance` (the pair's distance,
        repeated on each of its product rows).
    df_Epotential : DataFrame
        Indexed by (Region, industry), with `distance` and `whitespace`
        (first sum column minus second sum column), sorted by distance.
    """
    pair_levels = ['Region', 'industry']

    # Reshape the data: one row per (Region, industry, product) and one column
    # per origin. `GroupBy.sum` has no `axis` parameter (recent pandas raises
    # TypeError when one is passed), so it is called without it.
    # Column 0 is expected to be `_market` and column 1 `_reference`, since
    # `unstack` sorts the origin labels -- TODO confirm on your pandas version.
    region_indu_prod = (df[pair_levels + reference + market]
                        .groupby(pair_levels)
                        .sum()
                        .stack()
                        .reset_index(name='sum')
                        .assign(origin=lambda x: x['level_2'].str.extract(
                            r"(_[^_]+$)"),
                                product=lambda x: x['level_2'].str.extract(
                                    r"(^[^_]+(?=_))"))
                        .drop(columns='level_2')
                        .set_index(pair_levels + ['product', 'origin'])
                        .unstack()
                        # z-scores over all rows of each sum column
                        .assign(norm_pot=lambda x: (x.iloc[:, 0] -
                                                    x.iloc[:, 0].mean()) /
                                x.iloc[:, 0].std(),
                                norm_vod=lambda x: (
                                    x.iloc[:, 1] - x.iloc[:, 1].mean()) /
                                x.iloc[:, 1].std(),
                                )
                        )

    # Positions of the two columns compared by the distance:
    # the z-scored pair (2, 3) or the raw sums (0, 1).
    if normalized:
        x_ = 2
        y_ = 3
    else:
        x_ = 0
        y_ = 1

    # One distance per (Region, industry) pair, computed across its products.
    pair_distance = region_indu_prod.groupby(level=pair_levels).apply(
        lambda x: euclideanDistance(x.iloc[:, x_], x.iloc[:, y_])
    )

    # Repeat the pair's distance on each of its product rows. The lookup is
    # explicit because `pair_distance` has one index level fewer than the
    # frame, so plain column assignment cannot be trusted to align them.
    row_pairs = region_indu_prod.index.droplevel('product')
    region_indu_prod["distance"] = pair_distance.reindex(row_pairs).values

    # Whitespace of a pair: first sum column minus second sum column, each
    # summed over the pair's products.
    whitespace_ri = (
        region_indu_prod
        .iloc[:, :2]
        .groupby(level=pair_levels)
        .sum()
        .assign(whitespace=lambda x: x.iloc[:, 0] - x.iloc[:, 1])
        .iloc[:, -1]
    )

    # Use the per-pair distances directly. De-duplicating the product rows by
    # value would merge two different pairs that share the same distance.
    sort_distance = pair_distance.rename(region_indu_prod.columns[-1])

    df_Epotential = pd.concat(
        [sort_distance, whitespace_ri], axis=1).sort_values(by='distance')

    # Single-line literal: a backslash continuation inside the string would
    # embed the next line's indentation in the title.
    title = 'Top {} markets: Whitespace according to Euclidean Distance'.format(
        end - begin)

    df_Epotential.iloc[begin:end, 1].plot.barh(
        title=title)

    return region_indu_prod, df_Epotential

In [None]:
def filter_distance(df, country, industry):
    """
    Select the rows of a multi-indexed dataframe whose first two index
    levels equal `country` and `industry`.

    In this notebook the first index level holds the region, so `country`
    receives a region name.
    """
    pair_key = (country, industry)
    return df.loc[pair_key, :]

In [None]:
# Raw (non-normalised) distances; the chart shows the pairs ranked 80 to 91.
# `df_Epotential` holds the function's two return values as a tuple.
df_Epotential = compute_potential(df=X, normalized=False, begin=80, end=92)

In [None]:
# Product-level detail (first element of the tuple) for industry E in the Americas.
filter_distance(df=df_Epotential[0], country='Americas', industry='E')