In [1]:
import sys
sys.path.append('../')

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils.plot_utils import export_plot


In [2]:
# Load the raw relationship and entity tables (parquet files under ../data/raw).
# Paths are relative to the notebook's working directory.
competitors = pd.read_parquet('../data/raw/competitors.parquet.gzip')
parents = pd.read_parquet('../data/raw/parents.parquet.gzip')
partners = pd.read_parquet('../data/raw/partners.parquet.gzip')
suppliers = pd.read_parquet('../data/raw/suppliers.parquet.gzip')
info = pd.read_parquet('../data/raw/entity_info.parquet.gzip')
address = pd.read_parquet('../data/raw/addresses.parquet.gzip')
# Processed company list (CSV); its `id` column is compared against `info` below.
companies = pd.read_csv('../data/processed/companies.csv')

## Sector plot

Using the `info` table to count the companies in each sector is misleading: the `info` table contains more companies than the Neo4j database, which causes the discrepancy between the two counts. For the thesis, the sector counts should therefore be taken from Neo4j Cypher queries rather than from the `info` table.

In [3]:
# Count the non-null entries of every column per sector, then order the sectors
# by the `code` count (largest first) so the pie slices follow that order.
df = (
    info
    .groupby(['sector'])
    .count()
    .reset_index()
    .sort_values(by='code', ascending=False)
)
# Optional grouping of small sectors (disabled):
# df.loc[df['code']<7000, 'sector']= 'Other'
fig = px.pie(
    df,
    names='sector',
    values='code',
    title='Sector distribution',
    height=500,
    width=700,
)
fig.show()
# Export is disabled:
# export_plot(fig, 'sector_distribution', ['html','png','svg'])

In [4]:
# Check whether any company id is listed under more than one industry.
#
# All rows of an id are compared (not just the first two), every offending id
# is reported exactly once, and an id whose industry is missing on all of its
# rows is not flagged (a plain `!=` comparison would treat NaN != NaN as a
# mismatch). `dropna=False` keeps "missing vs. known industry" as a difference.
industries_per_id = info.groupby('id')['industry'].nunique(dropna=False)
for company_id in industries_per_id.index[industries_per_id > 1]:
    print(info.loc[info.id == company_id].reset_index())

In [5]:
# Distinct ids found in the processed companies file.
a = set(companies['id'])

In [6]:
# Display the `id` column of the info table (pandas truncates the output).
info['id']

0          93528
1          72520
2          64667
3         278227
4          72518
           ...  
236940    162438
236941      9627
236942     40206
236943     12488
236944     91784
Name: id, Length: 236945, dtype: int64

In [7]:
# Distinct ids found in the info table.
b = set(info.id)

In [8]:
# Number of ids that occur in the info table but not in the companies file.
len(b.difference(a))

80233

In [9]:
# Unique ids of the info rows whose sector is 'Finance'.
unique_ids_by_sector = info.groupby(['sector'])['id'].unique()
t = unique_ids_by_sector['Finance']

## Bar plot of start years in supplier relationships

In [10]:
# Parse the start dates (a no-op when the column is already datetime).
suppliers['start_date'] = pd.to_datetime(suppliers['start_date'])

# Number of supplier relations per start year, ordered by year.
# The output columns are named explicitly: the year goes to 'index' and the
# count to 'start_date', which is what the plotting cell expects. Relying on
# the default names of `value_counts().reset_index()` breaks on pandas >= 2.0,
# where they became 'start_date' / 'count'.
supplier_relation_start_counts = (
    suppliers['start_date'].dt.year
    .value_counts()
    .rename_axis('index')
    .reset_index(name='start_date')
    .sort_values('index')
)

In [11]:
# Bar chart of supplier relations per start year:
# the 'index' column holds the year, the 'start_date' column holds the count.
fig = px.bar(
    supplier_relation_start_counts,
    x='index',
    y='start_date',
    height=500,
    width=700,
).update_layout(
    title='Number of supplier relations per year',
    xaxis_title='Year',
    yaxis_title='Number of supplier relations',
)

fig.show()

# Export is disabled:
# export_plot(fig, 'supplier_relation_start_counts', ['html', 'png', 'svg'])

## Bar plot of start years in partner relationships

In [12]:
# Parse the start dates (a no-op when the column is already datetime).
partners['start_date'] = pd.to_datetime(partners['start_date'])

# Number of partner relations per start year, ordered by year.
# The output columns are named explicitly: the year goes to 'index' and the
# count to 'start_date', which is what the plotting cell expects. Relying on
# the default names of `value_counts().reset_index()` breaks on pandas >= 2.0,
# where they became 'start_date' / 'count'.
partner_relation_start_counts = (
    partners['start_date'].dt.year
    .value_counts()
    .rename_axis('index')
    .reset_index(name='start_date')
    .sort_values('index')
)

In [13]:
# Bar chart of partner relations per start year:
# the 'index' column holds the year, the 'start_date' column holds the count.
fig = px.bar(
    partner_relation_start_counts,
    x='index',
    y='start_date',
    height=500,
    width=700,
).update_layout(
    title='Number of partner relations per year',
    xaxis_title='Year',
    yaxis_title='Number of partner relations',
)

fig.show()

# Export is disabled:
# export_plot(fig, 'partner_relation_start_counts', ['html', 'png', 'svg'])

## Bar plot of start years in competitor relationships

In [14]:
# Parse the start dates (a no-op when the column is already datetime).
competitors['start_date'] = pd.to_datetime(competitors['start_date'])

# Number of competitor relations per start year, ordered by year.
# The output columns are named explicitly: the year goes to 'index' and the
# count to 'start_date', which is what the plotting cell expects. Relying on
# the default names of `value_counts().reset_index()` breaks on pandas >= 2.0,
# where they became 'start_date' / 'count'.
competitor_relation_start_counts = (
    competitors['start_date'].dt.year
    .value_counts()
    .rename_axis('index')
    .reset_index(name='start_date')
    .sort_values('index')
)

In [15]:
# Bar chart of competitor relations per start year:
# the 'index' column holds the year, the 'start_date' column holds the count.
fig = px.bar(
    competitor_relation_start_counts,
    x='index',
    y='start_date',
    height=500,
    width=700,
)

fig.update_layout(
    # Title wording matches the y-axis label and the supplier/partner plots
    # ("competitor relations", not "competitors relations").
    title='Number of competitor relations per year',
    xaxis_title='Year',
    yaxis_title='Number of competitor relations',
)

fig.show()
# Export is disabled:
# export_plot(fig, 'competitor_relation_start_counts', ['html', 'png', 'svg'])

## Plot of number of companies in countries

In [16]:
# Per-country count of non-null values in every address column, with the
# countries ordered by their `name` count (largest first).
country_counts = address.groupby(['country']).count()
availability = country_counts.sort_values(by='name', ascending=False).reset_index()

In [17]:
# Display the per-country counts (pandas truncates the middle rows).
availability

Unnamed: 0,country,id,name,city_state_postal,location_street1
0,USA,48514,48514,43972,42607
1,CHN,19894,19894,14127,12776
2,JPN,11781,11781,8659,8318
3,GBR,11312,11312,10177,9979
4,DEU,7357,7357,6580,6496
...,...,...,...,...,...
204,GUF,1,1,1,1
205,ERI,1,1,0,0
206,ASM,1,1,0,0
207,VAT,1,1,1,0


In [19]:
# World map coloured by how many address rows exist for each country.
fig = px.choropleth(
    availability,
    locations="country",  # country codes taken from the address table
    color="name",  # per-country count of non-null `name` values
    hover_name="country",  # shown as the hover title
    color_continuous_scale=px.colors.sequential.Plasma,
    height=500,
    width=1000,
)

fig.update_layout(
    title_text='Availability of company data based on country',
    coloraxis_colorbar=dict(title="Number of companies per country", len=1),
)
# Interactive display is disabled; the figure is written to disk instead.
# fig.show()

export_plot(fig, 'availability', ['html', 'png', 'svg'])

'Exported'