In [1]:
# Companies & CEOs stats
#
# 5 July 2024
# marieke.van.erp@dh.huc.knaw.nl

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.patches as mpatches

In [2]:
# Read the tab-separated acronym table, discard the exported 'Unnamed: 0'
# column, and remove rows that are identical in every remaining column.
company_data = (
    pd.read_csv('company_acronyms_dates.tsv', sep='\t')
    .drop(columns='Unnamed: 0')
    .drop_duplicates()
)

# On every row, record how many rows share that row's 'company.value'.
company_data['occurrence_companies'] = (
    company_data.groupby('company.value')['company.value'].transform('size')
)
company_data

Unnamed: 0,acronymId.value,starttime.value,company.value,companyLabel.xml:lang,companyLabel.value,acronym.xml:lang,acronym.value,endtime.value,occurrence_companies
0,http://www.wikidata.org/entity/statement/Q1147...,2021-01-01T00:00:00Z,http://www.wikidata.org/entity/Q114794,en,Radiotelevisione svizzera di lingua italiana,de,rti,,4
1,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07T00:00:00Z,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,en,UBS Bank JSC,2003-12-23T00:00:00Z,9
2,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07T00:00:00Z,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,be,ЗАТ УБС Банк,2003-12-23T00:00:00Z,9
3,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07T00:00:00Z,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,ru,ЗАО УБС Банк,2003-12-23T00:00:00Z,9
4,http://www.wikidata.org/entity/statement/Q2793...,2016-01-01T00:00:00Z,http://www.wikidata.org/entity/Q2793376,en,Iptor Supply Chain Systems,en,Iptor,,2
...,...,...,...,...,...,...,...,...,...
1716,http://www.wikidata.org/entity/statement/Q9811...,,http://www.wikidata.org/entity/Q98115297,en,M5Stack,en,M5Stack,,1
1718,http://www.wikidata.org/entity/statement/Q9994...,,http://www.wikidata.org/entity/Q99941065,en,Niederschlesische Bergbau,de,Niebag,,1
1719,http://www.wikidata.org/entity/statement/Q1022...,,http://www.wikidata.org/entity/Q102286981,en,Brain Labs,en,X23,,1
1721,http://www.wikidata.org/entity/statement/Q1054...,,http://www.wikidata.org/entity/Q105467105,en,XXXLutz Lakberendezési Kft.,hu,XXXLutz Kft.,,1


In [3]:
# Parse the start/end timestamp columns into datetimes; values that cannot be
# parsed (or are missing) become NaT because of errors='coerce'.
company_data = company_data.assign(**{
    'starttime.value': pd.to_datetime(
        company_data['starttime.value'], yearfirst=True, errors='coerce'
    ),
    'endtime.value': pd.to_datetime(
        company_data['endtime.value'], yearfirst=True, errors='coerce'
    ),
})

# Keep a date-only companion column next to each parsed timestamp.
company_data['starttime_date'] = company_data['starttime.value'].dt.date
company_data['endtime_date'] = company_data['endtime.value'].dt.date

company_data.head()

Unnamed: 0,acronymId.value,starttime.value,company.value,companyLabel.xml:lang,companyLabel.value,acronym.xml:lang,acronym.value,endtime.value,occurrence_companies,starttime_date,endtime_date
0,http://www.wikidata.org/entity/statement/Q1147...,2021-01-01 00:00:00+00:00,http://www.wikidata.org/entity/Q114794,en,Radiotelevisione svizzera di lingua italiana,de,rti,NaT,4,2021-01-01,NaT
1,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07 00:00:00+00:00,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,en,UBS Bank JSC,2003-12-23 00:00:00+00:00,9,2002-10-07,2003-12-23
2,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07 00:00:00+00:00,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,be,ЗАТ УБС Банк,2003-12-23 00:00:00+00:00,9,2002-10-07,2003-12-23
3,http://www.wikidata.org/entity/statement/Q3919...,2002-10-07 00:00:00+00:00,http://www.wikidata.org/entity/Q3919666,en,BSB Bank,ru,ЗАО УБС Банк,2003-12-23 00:00:00+00:00,9,2002-10-07,2003-12-23
4,http://www.wikidata.org/entity/statement/Q2793...,2016-01-01 00:00:00+00:00,http://www.wikidata.org/entity/Q2793376,en,Iptor Supply Chain Systems,en,Iptor,NaT,2,2016-01-01,NaT


In [4]:
# Dataset overview:
# count of distinct non-null values in each column
company_data.nunique(axis=0, dropna=True)

acronymId.value          1449
starttime.value            43
company.value            1247
companyLabel.xml:lang       1
companyLabel.value       1247
acronym.xml:lang           55
acronym.value            1398
endtime.value              32
occurrence_companies        7
starttime_date             43
endtime_date               32
dtype: int64

In [5]:
# Sample standard deviation (ddof=1) of the per-row company occurrence count.
# NOTE(review): computed over rows, so a company with k rows contributes k
# times -- confirm that row weighting is intended.
company_data.loc[:, 'occurrence_companies'].std(ddof=1)

1.008901582505205

In [6]:
# Smallest number of rows recorded for any single company.
company_data.loc[:, 'occurrence_companies'].min()

1

In [7]:
# Largest number of rows recorded for any single company.
company_data.loc[:, 'occurrence_companies'].max()

9

In [8]:
# Mean of the per-row company occurrence count.
# NOTE(review): this averages over rows, not over companies, so a company
# with k rows is counted k times -- confirm that row weighting is intended.
company_data.loc[:, 'occurrence_companies'].mean()

1.4154589371980677

In [9]:
# Median of the per-row company occurrence count.
# NOTE(review): taken over rows, so companies with several rows are
# represented several times -- confirm that row weighting is intended.
company_data.loc[:, 'occurrence_companies'].median()

1.0