In [52]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number, datediff, current_date, round


# Local-mode session ('local[2]' master) for running the notebook without the cluster.
# NOTE(review): getOrCreate() hands back an already-running session if one exists,
# so only one of this cell and the cluster-session cell below should be executed
# in a given kernel - confirm which one is intended.
spark = (
    SparkSession.builder
    .master('local[2]')
    .getOrCreate()
)

In [1]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number, datediff, current_date, round


# Session against the standalone master at 192.168.68.68:7077, registered as
# 'CompaniesHouse', asking for one executor with 2 cores and 2G of memory.
# NOTE(review): getOrCreate() reuses an already-running session, so if the
# local[2] cell above was executed first in this kernel the master set here
# may not take effect - confirm only one session cell is run.
spark = (
    SparkSession.builder
    .appName('CompaniesHouse')
    .master('spark://192.168.68.68:7077')
    .config('spark.executor.memory', '2G')
    .config('spark.executor.cores', '2')
    .config('spark.executor.instances', 1)
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
def _read_semicolon_csv(path):
    """Read a ';'-delimited CSV file that has a header row into a DataFrame.

    No schema inference is requested, so numeric columns are cast explicitly
    by the callers below.
    """
    return spark.read.option('delimiter', ';').option('header', 'true').csv(path)


# Companies: the two numeric measures used later are cast to double.
companies = (
    _read_semicolon_csv('./corporate_uk/companies.csv')
    .withColumn('current_assets', col('current_assets').cast('double'))
    .withColumn(
        'average_number_employees_during_period',
        col('average_number_employees_during_period').cast('double'),
    )
)

# SIC codes per company.
sic_codes = _read_semicolon_csv('./corporate_uk/companies_sic_codes.csv')

# Filings: page counts are cast to int so they can be summed.
filings = (
    _read_semicolon_csv('./corporate_uk/filings.csv')
    .withColumn('pages', col('pages').cast('int'))
)

# Officers and owners per company.
owners = _read_semicolon_csv('./corporate_uk/officers_and_owners.csv')

# Print every schema, in the same order the frames were loaded.
for frame in (companies, sic_codes, filings, owners):
    frame.printSchema()

root
 |-- company_number: string (nullable = true)
 |-- company_type: string (nullable = true)
 |-- office_address: string (nullable = true)
 |-- incorporation_date: string (nullable = true)
 |-- jurisdiction: string (nullable = true)
 |-- company_status: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- sic_codes: string (nullable = true)
 |-- date_of_cessation: string (nullable = true)
 |-- next_accounts_overdue: string (nullable = true)
 |-- confirmation_statement_overdue: string (nullable = true)
 |-- owners: string (nullable = true)
 |-- officers: string (nullable = true)
 |-- average_number_employees_during_period: double (nullable = true)
 |-- current_assets: double (nullable = true)
 |-- last_accounts_period_end: string (nullable = true)
 |-- company_url: string (nullable = true)

root
 |-- company_number: string (nullable = true)
 |-- sic_code: string (nullable = true)
 |-- sic_description: string (nullable = 

In [6]:
# Number of companies in each status, largest group first.
(
    companies
    .groupBy('company_status')
    .count()
    .orderBy(col('count').desc())
    .show(truncate=False)
)

+----------------------+-------+
|company_status        |count  |
+----------------------+-------+
|Active                |4744110|
|Dissolved             |534818 |
|Liquidation           |110597 |
|Registered            |29219  |
|In Administration     |3612   |
|Receiver Action       |2683   |
|Open                  |1075   |
|Removed               |1035   |
|Converted / Closed    |591    |
|Voluntary Arrangement |513    |
|Insolvency Proceedings|345    |
|Closed                |302    |
+----------------------+-------+



In [4]:
# Most common company type in each jurisdiction.
#
# Company types are ranked inside every jurisdiction by their company count and
# only the rank-1 row is kept. row_number() always yields exactly one rank-1 row
# per partition, but with `count` as the only sort key the winner among tied
# counts is arbitrary. The secondary sort on company_type makes the choice
# reproducible from run to run.
w = (
    Window
    .partitionBy('jurisdiction')
    .orderBy(col('count').desc(), col('company_type').asc())
)

(
    companies
    .groupBy('company_type', 'jurisdiction')
    .count()
    .withColumn('rnk', row_number().over(w))
    .where(col('rnk') == 1)
    .drop('rnk')  # helper column, not part of the displayed result
    .show(truncate=False)
)

+------------------------+------------------+-------+
|company_type            |jurisdiction      |count  |
+------------------------+------------------+-------+
|UK establishment company|NULL              |1377   |
|Private limited company |England/Wales     |4719455|
|Private limited company |European Union    |1      |
|Overseas entity         |Foreign (Non E.U.)|1      |
|Private limited company |Northern Ireland  |79908  |
|Private limited company |Scotland          |248863 |
|Overseas entity         |United Kingdom    |30252  |
|Private limited company |Wales             |32037  |
+------------------------+------------------+-------+



In [None]:
# Ten most frequent (sic_section, sic_division) pairs among the rows of sic_codes.
(
    sic_codes
    .groupBy('sic_section', 'sic_division')
    .count()
    .orderBy('count', ascending=False)
    .limit(10)
    .show(truncate=False)
)

+--------------------------------------------------------------------+---------------------------------------------------------------------------+------+
|sic_section                                                         |sic_division                                                               |count |
+--------------------------------------------------------------------+---------------------------------------------------------------------------+------+
|Real estate activities                                              |Real estate activities                                                     |831302|
|Wholesale and retail trade; repair of motor vehicles and motorcycles|Retail trade, except of motor vehicles and motorcycles                     |690760|
|Construction                                                        |Specialised construction activities                                        |413583|
|Information and communication                                       |Comput

In [None]:
# Average company age (in years) per company type, youngest type first.
# Age is the number of days between incorporation_date and today, divided by
# 365.25 to convert days into approximate years.
# `round` here is pyspark.sql.functions.round (see the import cell), not the builtin.
company_ages = companies.select(
    'company_type',
    (datediff(current_date(), col('incorporation_date')) / 365.25).alias('age'),
)

(
    company_ages
    .groupBy('company_type')
    .avg('age')
    .select('company_type', round(col('avg(age)'), 2).alias('average_age'))
    .orderBy('average_age')
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------+-----------+
|company_type                                                                          |average_age|
+--------------------------------------------------------------------------------------+-----------+
|Overseas entity                                                                       |1.8        |
|Further education or sixth form college corporation                                   |4.14       |
|Protected cell company                                                                |4.75       |
|Scottish qualifying partnership                                                       |5.51       |
|UK establishment company                                                              |7.11       |
|Limited liability partnership                                                         |9.24       |
|Private limited company                                                               |9.3

In [8]:
# Distinct names of companies with overdue accounts or an overdue confirmation statement.
# NOTE(review): both flags are string columns in the printed schema, so the
# comparison with True relies on Spark coercing the text to a boolean - confirm
# the stored values are 'true'/'false'-like.
accounts_overdue = col('next_accounts_overdue') == True
confirmation_overdue = col('confirmation_statement_overdue') == True

(
    companies
    .where(accounts_overdue | confirmation_overdue)
    .select('company_name')
    .distinct()
    .show()
)

+--------------------+
|        company_name|
+--------------------+
|  Jensen Telecom Ltd|
| Barrow Boys Limited|
|Seagull Internati...|
|  Deborah Coates Ltd|
|2mc Holdings Limited|
|Sunny Sandcastles...|
|     Uneed2relax Ltd|
|Serhat Wholesale ...|
|         Demodem Ltd|
|Florin Contracts ...|
|Cullingford Carpe...|
|Woodstock Product...|
|Waves Hotels Grou...|
|  Shollwaves Limited|
|Industrialize Uk Ltd|
|Whole Recruitment...|
|  Bertea Ipswich Ltd|
|Focus Industry An...|
|Mns Retail Interi...|
|       Sdhac Limited|
+--------------------+
only showing top 20 rows



In [None]:
# Share of companies per account type, as a percentage rounded to 3 decimals.
# NOTE(review): the rendered output lists a literal 'Null' string and true NULLs
# as two separate account types - consider normalising them before grouping.
total_companies = companies.count()  # denominator for the percentage column

(
    companies
    .groupBy('account_type')
    .count()
    .withColumn('percentage', round(100 * col('count') / total_companies, 3))
    .orderBy(col('percentage').desc())
    .show(truncate=False)
)

+---------------------------+-------+----------+
|account_type               |count  |percentage|
+---------------------------+-------+----------+
|Micro Entity               |1843452|33.956    |
|Total Exemption Full       |1346638|24.805    |
|Dormant                    |742074 |13.669    |
|Null                       |717664 |13.219    |
|NULL                       |365376 |6.73      |
|Unaudited abridged         |175651 |3.235     |
|Full                       |96017  |1.769     |
|Small                      |72571  |1.337     |
|Group                      |27025  |0.498     |
|Audit Exemption Subsidiary |25788  |0.475     |
|Total Exemption Small      |12266  |0.226     |
|Medium                     |1451   |0.027     |
|Audited abridged           |1437   |0.026     |
|No accounts type available |928    |0.017     |
|Filing Exemption Subsidiary|367    |0.007     |
|Interim                    |162    |0.003     |
|Partial Exemption          |29     |0.001     |
|Initial            

In [29]:
# Average current assets per SIC description, highest average first.
# The left join keeps companies without any SIC row; those end up in the
# group whose sic_description is NULL.
assets_by_sic = (
    companies
    .join(sic_codes, on='company_number', how='left')
    .select(
        companies.company_number,
        sic_codes.sic_code,
        sic_codes.sic_description,
        companies.current_assets,
    )
)

(
    assets_by_sic
    .groupBy('sic_description')
    .avg('current_assets')
    .orderBy(col('avg(current_assets)').desc())
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------------------+--------------------+
|sic_description                                                                                   |avg(current_assets) |
+--------------------------------------------------------------------------------------------------+--------------------+
|Banks                                                                                             |1.6769202507543105E8|
|Financial leasing                                                                                 |1.1421620102743903E8|
|Security dealing on own account                                                                   |3.2738479891555935E7|
|Activities of mortgage finance companies                                                          |2.472619615787771E7 |
|Treatment and disposal of hazardous waste                                                         |1.643513333625731E7 |
|Factoring              

In [39]:
# Average headcount per company type, shown as a whole number.
# The cast to int drops the fractional part of the average instead of rounding it.
(
    companies
    .groupBy('company_type')
    .avg('average_number_employees_during_period')
    .select(
        'company_type',
        col('avg(average_number_employees_during_period)').cast('int').alias('avg_employees'),
    )
    .show(truncate=False)
)

+--------------------------------------------------------------------------------------+-------------+
|company_type                                                                          |avg_employees|
+--------------------------------------------------------------------------------------+-------------+
|Converted / closed                                                                    |2            |
|Private unlimited company                                                             |11           |
|Public limited company                                                                |49           |
|Private limited company                                                               |206          |
|Private limited by guarantee without share capital                                    |251          |
|Private Limited Company by guarantee without share capital, use of 'Limited' exemption|21           |
|Private unlimited company without share capital                         

In [11]:
# Count the rows flagged as owners per company_country, most frequent first.
(
    owners
    .where(col('is_owner') == True)
    .groupBy('company_country')
    .count()
    .orderBy(col('count').desc())
    .show()
)


+--------------------+-------+
|     company_country|  count|
+--------------------+-------+
|                NULL|3908976|
|      united kingdom| 173234|
|            scotland|   3912|
|              jersey|   1891|
|       united states|   1799|
|virgin islands br...|   1548|
|            guernsey|   1452|
|         isle of man|    774|
|           hong kong|    717|
|               wales|    715|
|         netherlands|    694|
|    northern ireland|    694|
|          seychelles|    633|
|             ireland|    584|
|               china|    557|
|           gibraltar|    413|
|             germany|    391|
|    marshall islands|    369|
|              france|    359|
|              norway|    345|
+--------------------+-------+
only showing top 20 rows



In [17]:
# How often each officer role appears for each company type, most frequent first.
# The left join keeps owner/officer rows even when no matching company exists.
# NOTE(review): the saved output below is truncated although truncate=False is
# set - it was probably produced by an earlier version of this cell; re-run to refresh.
owners_with_company = owners.join(
    companies,
    on=owners.company_number == companies.company_number,
    how='left',
)

(
    owners_with_company
    .groupBy(companies.company_type, owners.officer_role)
    .count()
    .orderBy(col('count').desc())
    .show(truncate=False)
)

+--------------------+--------------------+-------+
|        company_type|        officer_role|  count|
+--------------------+--------------------+-------+
|Private limited c...|            Director|7305028|
|Private limited c...|                NULL|1283809|
|Private limited c...|           Secretary|1014104|
|Private limited b...|            Director| 376686|
|Private Limited C...|            Director| 138571|
|Limited liability...|LLP Designated Me...| 115722|
|Private limited b...|           Secretary|  53513|
|     Overseas entity|                NULL|  40102|
|    Overseas company|            Director|  34238|
|Limited liability...|          LLP Member|  25192|
|Private limited b...|                NULL|  24435|
| Limited partnership|                NULL|  21099|
|Public limited co...|            Director|  15570|
|Private Limited C...|           Secretary|  13437|
|Limited liability...|                NULL|  11552|
|    Overseas company|           Secretary|   8749|
|Private unl

In [21]:
# Number of rows in `owners` per company, labelled with the company name,
# largest count first. Companies missing from `companies` get a NULL name
# because of the left join.
records_per_company = owners.groupBy('company_number').count()

(
    records_per_company
    .join(companies, on='company_number', how='left')
    .select('company_name', 'count')
    .orderBy(col('count').desc())
    .show(truncate=False)
)

+--------------------------------------------+-----+
|company_name                                |count|
+--------------------------------------------+-----+
|Stadco Limited                              |5    |
|Newcastle Upon Tyne Law Society             |5    |
|Peter Maurice Music Company Limited(the)    |5    |
|Public Picture Gallery Fund Birmingham      |5    |
|Sheffield Masonic Hall Company Limited.(the)|5    |
|Kentstone Properties Limited                |5    |
|Bristol Street Fourth Investments Limited   |5    |
|Ashford Cattle Market Company Limited(the)  |5    |
|Albert Bowling Club Limited                 |5    |
|Craven Dunnill & Co.limited                 |5    |
|Chemetall Limited                           |5    |
|Gresham House Limited                       |5    |
|Iowa Land Company Limited                   |5    |
|Methodist Insurance Public Limited Company  |5    |
|Eastern Counties Laundries Limited          |5    |
|Lloyds Bank Plc                             |

In [26]:
# Most frequent (category, description) combinations among all filings.
(
    filings
    .groupBy('category', 'description')
    .count()
    .orderBy(col('count').desc())
    .show(truncate=False)
)

+-------------+---------------------------------------------------+-------+
|category     |description                                        |count  |
+-------------+---------------------------------------------------+-------+
|incorporation|Incorporation                                      |1484882|
|gazette      |First Gazette notice for compulsory strike-off     |402049 |
|gazette      |Compulsory strike-off action has been discontinued |326098 |
|accounts     |Micro company accounts made up to 2023-03-31       |280815 |
|accounts     |Micro company accounts made up to 2022-03-31       |259387 |
|accounts     |Total exemption full accounts made up to 2023-03-31|255485 |
|accounts     |Total exemption full accounts made up to 2022-03-31|237444 |
|dissolution  |Compulsory strike-off action has been suspended    |173529 |
|accounts     |Total exemption full accounts made up to 2022-12-31|151673 |
|accounts     |Micro company accounts made up to 2022-12-31       |145807 |
|accounts   

In [34]:
# Total filed pages per company, labelled with the company name, largest first.
# Filings whose company_number is absent from `companies` show a NULL name
# because of the left join.
pages_per_company = filings.groupBy('company_number').sum('pages')

(
    pages_per_company
    .join(companies, on='company_number', how='left')
    .select('company_name', 'sum(pages)')
    .orderBy(col('sum(pages)').desc())
    .show(truncate=False)
)

+------------------------------------------------------------+----------+
|company_name                                                |sum(pages)|
+------------------------------------------------------------+----------+
|Mediobanca - Banca Di Credito Finanziario Societa Per Azioni|4737      |
|Intesa Sanpaolo S.p.a                                       |4406      |
|Obrascon Huarte Lain S.a.                                   |3105      |
|NULL                                                        |2104      |
|Bayerische Landesbank                                       |2030      |
|International Consolidated Airlines Group S.a.              |1950      |
|Banco Comercial Portugues S.a.                              |1707      |
|Banco Santander S.a.                                        |1679      |
|Spw Investments Limited                                     |1619      |
|Scottishpower Overseas Holdings Limited                     |1610      |
|NULL                                 

In [55]:
# Fraction of all filings whose date equals the company's incorporation date.
filings_with_company = (
    filings
    .join(companies, on='company_number', how='left')
    .select('company_name', 'date', 'incorporation_date')
)

# 1 when the filing date matches the incorporation date, 0 when it differs,
# NULL when either side is missing (NULLs are skipped by the sum).
same_day_flag = (col('date') == col('incorporation_date')).cast('int')

# Global aggregate: a single row holding the number of same-day filings.
incorporation_files = (
    filings_with_company
    .withColumn('flag_incorporation_file', same_day_flag)
    .groupBy()
    .sum('flag_incorporation_file')
    .collect()[0][0]
)

# Last expression of the cell, so the ratio is what gets displayed.
incorporation_files / filings.count()

0.07061160440373027

In [56]:
# Stop the Spark session at the end of the notebook.
spark.stop()