In [1]:
import os
import re
import html
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from matching_helpers import normaliser,\
                             parse_datetime,\
                             read_raw_data,\
                             prepare_nhsspend,\
                             prepare_contractsfinder,\
                             org_counter,\
                             strip_html,\
                             unique_agg

tqdm.pandas()
df_centgov = read_raw_data('centgov_data.csv', 'Contracts Finder')
df_nhs = read_raw_data('nhsspend_data.csv', 'NHSSpend')
df_contracts = read_raw_data('contractsfinder_data.csv', 'Contracts Finder')

In [2]:
df_centgov = df_centgov[['data_source', 'amount', 'supplier', 'date', 'dept']]

In [3]:
df_nhs = prepare_nhsspend(df_nhs)

In [4]:
df_contracts = prepare_contractsfinder(df_contracts)

In [5]:
df_comb = pd.concat([df_nhs, df_centgov, df_contracts], ignore_index=True)
df_comb = df_comb.rename({'supplier': 'SUPPLIER'}, axis=1)
df_comb['SUPPLIER'] = df_comb['SUPPLIER'].str.upper().str.strip()

df_comb['SUPPLIER_NUMERIC'] = pd.to_numeric(df_comb['SUPPLIER'], errors='coerce')
print(f'Dropping {len(df_comb[df_comb["SUPPLIER"].isnull()])} rows of data because of numeric suppliers')
df_comb = df_comb[df_comb['SUPPLIER'].notnull()]
df_comb = df_comb[df_comb['SUPPLIER_NUMERIC'].isna()]
df_comb = df_comb.drop(columns='SUPPLIER_NUMERIC')

print(f'Dropping {len(df_comb[df_comb["SUPPLIER"].isnull()])} rows of data because of NaN suppliers')
df_comb = df_comb[df_comb['SUPPLIER'].notnull()]
df_comb['SUPPLIER'] = df_comb['SUPPLIER'].progress_apply(strip_html)
print(f'Dropping {len(df_comb[df_comb["SUPPLIER"].isnull()])} rows of data after html parsing')
df_comb = df_comb[df_comb['SUPPLIER'].notnull()]
df_comb['date'] = df_comb['date'].astype(str).str.split('T').str[0]
df_comb['date'] = pd.to_datetime(df_comb['date'],
                                 format='mixed',
                                 errors='coerce')
df_comb['date'] = df_comb['date'].map(lambda x: x.strftime('%d-%m-%Y') if pd.notnull(x) else np.nan)
print(f'Dropping {len(df_comb[df_comb["date"].isnull()])} rows of data due to NaN dates')
df_comb = df_comb[df_comb['date'].notnull()]
print(f'Dropping {len(df_comb[df_comb["amount"].isnull()])} rows of data due to NaN amounts')
df_comb = df_comb[df_comb['amount'].notnull()]
print(f'Dropping {len(df_comb[df_comb["dept"].isnull()])} rows of data due to NaN depts')
df_comb = df_comb[df_comb['dept'].notnull()]
df_comb['NORMALIZED_SUPPLIER'] = df_comb['SUPPLIER'].progress_apply(normaliser)

rows_to_drop = len(df_comb[
    (df_comb["SUPPLIER"].str.len() <= 3) |
    (df_comb["NORMALIZED_SUPPLIER"].str.len() <= 3)
])

# Print the message with the count of rows to be dropped
print(f'Dropping {rows_to_drop} rows of data due to supplier str len<=3')

df_comb = df_comb[
    (df_comb["SUPPLIER"].str.len() > 3) |
    (df_comb["NORMALIZED_SUPPLIER"].str.len() > 3)
]

df_comb[['SUPPLIER', 'ORG_COUNT']] = df_comb['SUPPLIER'].apply(lambda x: pd.Series(org_counter(x)))

all_rows = len(df_comb)

for supplier in ["SUCCESSFUL SUPPL",
                 "SEE ATTACH",
                 "REFER ATTACH",
                 "CONTRACT WAS AWARD",
                 "AWARDED SUPPLIERS",
                 "SUCCESSFUL SUPPLIER",
                 "PLEASE SEE",
                 'NAMED IND',
                 'REDACT',
                 "PLEASE REFER"]:
    df_comb = df_comb[~df_comb['SUPPLIER'].str.contains(supplier)]

print(f'Number of rows dropped due to redacted: {len(df_comb)-all_rows}')

print(f'Dropping {len(df_comb[df_comb["ORG_COUNT"]!=1])} where org_count !=1')
df_comb = df_comb[df_comb['ORG_COUNT']==1]

df_comb.to_csv(os.path.join(os.getcwd(),
                            '..',
                            'raw_data',
                            'merged_singlesuppliers_raw.csv')
              )

Dropping 926 rows of data because of numeric suppliers
Dropping 0 rows of data because of NaN suppliers


  0%|          | 0/9331286 [00:00<?, ?it/s]

Dropping 0 rows of data after html parsing
Dropping 14846 rows of data due to NaN dates
Dropping 0 rows of data due to NaN amounts
Dropping 7 rows of data due to NaN depts


  0%|          | 0/9316433 [00:00<?, ?it/s]

Dropping 2750 rows of data due to supplier str len<=3
Number of rows dropped due to redacted: -383
Dropping 69239 where org_count !=1


In [7]:
df_uniq = df_comb.pivot_table(index=['SUPPLIER'],
                              values=['date',
                                      'dept'],
                              aggfunc=unique_agg).reset_index()
df_sum = df_comb.groupby('SUPPLIER')['amount'].sum().reset_index()
df_counts = df_comb['SUPPLIER'].value_counts().reset_index()
df_uniq = pd.merge(df_uniq,
                   df_sum,
                   how='left',
                   left_on='SUPPLIER',
                   right_on='SUPPLIER'
                  )
df_uniq[['SUPPLIER', 'ORG_COUNT']] = df_uniq['SUPPLIER'].progress_apply(lambda x: pd.Series(org_counter(x)))
df_uniq['NORMALIZED_SUPPLIER'] = df_uniq['SUPPLIER'].progress_apply(normaliser)
df_uniq = pd.merge(df_uniq,
                   df_counts,
                   how='left',
                   left_on='SUPPLIER',
                   right_on='SUPPLIER'
                  )
df_uniq.sort_values(by=['amount'],
                    ascending=False)

df_uniq1 = df_nhs[['supplier',
                   'NHSSpend_CompanyName',
                   'NHSSpend_CompanyNumber',
                   'NHSSpend_CharityName',
                   'NHSSpend_CharityRegNo',
                   'NHSSpend_CharitySubNo',
                   'NHSSpend_CharityNameNo',
                   'NHSSpend_CharityName']].drop_duplicates()
df_uniq = pd.merge(df_uniq,
                   df_uniq1,
                   how='left',
                   left_on='SUPPLIER',
                   right_on='supplier'
                  )
df_uniq = df_uniq.rename({'count': 'PAYMENT_TOTAL_COUNT',
                          'amount': 'PAYMENT_TOTAL_AMOUNT'},
                         axis=1)
print(f'Dropping {len(df_uniq[df_uniq["ORG_COUNT"]!=1])} org_count !=1')
df_uniq = df_uniq[df_uniq['ORG_COUNT']==1]
df_uniq = df_uniq.drop(columns='supplier')

  0%|          | 0/213632 [00:00<?, ?it/s]

  0%|          | 0/213632 [00:00<?, ?it/s]

Dropping 0 org_count !=1


In [8]:
df_uniq = df_uniq.sort_values(by='PAYMENT_TOTAL_AMOUNT',
                              ascending=False)
df_uniq.to_csv(os.path.join(os.getcwd(),
                            '..',
                            'raw_data',
                            'merged_groupby_singlesuppliers_raw.csv'),
               index=False)

In [9]:
print(f'We are then left with {len(df_uniq)} rows of unique "single" suppliers')
print(f'We are then left with {len(df_comb)} rows of unique "single" payments')

We are then left with 213632 rows of unique "single" suppliers
We are then left with 9244594 rows of unique "single" payments
