In [51]:
import pandas as pd
import numpy as np

In [52]:
# Folder containing the CSV files; the trailing slash matters because file
# paths are built by plain string concatenation.
PATH = 'ubs-lauzhack-2024/'

# Read both training tables in one pass, in the order they are unpacked.
account_booking_train, external_parties_train = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('account_booking_train.csv', 'external_parties_train.csv')
)

In [53]:
# Preview the first five rows of the account-booking table.
account_booking_train.head(n=5)

Unnamed: 0,transaction_reference_id,debit_credit_indicator,account_id,transaction_amount,transaction_currency,transaction_date
0,0ace8fca6ada96883ef2e823b5dea26b,CREDIT,25110,5249.26,GBP,2023-05-15
1,d52c4f1a546f5d784ee46a8f347ad607,DEBIT,27293,4481.5,GBP,2023-02-27
2,dac45362e7471a7fa2726c81adae8534,DEBIT,23088,1347.27,GBP,2023-03-18
3,2ee574398cd6c4a7f3e111447141550e,DEBIT,21641,9276.56,GBP,2023-07-15
4,c5154ea99a0ff84ba8e72217d34d3397,CREDIT,24233,8002.28,GBP,2024-01-12


In [54]:
# Preview the first five rows of the external-parties table.
external_parties_train.head(n=5)

Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w. roberson jr.,rachel crossingapt.,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez-nelson co.,ports suite &,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,m.j. bytd iii,john lake s. glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658


In [55]:
# List the column labels available in the external-parties table.
external_parties_train.columns

Index(['transaction_reference_id', 'party_role', 'party_info_unstructured',
       'parsed_name', 'parsed_address_street_name',
       'parsed_address_street_number', 'parsed_address_unit',
       'parsed_address_postal_code', 'parsed_address_city',
       'parsed_address_state', 'parsed_address_country', 'party_iban',
       'party_phone', 'external_id'],
      dtype='object')

In [56]:
# Inspect the raw `party_info_unstructured` column (pandas truncates the display).
external_parties_train['party_info_unstructured']

0        mary mith 107 107 angela brooks n. thomasfurt ...
1                 yesneia kim north michael 93971 koribati
2        w. roberson jr. 41010 rachel crossingapt. 923 ...
3        azquez-nelson co. suarez ports suite & 024 bri...
4        m.j. bytd iii 856 john lake s. glenn cocos (ke...
                               ...                        
11059                 james alvarado jr. port james zambia
11060             marcnguyen 234 price meadows robertmouth
11061    joseph davis 14131 taylor villages apt. 764 s....
11062                         gonzalezltd 94129lozano cape
11063    simmons-conway 1241 smith prairie suite 369 no...
Name: party_info_unstructured, Length: 11064, dtype: object

# idea (1)
Phone numbers are just phone numbers! They have a bounded length (at most 15 characters, including the country prefix) and consist only of digits.

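So we clean the phone numbers: we drop the extension (everything from the `x` onward), remove every character that is not a digit (special characters and letters), and keep at most the first 15 digits.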

In [57]:
import re

In [58]:
def _digits_only_phone(phone_series):
    """Return the phone strings reduced to at most 15 leading digits.

    Steps, applied element-wise (missing values stay missing):
    1. cut everything from the first 'x' to the end of the string;
    2. delete every character that is not 0-9;
    3. keep only the first 15 remaining characters.
    """
    without_extension = phone_series.str.replace(r'x.*$', '', regex=True)
    digits = without_extension.str.replace(r'[^0-9]', '', regex=True)
    return digits.str.slice(stop=15)


external_parties_train['cleaned_party_number'] = _digits_only_phone(
    external_parties_train['party_phone']
)

# Show raw and cleaned numbers side by side for a quick sanity check.
external_parties_train[['party_phone', 'cleaned_party_number']].head()


Unnamed: 0,party_phone,cleaned_party_number
0,+1.815660-6791x8486,18156606791.0
1,0 (269)620-8734x2349,2696208734.0
2,,
3,,
4,(260)3371534,2603371534.0


# idea (2)
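Looking at the non-null values of `parsed_address_country`, we see that they contain useless parentheses and that the country names are often misspelled and repeated twice. The cleaning below removes parentheses, dots and digits and drops the repeated words; it does not correct the spelling.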

In [59]:
# function for duplicates
def clean_and_deduplicate_country(country_name):
    """Drop repeated words from a country string, keeping first-seen order.

    Parameters
    ----------
    country_name : str or missing value
        Raw country text, e.g. ``"bulgaria bulgaria"``.

    Returns
    -------
    str or missing value
        The whitespace-separated words of ``country_name`` with later
        repetitions removed, joined by single spaces. Values that
        ``pd.isna`` reports as missing are returned unchanged.
    """
    if pd.isna(country_name):
        return country_name
    # dict.fromkeys removes repeats while preserving first-occurrence order,
    # so multi-word names keep their natural reading: "united kingdom" stays
    # "united kingdom" instead of being alphabetised to "kingdom united".
    unique_words = dict.fromkeys(country_name.split())
    return ' '.join(unique_words)


In [60]:
# Build the cleaned country column in one chain: strip parentheses and dots,
# strip digit runs, then remove repeated words from each value.
external_parties_train['clean_parsed_address_country'] = (
    external_parties_train['parsed_address_country']
    .str.replace(r'[().]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .apply(clean_and_deduplicate_country)
)

In [61]:
# Compare the raw country column with its cleaned counterpart.
external_parties_train[['parsed_address_country','clean_parsed_address_country']]

Unnamed: 0,parsed_address_country,clean_parsed_address_country
0,bulgaria,bulgaria
1,,
2,,
3,bulgaria bulgaria,bulgaria
4,,
...,...,...
11059,zambia,zambia
11060,,
11061,,
11062,,


# idea (3)
`parsed_address_street_name`: the street names contain stray `&` characters and digits, which we remove.

In [62]:
# Inspect the raw street-name column before cleaning it.
external_parties_train['parsed_address_street_name']

0                                 angela brooks
1                                 north michael
2                           rachel crossingapt.
3                                 ports suite &
4                            john lake s. glenn
                          ...                  
11059                                     james
11060                             price meadows
11061    taylor villages apt. 764 kimberlymouth
11062                                       NaN
11063                       smith prairie suite
Name: parsed_address_street_name, Length: 11064, dtype: object

In [63]:
# Normalise street names for matching: remove '&' and digit runs, then tidy
# the whitespace those removals leave behind. Without the last two steps
# "ports suite &" keeps a trailing space and "apt. 764 kimberlymouth" ends up
# with a double space; the HTML table hides both, but exact comparisons fail.
external_parties_train['clean_parsed_address_street_name'] = (
    external_parties_train['parsed_address_street_name']
    .str.replace('&', '', regex=False)      # literal character, no regex needed
    .str.replace(r'\d+', '', regex=True)    # drop every run of digits
    .str.replace(r'\s+', ' ', regex=True)   # collapse repeated whitespace
    .str.strip()                            # trim leading/trailing whitespace
)
# Show raw and cleaned street names side by side.
external_parties_train[['parsed_address_street_name','clean_parsed_address_street_name']]

Unnamed: 0,parsed_address_street_name,clean_parsed_address_street_name
0,angela brooks,angela brooks
1,north michael,north michael
2,rachel crossingapt.,rachel crossingapt.
3,ports suite &,ports suite
4,john lake s. glenn,john lake s. glenn
...,...,...
11059,james,james
11060,price meadows,price meadows
11061,taylor villages apt. 764 kimberlymouth,taylor villages apt. kimberlymouth
11062,,


In [64]:
# Preview the first five rows, now including the three cleaned columns.
external_parties_train.head(n=5)

Unnamed: 0,transaction_reference_id,party_role,party_info_unstructured,parsed_name,parsed_address_street_name,parsed_address_street_number,parsed_address_unit,parsed_address_postal_code,parsed_address_city,parsed_address_state,parsed_address_country,party_iban,party_phone,external_id,cleaned_party_number,clean_parsed_address_country,clean_parsed_address_street_name
0,04ff0d1c680189e3a80c92d86407f0f5,BENE,mary mith 107 107 angela brooks n. thomasfurt ...,mary mith,angela brooks,107 107,,,n. thomasfurt,,bulgaria,GB49MYOB82127728573340,+1.815660-6791x8486,50039037,18156606791.0,bulgaria,angela brooks
1,439ab0ad7380e6135ab2ff3fddd4a727,ORG,yesneia kim north michael 93971 koribati,yesneia kim,north michael,,,93971,koribati,,,,0 (269)620-8734x2349,60044692,2696208734.0,,north michael
2,00cac12d41191a84f9e31aa731a83512,ORG,w. roberson jr. 41010 rachel crossingapt. 923 ...,w. roberson jr.,rachel crossingapt.,41010 923,,p2235417,thompsonshire amyport,,,GB08OTHR53515837682953,,30008244,,,rachel crossingapt.
3,e4fba5f878dd3453e35973605a783a16,BENE,azquez-nelson co. suarez ports suite & 024 bri...,azquez-nelson co.,ports suite &,,,,brittanyberg,,bulgaria bulgaria,GB17VVGW66321494633280,,40017944,,bulgaria,ports suite
4,d03d7e4c31878b0255d39e8c3f0ab625,ORG,m.j. bytd iii 856 john lake s. glenn cocos (ke...,m.j. bytd iii,john lake s. glenn,856,,125838276,cocos (keeling),islands,,,(260)3371534,40012658,2603371534.0,,john lake s. glenn
