# Lauzhack 2024 UBS Challenge

In [5]:
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# Load both training tables (account bookings, then external parties) from the data/ folder.
account_booking_train, external_parties_train = map(
    pd.read_csv,
    ('data/account_booking_train.csv', 'data/external_parties_train.csv'),
)


In [7]:
# Clean the data: keep only bookings whose transaction reference appears
# exactly once (single-leg transactions).
reference_ids = account_booking_train['transaction_reference_id']

# Number of booking rows sharing each transaction reference
transaction_counts = reference_ids.value_counts()

# References that occur on exactly one row
single_leg_transactions = transaction_counts.loc[transaction_counts.eq(1)].index

# Boolean row mask selecting the single-leg bookings
is_single_leg = reference_ids.isin(single_leg_transactions)
account_booking_train_cleaned = account_booking_train.loc[is_single_leg]

# Preview the filtered bookings
account_booking_train_cleaned.head()

Unnamed: 0,transaction_reference_id,debit_credit_indicator,account_id,transaction_amount,transaction_currency,transaction_date
0,0ace8fca6ada96883ef2e823b5dea26b,CREDIT,25110,5249.26,GBP,2023-05-15
1,d52c4f1a546f5d784ee46a8f347ad607,DEBIT,27293,4481.5,GBP,2023-02-27
2,dac45362e7471a7fa2726c81adae8534,DEBIT,23088,1347.27,GBP,2023-03-18
3,2ee574398cd6c4a7f3e111447141550e,DEBIT,21641,9276.56,GBP,2023-07-15
4,c5154ea99a0ff84ba8e72217d34d3397,CREDIT,24233,8002.28,GBP,2024-01-12


In [8]:
# Step 1: Data Overview
# Print structure, summary statistics and missing-value counts for both tables.
# NOTE: DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
print("Transactions Dataset Overview:")
account_booking_train_cleaned.info()
print("\nTransactions Dataset Summary:")
print(account_booking_train_cleaned.describe())

print("\nExternal Parties Dataset Overview:")
external_parties_train.info()
print("\nExternal Parties Dataset Summary:")
print(external_parties_train.describe())

# Check for missing values (per-column count of nulls)
print("\nMissing Values in Transactions Table:")
print(account_booking_train_cleaned.isnull().sum())

print("\nMissing Values in External Parties Table:")
print(external_parties_train.isnull().sum())

# Step 2: Key Relationships
# Join external parties to the cleaned bookings on transaction_reference_id.
# The inner join keeps only references present in both tables.
merged_data = pd.merge(
    external_parties_train,
    account_booking_train_cleaned,
    on='transaction_reference_id',
    how='inner'
)

print("\nMerged Dataset Overview:")
# info() prints its own report and returns None, so do not wrap it in print()
# (that would append a spurious "None" line to the output).
merged_data.info()

# Bar chart of how often each party role occurs among the external parties
party_role_dist = external_parties_train['party_role'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
party_role_dist.plot(kind='bar', ax=ax, title="Distribution of Party Roles (ORG vs BENE)")
ax.set_xlabel("Party Role")
ax.set_ylabel("Count")
plt.show()

# Step 3: Parsed Address Analysis
# Count external parties per parsed country and chart the ten most frequent.
country_counts = external_parties_train['parsed_address_country'].value_counts()
top_countries = country_counts.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
top_countries.plot(kind='bar', ax=ax, title="Top 10 Countries by External Parties")
ax.set_xlabel("Country")
ax.set_ylabel("Count")
plt.show()

# Step 4: External ID Analysis
# value_counts() yields one entry per distinct (non-null) external_id, with the
# number of rows carrying that id as the value.
external_id_counts = external_parties_train['external_id'].value_counts()
# The number of unique IDs is the number of entries in that result.
# Calling .nunique() on it would instead count distinct *frequencies*
# (how many different count values exist), which under-reports the IDs.
print(f"\nNumber of Unique External IDs: {len(external_id_counts)}")
print(f"Top 10 Most Frequent External IDs:\n{external_id_counts.head(10)}")

# Step 5: Relationships with Transactions
# Mean transaction amount per external_id, charted for the ten highest averages.
avg_amount_by_external_id = merged_data.groupby('external_id')['transaction_amount'].mean()
top_avg_amounts = avg_amount_by_external_id.sort_values(ascending=False).head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_avg_amounts.plot(kind='bar', ax=ax, title="Top 10 External IDs by Average Transaction Amount")
ax.set_xlabel("External ID")
ax.set_ylabel("Average Transaction Amount")
plt.show()

# Step 6: Address Completeness
# Share of non-missing values in each parsed address field.
address_fields = [
    'parsed_address_street_name',
    'parsed_address_city',
    'parsed_address_country',
]
address_completeness = external_parties_train[address_fields].notnull().mean()

fig, ax = plt.subplots(figsize=(8, 5))
address_completeness.plot(kind='bar', ax=ax, title="Address Completeness by Field")
ax.set_ylabel("Proportion of Non-Missing Values")
plt.show()

Transactions Dataset Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11464 entries, 0 to 11463
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   transaction_reference_id  11464 non-null  object 
 1   debit_credit_indicator    11464 non-null  object 
 2   account_id                11464 non-null  int64  
 3   transaction_amount        11464 non-null  float64
 4   transaction_currency      11464 non-null  object 
 5   transaction_date          11464 non-null  object 
dtypes: float64(1), int64(1), object(4)
memory usage: 537.5+ KB
None

Transactions Dataset Summary:
         account_id  transaction_amount
count  11464.000000        11464.000000
mean   25000.635642         5020.558171
std     2892.025223         2857.300235
min    20000.000000           10.880000
25%    22489.250000         2548.405000
50%    25001.000000         5001.005000
75%    27473.250000         7510.245000
m

NameError: name 'plt' is not defined