In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, max as spark_max

spark = SparkSession.builder \
    .appName("Financial Dataset Analysis") \
    .getOrCreate()

# Generate sample data
import numpy as np
import pandas as pd

accounts = ['ACC123456', 'ACC234567', 'ACC345678', 'ACC456789', 'ACC567890']
merchants = ['ABC Store', 'ATM', 'Employer', 'Stock Exchange', 'XYZ Electronics', 'Online Store', 'Cafe', 'Supermarket']

np.random.seed(0)
data = {
    'Transaction ID': np.arange(1000000000, 1000000000 + 50),
    'Account ID': np.random.choice(accounts, 50),
    'Transaction Amount': np.round(np.random.uniform(10.00, 2000.00, 50), 2),
    'Transaction Type': np.random.choice(['Purchase', 'Withdrawal', 'Deposit', 'Trade'], 50),
    'Merchant/Counterparty': np.random.choice(merchants, 50),
    'Location': np.random.choice(['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Miami, FL', 'Online'], 50),
    'Date and Time': pd.date_range(start='2024-08-01', periods=50, freq='H').strftime('%Y-%m-%d %H:%M').tolist()
}

# Create Spark DataFrame
pdf = pd.DataFrame(data)
df = spark.createDataFrame(pdf)

# Group by 'Account ID' and 'Merchant/Counterparty' and sum 'Transaction Amount'
grouped_df = df.groupBy('Account ID', 'Merchant/Counterparty').agg(spark_sum('Transaction Amount').alias('Total Spending'))

# Find the maximum 'Total Spending' for each 'Account ID'
max_spending_df = grouped_df.groupBy('Account ID').agg(spark_max('Total Spending').alias('Max Spending'))

highest_spending_df = grouped_df.join(max_spending_df,
                                      on=['Account ID'],
                                      how='inner') \
                                .filter(col('Total Spending') == col('Max Spending'))

highest_spending_df = highest_spending_df.drop('Max Spending')
highest_spending_df.show(truncate=False)


  'Date and Time': pd.date_range(start='2024-08-01', periods=50, freq='H').strftime('%Y-%m-%d %H:%M').tolist()


+----------+---------------------+------------------+
|Account ID|Merchant/Counterparty|Total Spending    |
+----------+---------------------+------------------+
|ACC456789 |ABC Store            |2172.0699999999997|
|ACC567890 |ABC Store            |4290.040000000001 |
|ACC234567 |ATM                  |4444.66           |
|ACC345678 |ATM                  |1119.3600000000001|
|ACC123456 |ABC Store            |3434.28           |
+----------+---------------------+------------------+

