This notebook writes a Parquet file to your Google Drive, so it needs access to your Drive. When prompted, grant all of the requested permissions.

In [1]:
pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 20.9 MB/s eta 0:00:00
Installing collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [5]:
# Mount the user's Google Drive into the Colab filesystem so later cells can
# write under /content/drive. Colab prompts for authorization when this runs.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [9]:
import pandas as pd
import numpy as np
import os

# Seed NumPy's global RNG so the synthetic dataset is identical on every run.
np.random.seed(42)

# Value pools the random columns are sampled from.
accounts = ['ACC123456', 'ACC234567', 'ACC345678', 'ACC456789', 'ACC567890']
merchants = ['ABC Store', 'ATM', 'Employer', 'Stock Exchange', 'XYZ Electronics', 'Online Store', 'Cafe', 'Supermarket']

n_rows = 10000

# One timestamp per row, hourly from 2024-08-01, stored as 'YYYY-MM-DD HH:MM'
# strings. The lowercase 'h' alias is used because the uppercase 'H' alias is
# deprecated in recent pandas releases and emits a FutureWarning.
timestamps = (
    pd.date_range(start='2024-08-01', periods=n_rows, freq='h')
    .strftime('%Y-%m-%d %H:%M')
    .tolist()
)

# NOTE: the np.random.* calls below consume the seeded global RNG in order;
# reordering them changes which values each column receives.
transaction_columns = {
    # Sequential 10-digit IDs starting at 1000000000.
    'Transaction ID': np.arange(1000000000, 1000000000 + n_rows),
    'Account ID': np.random.choice(accounts, n_rows),
    # Uniform amounts in [10.00, 2000.00), rounded to two decimals.
    'Transaction Amount': np.round(np.random.uniform(10.00, 2000.00, n_rows), 2),
    'Transaction Type': np.random.choice(['Purchase', 'Withdrawal', 'Deposit', 'Trade'], n_rows),
    'Merchant/Counterparty': np.random.choice(merchants, n_rows),
    'Location': np.random.choice(['New York, NY', 'Los Angeles, CA', 'Chicago, IL', 'Miami, FL', 'Online'], n_rows),
    'Date and Time': timestamps,
}

# `data` is the DataFrame consumed by the Parquet-writing step.
data = pd.DataFrame(transaction_columns)

# Destination folder on the Google Drive mounted at /content/drive.
parquet_folder = "/content/drive/MyDrive/Lecture2"

# exist_ok=True creates the folder (and any missing parents) when absent and is
# a no-op when it already exists, replacing the separate exists()-then-create
# check.
os.makedirs(parquet_folder, exist_ok=True)

parquet_file = f"{parquet_folder}/financial.parquet"

# Write the DataFrame with the fastparquet engine; index=False leaves the
# DataFrame index out of the file.
data.to_parquet(parquet_file, engine='fastparquet', index=False)

  'Date and Time': pd.date_range(start='2024-08-01', periods=n_rows, freq='H').strftime('%Y-%m-%d %H:%M').tolist()


In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as spark_sum, max as spark_max

# Start (or reuse) the Spark session for this analysis.
spark = SparkSession.builder.appName("Financial Dataset Analysis").getOrCreate()

# Load the Parquet file at the path held in `parquet_file`.
df = spark.read.parquet(parquet_file)

# Total 'Transaction Amount' per ('Account ID', 'Merchant/Counterparty') pair.
grouped_df = (
    df.groupBy('Account ID', 'Merchant/Counterparty')
    .agg(spark_sum('Transaction Amount').alias('Total Spending'))
)

# Largest per-merchant total within each account.
max_spending_df = (
    grouped_df.groupBy('Account ID')
    .agg(spark_max('Total Spending').alias('Max Spending'))
)

# Keep only the row(s) whose total equals that account's maximum, then drop the
# helper column so just account, merchant and total remain.
is_account_max = col('Total Spending') == col('Max Spending')
highest_spending_df = (
    grouped_df.join(max_spending_df, on=['Account ID'], how='inner')
    .filter(is_account_max)
    .drop('Max Spending')
)

highest_spending_df.show(truncate=False)


+----------+---------------------+------------------+
|Account ID|Merchant/Counterparty|Total Spending    |
+----------+---------------------+------------------+
|ACC234567 |XYZ Electronics      |278558.4099999999 |
|ACC567890 |Employer             |291355.0          |
|ACC345678 |Cafe                 |271944.07999999996|
|ACC123456 |Stock Exchange       |279712.87000000005|
|ACC456789 |XYZ Electronics      |266071.94999999995|
+----------+---------------------+------------------+

