In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName("FinancialDataset")
    .getOrCreate()
)

# Six sample transactions; the last two belong to the same account.
data = [
    ("1234567890", "ACC123456", 150.75, "Purchase", "ABC Store", "New York, NY", "2024-08-25 14:32"),
    ("2345678901", "ACC234567", 500.00, "Withdrawal", "ATM", "Los Angeles, CA", "2024-08-25 09:15"),
    ("3456789012", "ACC345678", 2000.00, "Deposit", "Employer", "Chicago, IL", "2024-08-24 16:45"),
    ("4567890123", "ACC456789", 250.00, "Trade", "Stock Exchange", "Online", "2024-08-24 10:00"),
    ("5678901234", "ACC567890", 75.50, "Purchase", "XYZ Electronics", "Miami, FL", "2024-08-23 18:22"),
    ("5678901235", "ACC567890", 500.00, "Purchase", "Amazon", "Miami, FL", "2024-08-23 19:22"),
]

# Column names only: Spark infers each column's type from the tuples above,
# so the timestamp column stays a plain string.
schema = [
    "Transaction ID",
    "Account ID",
    "Transaction Amount",
    "Transaction Type",
    "Merchant/Counterparty",
    "Location",
    "Date and Time",
]

df = spark.createDataFrame(data, schema)

df.show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    5678901234| ACC567890|              75.5|        Purchase|      XYZ Electronics|      Miami, FL|2024-08-23 18:22|
|    5678901235| ACC567890|             500.0|  

In [None]:
# Project only the transaction identifier and amount columns and print them.
# A trailing .groupBy() with no aggregation would return a GroupedData handle
# instead of rows, so the projection is rendered with .show().
df.select("Transaction ID", "Transaction Amount").show()

+--------------+------------------+
|Transaction ID|Transaction Amount|
+--------------+------------------+
|    1234567890|            150.75|
|    2345678901|             500.0|
|    3456789012|            2000.0|
|    4567890123|             250.0|
|    5678901234|              75.5|
|    5678901235|             500.0|
+--------------+------------------+



In [None]:
# Add an integer "Amount in Cents" column derived from the dollar amount.
# The product is rounded before the cast because cast("int") truncates:
# a double such as 19.99 * 100 == 1998.9999999999998 would otherwise become
# 1998 and silently lose a cent.
from pyspark.sql import functions as F

df.withColumn(
    "Amount in Cents",
    F.round(df["Transaction Amount"] * 100).cast("int"),
).show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+---------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|Amount in Cents|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+---------------+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|          15075|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|          50000|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|         200000|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|          25000|
|    5678901234| ACC567890|              75.5|        P

In [None]:
# Keep only transactions whose amount is strictly greater than 100.
df.where(df["Transaction Amount"] > 100).show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    5678901235| ACC567890|             500.0|        Purchase|               Amazon|      Miami, FL|2024-08-23 19:22|
+--------------+----------+------------------+--

In [None]:
# Relabel "Purchase" as "Shopping", touching only the transaction-type column.
df.na.replace("Purchase", "Shopping", subset=["Transaction Type"]).show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    1234567890| ACC123456|            150.75|        Shopping|            ABC Store|   New York, NY|2024-08-25 14:32|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    5678901234| ACC567890|              75.5|        Shopping|      XYZ Electronics|      Miami, FL|2024-08-23 18:22|
|    5678901235| ACC567890|             500.0|  

In [None]:
# Discard every row that contains a null in any column, then display the rest.
df.na.drop().show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    5678901234| ACC567890|              75.5|        Purchase|      XYZ Electronics|      Miami, FL|2024-08-23 18:22|
|    5678901235| ACC567890|             500.0|  

In [None]:
# Sum the transaction amount over the whole DataFrame (a single global group).
df.groupBy().sum("Transaction Amount").show()

+-----------------------+
|sum(Transaction Amount)|
+-----------------------+
|                3476.25|
+-----------------------+



In [None]:
# Total transaction amount for each account.
df.groupBy("Account ID").sum("Transaction Amount").show()

+----------+-----------------------+
|Account ID|sum(Transaction Amount)|
+----------+-----------------------+
| ACC234567|                  500.0|
| ACC345678|                 2000.0|
| ACC123456|                 150.75|
| ACC567890|                  575.5|
| ACC456789|                  250.0|
+----------+-----------------------+



In [None]:
# Sum amounts for every combination of account and transaction type,
# including the subtotal rows (shown as NULL in the rolled-up column) and
# the grand total.
(
    df
    .cube("Account ID", "Transaction Type")
    .agg({"Transaction Amount": "sum"})
    .show()
)


+----------+----------------+-----------------------+
|Account ID|Transaction Type|sum(Transaction Amount)|
+----------+----------------+-----------------------+
|      NULL|         Deposit|                 2000.0|
| ACC234567|      Withdrawal|                  500.0|
| ACC345678|            NULL|                 2000.0|
|      NULL|            NULL|                3476.25|
| ACC123456|            NULL|                 150.75|
| ACC345678|         Deposit|                 2000.0|
| ACC234567|            NULL|                  500.0|
| ACC123456|        Purchase|                 150.75|
|      NULL|      Withdrawal|                  500.0|
|      NULL|        Purchase|                 726.25|
| ACC567890|            NULL|                  575.5|
| ACC456789|           Trade|                  250.0|
| ACC567890|        Purchase|                  575.5|
| ACC456789|            NULL|                  250.0|
|      NULL|           Trade|                  250.0|
+----------+----------------

In [None]:
# Lookup table mapping two of the account IDs to their holders.
accounts_df = spark.createDataFrame(
    [
        ("ACC123456", "John Doe"),
        ("ACC234567", "Jane Smith"),
    ],
    schema=["Account ID", "Account Holder"],
)

# Inner join: transactions whose account has no holder entry are dropped.
df.join(accounts_df, on="Account ID", how="inner").show()


+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+
|Account ID|Transaction ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|Account Holder|
+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+
| ACC123456|    1234567890|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|      John Doe|
| ACC234567|    2345678901|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|    Jane Smith|
+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+



In [None]:
from pyspark.sql.functions import broadcast
# Same inner join as the plain version, but with a broadcast hint on the
# small accounts table.
df.join(
    broadcast(accounts_df),
    on="Account ID",
    how="inner",
).show()


+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+
|Account ID|Transaction ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|Account Holder|
+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+
| ACC123456|    1234567890|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|      John Doe|
| ACC234567|    2345678901|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|    Jane Smith|
+----------+--------------+------------------+----------------+---------------------+---------------+----------------+--------------+



In [None]:
# Sort ascending by the timestamp column. The column holds strings, so the
# comparison is lexicographic; that matches chronological order for the
# zero-padded "yyyy-MM-dd HH:mm" values used here.
df.sort("Date and Time").show()

+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    5678901234| ACC567890|              75.5|        Purchase|      XYZ Electronics|      Miami, FL|2024-08-23 18:22|
|    5678901235| ACC567890|             500.0|        Purchase|               Amazon|      Miami, FL|2024-08-23 19:22|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    1234567890| ACC123456|            150.75|  

In [None]:
# One extra transaction, built with the same column list as df.
additional_data = [
    (
        "6789012345", "ACC678901", 1000.0, "Purchase",
        "New Store", "Dallas, TX", "2024-08-30 12:00",
    ),
]
df2 = spark.createDataFrame(additional_data, schema=schema)

# union() matches columns by position, which is safe here because both
# frames were created from the same schema list.
df.union(df2).show()


+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|
|    5678901234| ACC567890|              75.5|        Purchase|      XYZ Electronics|      Miami, FL|2024-08-23 18:22|
|    5678901235| ACC567890|             500.0|  

In [None]:
# Rows belonging to a single account (this rebinds df2).
df2 = df.where(df["Account ID"] == "ACC567890")

# Rows present in both frames; intersect() also de-duplicates its result.
df.intersect(df2).show()

+--------------+----------+------------------+----------------+---------------------+---------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty| Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------+----------------+
|    5678901235| ACC567890|             500.0|        Purchase|               Amazon|Miami, FL|2024-08-23 19:22|
|    5678901234| ACC567890|              75.5|        Purchase|      XYZ Electronics|Miami, FL|2024-08-23 18:22|
+--------------+----------+------------------+----------------+---------------------+---------+----------------+



In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Within each account, order transactions from largest to smallest amount.
window_spec = (
    Window
    .partitionBy("Account ID")
    .orderBy(df["Transaction Amount"].desc())
)

# Append the per-account rank as a new trailing "Rank" column.
df.select("*", rank().over(window_spec).alias("Rank")).show()


+--------------+----------+------------------+----------------+---------------------+---------------+----------------+----+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|Rank|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+----+
|    1234567890| ACC123456|            150.75|        Purchase|            ABC Store|   New York, NY|2024-08-25 14:32|   1|
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|   1|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|   1|
|    4567890123| ACC456789|             250.0|           Trade|       Stock Exchange|         Online|2024-08-24 10:00|   1|
|    5678901235| ACC567890|             500.0|        Purchase|               Amazon|      Miami, FL|2024-08-23 19:22|   1|
|    567

In [None]:
# Mark df for caching so that subsequent actions can reuse the stored data.
# cache() returns the DataFrame itself, so the cell displays its schema repr.
df.cache()

DataFrame[Transaction ID: string, Account ID: string, Transaction Amount: double, Transaction Type: string, Merchant/Counterparty: string, Location: string, Date and Time: string]

In [None]:
# Redistribute df into four partitions and report how many partitions the
# resulting RDD has. df itself is not modified.
(
    df
    .repartition(4)
    .rdd
    .getNumPartitions()
)

4

In [None]:
# Draw a sample without replacement. `fraction` is a per-row inclusion
# probability, not an exact size, so the number of rows returned can vary.
# A fixed seed makes the draw repeatable across Restart & Run All (for the
# same data and partitioning); without it every run shows different rows.
df.sample(withReplacement=False, fraction=0.5, seed=42).show()


+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|       Location|   Date and Time|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|    2345678901| ACC234567|             500.0|      Withdrawal|                  ATM|Los Angeles, CA|2024-08-25 09:15|
|    3456789012| ACC345678|            2000.0|         Deposit|             Employer|    Chicago, IL|2024-08-24 16:45|
|    5678901235| ACC567890|             500.0|        Purchase|               Amazon|      Miami, FL|2024-08-23 19:22|
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+



In [None]:
# Bring every row of df back to the driver as a Python list of Row objects.
# Fine for this six-row sample; avoid on large DataFrames.
rows = df.collect()

# Plain-text repr of the collected list.
print(rows)

[Row(Transaction ID='1234567890', Account ID='ACC123456', Transaction Amount=150.75, Transaction Type='Purchase', Merchant/Counterparty='ABC Store', Location='New York, NY', Date and Time='2024-08-25 14:32'), Row(Transaction ID='2345678901', Account ID='ACC234567', Transaction Amount=500.0, Transaction Type='Withdrawal', Merchant/Counterparty='ATM', Location='Los Angeles, CA', Date and Time='2024-08-25 09:15'), Row(Transaction ID='3456789012', Account ID='ACC345678', Transaction Amount=2000.0, Transaction Type='Deposit', Merchant/Counterparty='Employer', Location='Chicago, IL', Date and Time='2024-08-24 16:45'), Row(Transaction ID='4567890123', Account ID='ACC456789', Transaction Amount=250.0, Transaction Type='Trade', Merchant/Counterparty='Stock Exchange', Location='Online', Date and Time='2024-08-24 10:00'), Row(Transaction ID='5678901234', Account ID='ACC567890', Transaction Amount=75.5, Transaction Type='Purchase', Merchant/Counterparty='XYZ Electronics', Location='Miami, FL', Dat

In [None]:
# Fetch the first three rows as a list of Row objects.
three_rows = df.take(3)

print(three_rows)

[Row(Transaction ID='1234567890', Account ID='ACC123456', Transaction Amount=150.75, Transaction Type='Purchase', Merchant/Counterparty='ABC Store', Location='New York, NY', Date and Time='2024-08-25 14:32'), Row(Transaction ID='2345678901', Account ID='ACC234567', Transaction Amount=500.0, Transaction Type='Withdrawal', Merchant/Counterparty='ATM', Location='Los Angeles, CA', Date and Time='2024-08-25 09:15'), Row(Transaction ID='3456789012', Account ID='ACC345678', Transaction Amount=2000.0, Transaction Type='Deposit', Merchant/Counterparty='Employer', Location='Chicago, IL', Date and Time='2024-08-24 16:45')]


In [None]:
# Export df as CSV with a header row. Spark writes a directory of part files
# at this path rather than a single file.
# The default save mode raises if the path already exists, which makes the
# cell fail on every run after the first; "overwrite" keeps it re-runnable.
# NOTE: overwrite replaces whatever is already stored under output/transactions.
df.write.mode("overwrite").csv("output/transactions", header=True)


In [None]:
# Persist df as a managed table in the session catalog.
# The default save mode raises when the table already exists, so a second
# run of the notebook would fail here; "overwrite" keeps the cell idempotent.
# NOTE: overwrite replaces any existing contents of transactions_table.
df.write.mode("overwrite").saveAsTable("transactions_table")


In [None]:
# Count the rows in df (an action: this triggers a Spark job).
row_count = df.count()

# Show the count as plain text.
print(row_count)


6


In [None]:
# Approximate quartiles (25th, 50th, 75th percentile) of the amount column,
# computed with a relative error tolerance of 0.05.
quantiles = df.stat.approxQuantile(
    "Transaction Amount",
    [0.25, 0.5, 0.75],
    0.05,
)

print(quantiles)


[150.75, 250.0, 500.0]


In [None]:
# Print the first five rows with full, untruncated cell contents.
df.show(n=5, truncate=False)


+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|Transaction ID|Account ID|Transaction Amount|Transaction Type|Merchant/Counterparty|Location       |Date and Time   |
+--------------+----------+------------------+----------------+---------------------+---------------+----------------+
|1234567890    |ACC123456 |150.75            |Purchase        |ABC Store            |New York, NY   |2024-08-25 14:32|
|2345678901    |ACC234567 |500.0             |Withdrawal      |ATM                  |Los Angeles, CA|2024-08-25 09:15|
|3456789012    |ACC345678 |2000.0            |Deposit         |Employer             |Chicago, IL    |2024-08-24 16:45|
|4567890123    |ACC456789 |250.0             |Trade           |Stock Exchange       |Online         |2024-08-24 10:00|
|5678901234    |ACC567890 |75.5              |Purchase        |XYZ Electronics      |Miami, FL      |2024-08-23 18:22|
+--------------+----------+------------------+--

In [None]:
# Summary statistics (count, mean, stddev, min, max) for the amount column.
df.select("Transaction Amount").describe().show()


+-------+------------------+
|summary|Transaction Amount|
+-------+------------------+
|  count|                 6|
|   mean|           579.375|
| stddev| 717.8960187589844|
|    min|              75.5|
|    max|            2000.0|
+-------+------------------+

