In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=0c58e7a894ae937f8a8cb41bcd5ebed59deb74162ef429e287634319f0b495bb
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col

# Spark entry point: getOrCreate() hands back the already-running session when
# there is one, so re-executing this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("Customer Transactions Analysis")
    .getOrCreate()
)

# Column names, in the same order as the fields of the tuples below.
customer_columns = ["CustomerID", "Name", "City"]
transaction_columns = ["TransactionID", "CustomerID", "Amount"]

# In-memory sample rows: (CustomerID, Name, City)
customers = [
    (1, "Ravi", "Mumbai"),
    (2, "Priya", "Delhi"),
    (3, "Vijay", "Bangalore"),
    (4, "Anita", "Chennai"),
    (5, "Baj", "Hyderabad"),
]

# In-memory sample rows: (TransactionID, CustomerID, Amount)
transactions = [
    (1, 1, 10000.50),
    (2, 2, 20000.75),
    (3, 1, 15000.25),
    (4, 3, 30000.00),
    (5, 2, 40000.50),
    (6, 4, 25000.00),
    (7, 5, 18000.75),
    (8, 1, 5000.00),
]

# Build one DataFrame per entity; only column names are supplied as the schema.
customer_df = spark.createDataFrame(customers, schema=customer_columns)
transaction_df = spark.createDataFrame(transactions, schema=transaction_columns)

# Print both frames so the inputs to the later analysis are visible.
print("Customers DataFrame:")
customer_df.show()

print("Transactions DataFrame:")
transaction_df.show()

Customers DataFrame:
+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|         1| Ravi|   Mumbai|
|         2|Priya|    Delhi|
|         3|Vijay|Bangalore|
|         4|Anita|  Chennai|
|         5|  Baj|Hyderabad|
+----------+-----+---------+

Transactions DataFrame:
+-------------+----------+--------+
|TransactionID|CustomerID|  Amount|
+-------------+----------+--------+
|            1|         1| 10000.5|
|            2|         2|20000.75|
|            3|         1|15000.25|
|            4|         3| 30000.0|
|            5|         2| 40000.5|
|            6|         4| 25000.0|
|            7|         5|18000.75|
|            8|         1|  5000.0|
+-------------+----------+--------+



In [None]:
# Join every transaction to its customer on the shared CustomerID key.
# The join type is spelled out: with an inner join, a customer that has no
# transactions does not appear in the result.
customer_transactions_df = customer_df.join(transaction_df, on="CustomerID", how="inner")
print("Customer Transactions DataFrame:")
customer_transactions_df.show()

# Total amount spent by each customer.
# The grouping key includes CustomerID so that two different customers who
# happen to share a name are not merged into a single total; the trailing
# select() keeps the result at its (Name, TotalSpent) columns.
# F.sum(...).alias(...) names the column directly instead of renaming Spark's
# generated "sum(Amount)" column, because withColumnRenamed does nothing
# (and raises no error) when the column it is asked to rename is absent.
total_spent_df = (
    customer_transactions_df
    .groupBy("CustomerID", "Name")
    .agg(F.sum("Amount").alias("TotalSpent"))
    .select("Name", "TotalSpent")
)

print("Total Amount Spent by Each Customer:")
total_spent_df.show()

# Customers whose total is strictly greater than 30,000.
big_spenders_df = total_spent_df.filter(col("TotalSpent") > 30000)

print("Customers Who Spent More Than 30,000:")
big_spenders_df.show()

# Number of transactions per customer, grouped by CustomerID for the same
# reason as the totals above.
transactions_count_df = (
    customer_transactions_df
    .groupBy("CustomerID", "Name")
    .agg(F.count("*").alias("TransactionCount"))
    .select("Name", "TransactionCount")
)
print("Number of Transactions Per Customer:")
transactions_count_df.show()

# Customers ordered from the highest total spent to the lowest.
sorted_spenders_df = total_spent_df.orderBy(col("TotalSpent").desc())
print("Customers Sorted by Total Spent (Descending):")
sorted_spenders_df.show()

Customer Transactions DataFrame:
+----------+-----+---------+-------------+--------+
|CustomerID| Name|     City|TransactionID|  Amount|
+----------+-----+---------+-------------+--------+
|         1| Ravi|   Mumbai|            1| 10000.5|
|         1| Ravi|   Mumbai|            3|15000.25|
|         1| Ravi|   Mumbai|            8|  5000.0|
|         2|Priya|    Delhi|            2|20000.75|
|         2|Priya|    Delhi|            5| 40000.5|
|         3|Vijay|Bangalore|            4| 30000.0|
|         4|Anita|  Chennai|            6| 25000.0|
|         5|  Baj|Hyderabad|            7|18000.75|
+----------+-----+---------+-------------+--------+

Total Amount Spent by Each Customer:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  30000.75|
|  Baj|  18000.75|
|Priya|  60001.25|
|Vijay|   30000.0|
|Anita|   25000.0|
+-----+----------+

Customers Who Spent More Than 30,000:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  30000.75|
|Priya|  60001.25|
