In [1]:
!pip install pyspark



In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count, avg
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
import pandas as pd
import random
from datetime import datetime, timedelta

In [3]:
# Generate a synthetic transaction log and persist it as CSV.
# A fixed seed makes the generated dataset identical on every Restart & Run All.
RANDOM_SEED = 42
num_records = 100000
countries = ["India", "USA", "Germany", "UK", "Singapore"]
devices = ["Mobile", "Desktop", "Tablet"]
# Nine "success" entries to one "failure": random.choice picks a failure 10% of the time.
# Built once here instead of on every loop iteration.
statuses = ["success"] * 9 + ["failure"]
start_date = datetime(2025, 1, 1)


def generate_log_records(n, seed=RANDOM_SEED):
    """Return ``n`` synthetic log records as a list of dicts.

    Record ``i`` is stamped ``start_date + i`` minutes; the remaining fields are
    drawn from a private ``random.Random(seed)`` so the result depends only on
    ``n`` and ``seed`` and the global ``random`` state is left untouched.

    Args:
        n: Number of records to create (``0`` yields an empty list).
        seed: Seed for the private random generator.

    Returns:
        List of dicts with keys ``timestamp``, ``user_id``,
        ``transaction_amount``, ``country``, ``device_type`` and ``status``.
    """
    rng = random.Random(seed)
    return [
        {
            "timestamp": start_date + timedelta(minutes=i),
            "user_id": rng.randint(1000, 5000),
            "transaction_amount": round(rng.uniform(10, 5000), 2),
            "country": rng.choice(countries),
            "device_type": rng.choice(devices),
            "status": rng.choice(statuses),
        }
        for i in range(n)
    ]


# `records` (not `data`) so the name is not later reused for a Spark DataFrame.
records = generate_log_records(num_records)
df_pandas = pd.DataFrame(records)
df_pandas.to_csv("enterprise_logs.csv", index=False)
df_pandas.head()

Unnamed: 0,timestamp,user_id,transaction_amount,country,device_type,status
0,2025-01-01 00:00:00,4512,1447.53,USA,Tablet,success
1,2025-01-01 00:01:00,4465,3444.86,Singapore,Tablet,success
2,2025-01-01 00:02:00,1127,4280.54,UK,Mobile,success
3,2025-01-01 00:03:00,4802,3436.28,Singapore,Mobile,success
4,2025-01-01 00:04:00,3287,3199.63,Singapore,Tablet,success


In [4]:
# Obtain the Spark session for this notebook; getOrCreate() returns an
# already-active session when one exists instead of building a new one.
spark = (
    SparkSession.builder
    .appName("Enterprise Log Analytics POC")
    .getOrCreate()
)
spark

In [5]:
# Load the generated CSV: first row is the header, column types are inferred
# by Spark from the file contents.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("enterprise_logs.csv")
)
df.printSchema()
df.show(5)

root
 |-- timestamp: timestamp (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- transaction_amount: double (nullable = true)
 |-- country: string (nullable = true)
 |-- device_type: string (nullable = true)
 |-- status: string (nullable = true)

+-------------------+-------+------------------+---------+-----------+-------+
|          timestamp|user_id|transaction_amount|  country|device_type| status|
+-------------------+-------+------------------+---------+-----------+-------+
|2025-01-01 00:00:00|   4512|           1447.53|      USA|     Tablet|success|
|2025-01-01 00:01:00|   4465|           3444.86|Singapore|     Tablet|success|
|2025-01-01 00:02:00|   1127|           4280.54|       UK|     Mobile|success|
|2025-01-01 00:03:00|   4802|           3436.28|Singapore|     Mobile|success|
|2025-01-01 00:04:00|   3287|           3199.63|Singapore|     Tablet|success|
+-------------------+-------+------------------+---------+-----------+-------+
only showing top 5 rows


In [6]:
# Discard every row that contains a null in any column, then report how many rows remain.
df = df.na.drop()
df.count()

100000

In [7]:
# Row counts for each (country, status) pair — covers both success and failure rows.
failure_by_country = (
    df
    .groupBy(col("country"), col("status"))
    .count()
)
failure_by_country.show()

+---------+-------+-----+
|  country| status|count|
+---------+-------+-----+
|  Germany|failure| 1952|
|Singapore|failure| 2008|
|      USA|success|18046|
|    India|success|18316|
|       UK|failure| 1994|
|    India|failure| 2051|
|      USA|failure| 1933|
|       UK|success|17891|
|  Germany|success|17918|
|Singapore|success|17891|
+---------+-------+-----+



In [8]:
# Mean transaction amount for each device type.
avg_transaction = (
    df
    .groupBy(col("device_type"))
    .agg(avg(col("transaction_amount")).alias("avg_transaction_amount"))
)
avg_transaction.show()

+-----------+----------------------+
|device_type|avg_transaction_amount|
+-----------+----------------------+
|     Mobile|    2505.3312315403173|
|     Tablet|    2503.4319136599797|
|    Desktop|    2506.7431388202695|
+-----------+----------------------+



In [9]:
# Pack the transaction amount into a one-element vector column named "features".
assembler = (
    VectorAssembler()
    .setInputCols(["transaction_amount"])
    .setOutputCol("features")
)
data = assembler.transform(df)
data.select(col("transaction_amount"), col("features")).show(n=5)

+------------------+---------+
|transaction_amount| features|
+------------------+---------+
|           1447.53|[1447.53]|
|           3444.86|[3444.86]|
|           4280.54|[4280.54]|
|           3436.28|[3436.28]|
|           3199.63|[3199.63]|
+------------------+---------+
only showing top 5 rows


In [10]:
# Fit a 3-cluster k-means model (fixed seed) on the assembled data and
# attach each row's cluster id as the "prediction" column.
kmeans = KMeans().setK(3).setSeed(42)
model = kmeans.fit(data)
predictions = model.transform(data)
predictions.select(col("transaction_amount"), col("prediction")).show(n=10)

+------------------+----------+
|transaction_amount|prediction|
+------------------+----------+
|           1447.53|         0|
|           3444.86|         1|
|           4280.54|         1|
|           3436.28|         1|
|           3199.63|         2|
|           2351.77|         2|
|            703.61|         0|
|            600.41|         0|
|           4589.21|         1|
|           1797.52|         2|
+------------------+----------+
only showing top 10 rows


In [11]:
# Mean transaction amount inside each cluster.
cluster_stats = (
    predictions
    .groupBy(col("prediction"))
    .agg(avg(col("transaction_amount")).alias("avg_amount"))
)
cluster_stats.show()

+----------+------------------+
|prediction|        avg_amount|
+----------+------------------+
|         1| 4180.062453123093|
|         2|2527.0304925104656|
|         0| 854.0949378109414|
+----------+------------------+



In [12]:
# Number of rows assigned to each cluster.
(
    predictions
    .groupBy(col("prediction"))
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|32852|
|         2|33380|
|         0|33768|
+----------+-----+



In [13]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()