In [1]:
# Install Spark and dependencies (only once per session)
!wget -q https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

# Initialize findspark
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("OnlineBankingAnalysis").getOrCreate()

In [4]:
from google.colab import files
uploaded = files.upload()


Saving credit card.csv to credit card (1).csv
Saving loan.csv to loan (1).csv
Saving txn.csv to txn (1).csv


In [6]:
loan_df = spark.read.option("header", True).option("inferSchema", True).csv("/content/loan.csv")
credit_df = spark.read.option("header", True).option("inferSchema", True).csv("/content/credit card.csv")
txn_df = spark.read.option("header", True).option("inferSchema", True).csv("/content/txn.csv")

In [7]:
from pyspark.sql.functions import col, count, max, min, sum
loan_df.show(n=20, truncate=True, vertical=False)

+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|         Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|     Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+-------------------+--------------+-----------+------+-----------+-------------+------------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|       BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|           HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|          PROFESSOR|       MARRIED|          6| 51000|      19999|            4|          SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|           

In [8]:
loan_df.printSchema()

root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)



In [9]:
loan_df.dtypes

[('Customer_ID', 'string'),
 ('Age', 'int'),
 ('Gender', 'string'),
 ('Occupation', 'string'),
 ('Marital Status', 'string'),
 ('Family Size', 'int'),
 ('Income', 'int'),
 ('Expenditure', 'int'),
 ('Use Frequency', 'int'),
 ('Loan Category', 'string'),
 ('Loan Amount', 'string'),
 ('Overdue', 'int'),
 (' Debt Record', 'string'),
 (' Returned Cheque', 'int'),
 (' Dishonour of Bill', 'int')]

In [10]:
loan_df = loan_df.withColumn("Loan Amount", col("Loan Amount").cast("int"))

In [11]:
# Number of loans in each category
loan_df.groupBy("Loan Category").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [12]:
# People who took more than ₹1L loan
loan_df.filter(col("Loan Amount") > 100000).count()

0

In [13]:
# People with income > ₹60,000
loan_df.filter(col("Income") > 60000).count()

198

In [14]:
# ≥2 returned cheques and income < ₹50,000
loan_df.filter((col(" Returned Cheque") >= 2) & (col("Income") < 50000)).count()

137

In [15]:
# ≥2 returned cheques and Single
loan_df.filter((col(" Returned Cheque") >= 2) & (col("Marital Status") == "Single")).count()

0

In [16]:
# Expenditure over ₹50,000
loan_df.filter(col("Expenditure") > 50000).count()

6

In [17]:
credit_df.show(n=20, truncate=True, vertical=False)

+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        1|  15634602| Hargrave|        619|   France|Female| 42|     2|      0.0|            1|             1|      101348.88|     1|
|        2|  15647311|     Hill|        608|    Spain|Female| 41|     1| 83807.86|            1|             1|      112542.58|     0|
|        3|  15619304|     Onio|        502|   France|Female| 42|     8| 159660.8|            3|             0|      113931.57|     1|
|        4|  15701354|     Boni|        699|   France|Female| 39|     1|      0.0|            2|             0|       93826.63|     0|
|        5|  15737888| Mitchell|        850|    Spain|F

In [18]:
credit_df.printSchema()

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: double (nullable = true)
 |-- NumOfProducts: integer (nullable = true)
 |-- IsActiveMember: integer (nullable = true)
 |-- EstimatedSalary: double (nullable = true)
 |-- Exited: integer (nullable = true)



In [19]:
credit_df.dtypes

[('RowNumber', 'int'),
 ('CustomerId', 'int'),
 ('Surname', 'string'),
 ('CreditScore', 'int'),
 ('Geography', 'string'),
 ('Gender', 'string'),
 ('Age', 'int'),
 ('Tenure', 'int'),
 ('Balance', 'double'),
 ('NumOfProducts', 'int'),
 ('IsActiveMember', 'int'),
 ('EstimatedSalary', 'double'),
 ('Exited', 'int')]

In [20]:
# Users from Spain
credit_df.filter(col("Geography") == "Spain").count()


2477

In [21]:
# Eligible & Active (e.g., CreditScore > 650 and IsActiveMember = 1)
credit_df.filter((col("CreditScore") > 650) & (col("IsActiveMember") == 1)).count()

2655

In [22]:
txn_df.show(n=20, truncate=True, vertical=False)

+-------------+--------------------+----------+----------------+-------------+-----------+
|   Account No| TRANSACTION DETAILS|VALUE DATE| WITHDRAWAL AMT | DEPOSIT AMT |BALANCE AMT|
+-------------+--------------------+----------+----------------+-------------+-----------+
|409000611074'|TRF FROM  Indiafo...| 29-Jun-17|            NULL|    1000000.0|  1000000.0|
|409000611074'|TRF FROM  Indiafo...|  5-Jul-17|            NULL|    1000000.0|  2000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 18-Jul-17|            NULL|     500000.0|  2500000.0|
|409000611074'|TRF FRM  Indiafor...|  1-Aug-17|            NULL|    3000000.0|  5500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  6000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  6500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  7000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            NULL|     500000.0|  7500000.0|

In [23]:
txn_df.printSchema()

root
 |-- Account No: string (nullable = true)
 |-- TRANSACTION DETAILS: string (nullable = true)
 |-- VALUE DATE: string (nullable = true)
 |--  WITHDRAWAL AMT : double (nullable = true)
 |--  DEPOSIT AMT : double (nullable = true)
 |-- BALANCE AMT: double (nullable = true)



In [24]:
txn_df.dtypes

[('Account No', 'string'),
 ('TRANSACTION DETAILS', 'string'),
 ('VALUE DATE', 'string'),
 (' WITHDRAWAL AMT ', 'double'),
 (' DEPOSIT AMT ', 'double'),
 ('BALANCE AMT', 'double')]

In [25]:
# Max withdrawal amount
txn_df.select(max(" WITHDRAWAL AMT ")).show()

+---------------------+
|max( WITHDRAWAL AMT )|
+---------------------+
|        4.594475464E8|
+---------------------+



In [26]:
# Min withdrawal per account
txn_df.groupBy("Account No").agg(min(" WITHDRAWAL AMT ").alias("Min Withdrawal")).show()


+-------------+--------------+
|   Account No|Min Withdrawal|
+-------------+--------------+
|409000438611'|           0.2|
|     1196711'|          0.25|
|     1196428'|          0.25|
|409000493210'|          0.01|
|409000611074'|         120.0|
|409000425051'|          1.25|
|409000405747'|          21.0|
|409000362497'|          0.97|
|409000493201'|           2.1|
|409000438620'|          0.34|
+-------------+--------------+



In [27]:
# Max deposit per account
txn_df.groupBy("Account No").agg(max(" DEPOSIT AMT ").alias("Max Deposit")).show()


+-------------+-------------+
|   Account No|  Max Deposit|
+-------------+-------------+
|409000438611'|     1.7025E8|
|     1196711'|        5.0E8|
|     1196428'|2.119594422E8|
|409000493210'|        1.5E7|
|409000611074'|    3000000.0|
|409000425051'|        1.5E7|
|409000405747'|      2.021E8|
|409000362497'|        2.0E8|
|409000493201'|    1000000.0|
|409000438620'|      5.448E8|
+-------------+-------------+



In [28]:
# Total balance per account
txn_df.groupBy("Account No").agg(sum("BALANCE AMT").alias("Total Balance")).show()


+-------------+--------------------+
|   Account No|       Total Balance|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000362497'| -5.2860004792808E13|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
+-------------+--------------------+



In [29]:
# Number of transactions per date
txn_df.groupBy("VALUE DATE").agg(count("*").alias("Transaction Count")).show()


+----------+-----------------+
|VALUE DATE|Transaction Count|
+----------+-----------------+
| 23-Dec-16|              143|
|  7-Feb-19|               98|
| 21-Jul-15|               80|
|  9-Sep-15|               91|
| 17-Jan-15|               16|
| 18-Nov-17|               53|
| 21-Feb-18|               77|
| 20-Mar-18|               71|
| 19-Apr-18|               71|
| 21-Jun-16|               97|
| 17-Oct-17|              101|
|  3-Jan-18|               70|
|  8-Jun-18|              223|
| 15-Dec-18|               62|
|  8-Aug-16|               97|
| 17-Dec-16|               74|
|  3-Sep-15|               83|
| 21-Jan-16|               76|
|  4-May-18|               92|
|  7-Sep-17|               94|
+----------+-----------------+
only showing top 20 rows



In [30]:
# Customers who withdrew > ₹1L
txn_df.filter(col(" WITHDRAWAL AMT ") > 100000).select("Account No").distinct().show()

+-------------+
|   Account No|
+-------------+
|409000438611'|
|     1196711'|
|     1196428'|
|409000493210'|
|409000611074'|
|409000425051'|
|409000405747'|
|409000362497'|
|409000493201'|
|409000438620'|
+-------------+

