In [17]:
import findspark
findspark.init()

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.streaming import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import os

spark = SparkSession.builder.master("local").appName("Optimise2").getOrCreate()
sc = spark.sparkContext

In [19]:
# sql adaptive query execution adaptive.coalescePartitions.enabled will makr paritions dynamic
sc.setLogLevel("Error")
spark.conf.set("spark.sql.shuffle.partitions",3)
spark.conf.get("spark.sql.shuffle.partitions")
spark.conf.set("spark.sql.adaptive.enabled","false")
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled","false")
spark.conf.set("spark.sql.execution.arrow.enabled",True)
spark.conf.set("spark.sql.execution.arrow.fallback.enabled",True)

"""
Calculate sum(credit) - sum(debit)

Method1: create a coulmn with -1*debit amount and do sum
Method2: using pivot and transpose columns - More appropriate
"""

#WithoutInferSchema
#headerTrue will read first row and assign column names but type is String for all
#spark job created to read first column
filepath = "file:///C:/Users/venka/PycharmProjects/pythonProject/dataset/"
df = spark.read.option("header",True) \
            .option("inferSchema",True) \
            .option("delimiter",",") \
            .csv(filepath + "IntPersonal_transactions.csv")

In [20]:
df.show()

+-----------+-------------+---------+-------------------+----------------+-------+
|Customer_No|    Card_type|     Date|           Category|Transaction Type| Amount|
+-----------+-------------+---------+-------------------+----------------+-------+
|    1000501|Platinum Card| 1/1/2018|           Shopping|           debit|  11.11|
|    1000501|     Checking| 1/2/2018|    Mortgage & Rent|           debit|1247.44|
|    1000501|  Silver Card| 1/2/2018|        Restaurants|           debit|  24.22|
|    1000501|Platinum Card| 1/3/2018|Credit Card Payment|          credit|2298.09|
|    1000501|Platinum Card| 1/4/2018|      Movies & DVDs|           debit|  11.76|
|    1000501|  Silver Card| 1/5/2018|        Restaurants|           debit|  25.85|
|    1000501|  Silver Card| 1/6/2018|   Home Improvement|           debit|  18.45|
|    1000501|     Checking| 1/8/2018|          Utilities|           debit|   45.0|
|    1000501|  Silver Card| 1/8/2018|   Home Improvement|           debit|  15.38|
|   

In [23]:
df1 = df.withColumn("amt_chk",when(col("Transaction Type") =='debit', -1 * col("Amount")) \
                                .otherwise(col("Amount")))

In [24]:
df1.show()

+-----------+-------------+---------+-------------------+----------------+-------+--------+
|Customer_No|    Card_type|     Date|           Category|Transaction Type| Amount| amt_chk|
+-----------+-------------+---------+-------------------+----------------+-------+--------+
|    1000501|Platinum Card| 1/1/2018|           Shopping|           debit|  11.11|  -11.11|
|    1000501|     Checking| 1/2/2018|    Mortgage & Rent|           debit|1247.44|-1247.44|
|    1000501|  Silver Card| 1/2/2018|        Restaurants|           debit|  24.22|  -24.22|
|    1000501|Platinum Card| 1/3/2018|Credit Card Payment|          credit|2298.09| 2298.09|
|    1000501|Platinum Card| 1/4/2018|      Movies & DVDs|           debit|  11.76|  -11.76|
|    1000501|  Silver Card| 1/5/2018|        Restaurants|           debit|  25.85|  -25.85|
|    1000501|  Silver Card| 1/6/2018|   Home Improvement|           debit|  18.45|  -18.45|
|    1000501|     Checking| 1/8/2018|          Utilities|           debit|   45.

In [25]:
df2 = df1.groupBy("customer_No").agg(sum("amt_chk").alias("tot_bal"))

In [26]:
df2.show()

+-----------+------------------+
|customer_No|           tot_bal|
+-----------+------------------+
|    1000531|1657.1499999999999|
|    1000501|           2720.32|
|    1000654| 798.1199999999999|
|    1001863|           2069.64|
|    1001368|2155.7200000000003|
|    1002324|           1601.49|
|    1000210|1705.1399999999999|
+-----------+------------------+



In [27]:
df3=df1.groupBy("customer_no").pivot("Transaction Type").agg(sum("Amount"))

In [29]:
df3.show()

+-----------+------------------+------------------+
|customer_no|            credit|             debit|
+-----------+------------------+------------------+
|    1000531|2864.7999999999997|           1207.65|
|    1000501|           4298.09|           1577.77|
|    1002324|           2761.59|1160.1000000000001|
|    1000210|           2559.91|            854.77|
|    1000654|           2299.27|           1501.15|
|    1001863|           2765.37|            695.73|
|    1001368|           3403.55|1247.8300000000002|
+-----------+------------------+------------------+



In [31]:
df3.withColumn("tot_bal",col("credit")-col("debit")).drop("credit","debit").show()

+-----------+------------------+
|customer_no|           tot_bal|
+-----------+------------------+
|    1000531|1657.1499999999996|
|    1000501|           2720.32|
|    1002324|           1601.49|
|    1000210|1705.1399999999999|
|    1000654| 798.1199999999999|
|    1001863|           2069.64|
|    1001368|2155.7200000000003|
+-----------+------------------+

