In [1]:
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
# Build (or reuse) a Spark session that runs against the 'local' master.
# getOrCreate() keeps this cell safe to re-run: constructing a second
# SparkContext in the same kernel raises, because only one may be active.
spark = SparkSession.builder.master('local').getOrCreate()
# Keep the underlying SparkContext available under its usual short name.
sc = spark.sparkContext

In [2]:
# Load the transactions CSV: the first line is the header, column types are
# inferred from the data, and fields are separated by semicolons.
df_transactions = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .option("delimiter", ";")
    .load('./data/transactions-original.csv')
)

# Preview the first five rows.
df_transactions.limit(5).show()

+------+-------+------------+----------------+-------+
|    id| amount|account_type|transaction_date|country|
+------+-------+------------+----------------+-------+
|179528|-730.86|    Business|      2013-07-10|     SV|
|378343|-946.98|    Personal|      2018-04-06|     YE|
| 75450|7816.92|Professional|      2016-11-20|     SI|
|357719| 704.02|    Business|      2016-11-06|     ID|
|110511| 3462.6|    Personal|      2018-01-18|     BS|
+------+-------+------------+----------------+-------+



In [23]:
# Median and 75th percentile of `amount`; a relative error of 0 requests
# exact quantiles rather than approximations.
quantiles = df_transactions.approxQuantile("amount", [0.5, 0.75], 0)
print(quantiles)

# Values are read in the same order as the requested probabilities.
val_50_perc, val_75_perc = quantiles[0], quantiles[1]
print(val_50_perc, val_75_perc)

[4500.12, 7247.27]
4500.12 7247.27


In [24]:
from pyspark.sql.functions import when

# Label every transaction by where its amount falls relative to the
# quantiles computed in the previous cell:
#   amount <  median                      -> 'low'
#   median <= amount < 75th percentile    -> 'average'
#   everything else                       -> 'high'
# The thresholds come from val_50_perc / val_75_perc instead of pasted
# literals, so the labels stay correct when the input data changes.
(
    df_transactions
    .select('id', 'amount', 'account_type', 'transaction_date', 'country')
    .withColumn(
        "level",
        when(df_transactions.amount < val_50_perc, 'low')
        .when(
            (df_transactions.amount >= val_50_perc) & (df_transactions.amount < val_75_perc),
            'average',
        )
        .otherwise('high'),
    )
    .show()
)

+------+-------+------------+----------------+-------+-------+
|    id| amount|account_type|transaction_date|country|  level|
+------+-------+------------+----------------+-------+-------+
|179528|-730.86|    Business|      2013-07-10|     SV|    low|
|378343|-946.98|    Personal|      2018-04-06|     YE|    low|
| 75450|7816.92|Professional|      2016-11-20|     SI|   high|
|357719| 704.02|    Business|      2016-11-06|     ID|    low|
|110511| 3462.6|    Personal|      2018-01-18|     BS|    low|
|461830| 762.81|Professional|      2017-06-20|     CN|    low|
| 30180|5390.24|Professional|      2021-05-26|     GN|average|
| 65398|4765.77|    Personal|      2018-05-01|     TR|average|
|170899|8775.89|    Business|      2013-10-16|     SK|   high|
|234300|8455.18|Professional|      2015-10-06|     LU|   high|
|208027| 6244.1|    Business|      2020-03-06|     AE|average|
|161212|5904.56|    Personal|      2016-09-07|     EG|average|
|105372|4079.76|Professional|      2015-02-12|     MT|    low|

In [27]:
def calculate_quantiles(df_transactions, relative_error=0):
    """Return the transactions with a ``level`` label derived from ``amount``.

    The median and the 75th percentile of the ``amount`` column are computed
    with ``DataFrame.approxQuantile`` and used as thresholds:

    * ``amount`` below the median                      -> ``'low'``
    * median <= ``amount`` < 75th percentile           -> ``'average'``
    * anything else                                    -> ``'high'``

    Args:
        df_transactions: Spark DataFrame that contains at least the columns
            ``id``, ``amount``, ``account_type``, ``transaction_date`` and
            ``country``.
        relative_error: Relative target precision forwarded to
            ``approxQuantile``. The default ``0`` requests exact quantiles
            (the original behaviour); a small positive value trades
            precision for a cheaper computation.

    Returns:
        A new DataFrame with the five columns listed above plus the string
        column ``level``.

    Note:
        Rows for which neither comparison evaluates to true end up in the
        ``otherwise`` branch and are labelled ``'high'``. This includes rows
        with a null ``amount`` -- NOTE(review): confirm this is intended.
    """
    from pyspark.sql.functions import col, when

    quantiles = df_transactions.approxQuantile("amount", [0.5, 0.75], relative_error)

    # Results are positional: index 0 is the median, index 1 the 75th percentile.
    val_50_perc = quantiles[0]
    val_75_perc = quantiles[1]

    # Refer to the column by name so the expression is resolved against the
    # selected frame rather than being tied to the input DataFrame object.
    amount = col("amount")

    # `when` branches are tested in order, so the second branch is only
    # reached when `amount < val_50_perc` did not match; an explicit
    # lower-bound test there would be redundant.
    level = (
        when(amount < val_50_perc, 'low')
        .when(amount < val_75_perc, 'average')
        .otherwise('high')
    )

    return (
        df_transactions
        .select('id', 'amount', 'account_type', 'transaction_date', 'country')
        .withColumn("level", level)
    )


In [28]:
# Apply the labelling function to the loaded transactions; `a` is the
# resulting DataFrame with the extra `level` column.
a = calculate_quantiles(df_transactions)

# Print the leading rows of the labelled DataFrame as a text table.
a.show()

+------+-------+------------+----------------+-------+-------+
|    id| amount|account_type|transaction_date|country|  level|
+------+-------+------------+----------------+-------+-------+
|179528|-730.86|    Business|      2013-07-10|     SV|    low|
|378343|-946.98|    Personal|      2018-04-06|     YE|    low|
| 75450|7816.92|Professional|      2016-11-20|     SI|   high|
|357719| 704.02|    Business|      2016-11-06|     ID|    low|
|110511| 3462.6|    Personal|      2018-01-18|     BS|    low|
|461830| 762.81|Professional|      2017-06-20|     CN|    low|
| 30180|5390.24|Professional|      2021-05-26|     GN|average|
| 65398|4765.77|    Personal|      2018-05-01|     TR|average|
|170899|8775.89|    Business|      2013-10-16|     SK|   high|
|234300|8455.18|Professional|      2015-10-06|     LU|   high|
|208027| 6244.1|    Business|      2020-03-06|     AE|average|
|161212|5904.56|    Personal|      2016-09-07|     EG|average|
|105372|4079.76|Professional|      2015-02-12|     MT|    low|