In [14]:
from pyspark.sql import SparkSession

In [31]:
from pyspark.sql import functions
from pyspark.sql.types import IntegerType

In [16]:
spark = SparkSession.builder.master("local[2]").appName("DataSet").getOrCreate()

In [17]:
filepath = "../../data/users_data.csv"
dataframe = spark.read.format("csv").options(path=filepath,header=True,inferSchema = True,delimiter = ",").load()

In [18]:
dataframe.show()

+---+---------+----------+------+-----+---------------+------------+------------+-------+---------------+
| id|user_name|    mobile|salary|bonus|user_created_at|        addr|        city|country|address_created|
+---+---------+----------+------+-----+---------------+------------+------------+-------+---------------+
|  1|   taukir|8010339935| 50000|10000|     17-11-2023|    Richmond|         B.C| Canada|     17-11-2023|
|  1|   taukir|8010339935| 50000|12000|     17-11-2023|natesh puram|      Meerut|  India|     17-11-2023|
|  2|     khan|8010223369| 40000|15000|     17-11-2023|Khhan Market|       Delhi|  India|     17-11-2023|
|  3|      Sab|  96301452| 35000|10000|     17-11-2023|Sadar Market|      Kanpur|  India|     17-11-2023|
|  4|      sam| 963201458| 25061|12000|     17-11-2023|   Street 16|Los angeles |     US|     17-11-2023|
|  5|   zeenat|   1560489| 46000| 5000|     17-11-2023|    Richmond|         B.C| Canada|     17-11-2023|
|  5|   zeenat|   1560489| 46000| 7000|     17

## Count

In [6]:
# dataframe.count()
# dataframe.filter(dataframe.country == 'India').count()

# Distinct

In [19]:
dataframe.distinct().count()

7

# Drop Duplicate

In [20]:
dataframe.dropDuplicates().orderBy('id').show(2)

+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
| id|user_name|    mobile|salary|bonus|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|12000|     17-11-2023|natesh puram|Meerut|  India|     17-11-2023|
|  1|   taukir|8010339935| 50000|10000|     17-11-2023|    Richmond|   B.C| Canada|     17-11-2023|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
only showing top 2 rows



# Sorting/OrderBy

In [21]:
# dataframe.sort(dataframe.salary).show()
# dataframe.sort(dataframe.id,dataframe.salary).show()
# dataframe.sort(dataframe.country).show()
dataframe.sort(dataframe.id.asc(),dataframe.salary.desc()).show(2)

+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
| id|user_name|    mobile|salary|bonus|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|10000|     17-11-2023|    Richmond|   B.C| Canada|     17-11-2023|
|  1|   taukir|8010339935| 50000|12000|     17-11-2023|natesh puram|Meerut|  India|     17-11-2023|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+
only showing top 2 rows



In [10]:
dataframe.orderBy(dataframe.id.asc(),dataframe.salary.desc()).show(2)

+---+---------+----------+------+---------------+----------+--------+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|      addr|    city|country|address_created|
+---+---------+----------+------+---------------+----------+--------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|  Richmond|     B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|          10000|17-11-2023|Richmond|    B.C|         Canada|
+---+---------+----------+------+---------------+----------+--------+-------+---------------+
only showing top 2 rows



# GroupBy

In [11]:
dataframe.show(3)

+---+---------+----------+------+---------------+------------+------+-------+---------------+
| id|user_name|    mobile|salary|user_created_at|        addr|  city|country|address_created|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
|  1|   taukir|8010339935| 50000|     2023-11-17|    Richmond|   B.C| Canada|     2023-11-17|
|  1|   taukir|8010339935| 50000|     2023-11-17|natesh puram|Meerut|  India|     2023-11-17|
|  2|     khan|8010223369| 40000|     2023-11-17|Khhan Market| Delhi|  India|     2023-11-17|
+---+---------+----------+------+---------------+------------+------+-------+---------------+
only showing top 3 rows



In [22]:
# dataframe.groupBy('country').count().show()
# dataframe.groupBy('country').sum('salary').show()
# dataframe.groupBy('country').max('salary').show()
# dataframe.groupBy('country').min('salary').show()
dataframe.groupBy('country').avg('salary').show()

+-------+-----------+
|country|avg(salary)|
+-------+-----------+
|  India|    42750.0|
|     US|    25061.0|
| Canada|    48000.0|
+-------+-----------+



In [23]:
dataframe.groupBy('country','id').agg(
    functions.count("*").alias("total_count"),
    functions.sum("salary").alias("sum_sal"),
    functions.max("salary").alias("max_sal"),
    functions.sum(functions.expr("salary + mobile" )).alias("max_sal")
).show()

+-------+---+-----------+-------+-------+----------+
|country| id|total_count|sum_sal|max_sal|   max_sal|
+-------+---+-----------+-------+-------+----------+
|  India|  2|          1|  40000|  40000|8010263369|
|     US|  4|          1|  25061|  25061| 963226519|
| Canada|  5|          1|  46000|  46000|   1606489|
|  India|  3|          1|  35000|  35000|  96336452|
|  India|  5|          1|  46000|  46000|   1606489|
|  India|  1|          1|  50000|  50000|8010389935|
| Canada|  1|          1|  50000|  50000|8010389935|
+-------+---+-----------+-------+-------+----------+



In [54]:
# dataframe.filter(dataframe.salary > 30000).groupBy('country').agg(functions.count("*").alias("cnt")).show()
# dataframe.filter(dataframe.salary > 30000).groupBy('country').agg(
#     functions.count("*").alias("cnt")).where(dataframe.cnt > 4).show() # we get Error as cnd not attribute

dataframe.filter(dataframe.salary > 30000).groupBy('country').agg(
    functions.count("*").alias("cnt")).where(functions.col('cnt') > 2).show()  # it will work with col as it is within context

+-------+---+
|country|cnt|
+-------+---+
|  India|  4|
+-------+---+



## UDF

In [35]:
def total_salary(salary, bonus):
    return (salary + bonus)
total_salary_udf = functions.udf(lambda x,y:total_salary(x,y), IntegerType())

In [37]:
dataframe2 = dataframe.withColumn("total_salary",total_salary_udf(dataframe.salary,dataframe.bonus))

In [39]:
dataframe2.show(3)

+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
| id|user_name|    mobile|salary|bonus|user_created_at|        addr|  city|country|address_created|total_salary|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
|  1|   taukir|8010339935| 50000|10000|     17-11-2023|    Richmond|   B.C| Canada|     17-11-2023|       60000|
|  1|   taukir|8010339935| 50000|12000|     17-11-2023|natesh puram|Meerut|  India|     17-11-2023|       62000|
|  2|     khan|8010223369| 40000|15000|     17-11-2023|Khhan Market| Delhi|  India|     17-11-2023|       55000|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
only showing top 3 rows



In [44]:
## Second Appraoch
dataframe.withColumn("total_salary",functions.expr("salary + bonus" )).show(3)

+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
| id|user_name|    mobile|salary|bonus|user_created_at|        addr|  city|country|address_created|total_salary|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
|  1|   taukir|8010339935| 50000|10000|     17-11-2023|    Richmond|   B.C| Canada|     17-11-2023|       60000|
|  1|   taukir|8010339935| 50000|12000|     17-11-2023|natesh puram|Meerut|  India|     17-11-2023|       62000|
|  2|     khan|8010223369| 40000|15000|     17-11-2023|Khhan Market| Delhi|  India|     17-11-2023|       55000|
+---+---------+----------+------+-----+---------------+------------+------+-------+---------------+------------+
only showing top 3 rows



## Cache

In [55]:
df2 = dataframe.filter(dataframe.salary > 30000).groupBy('country').agg(functions.count("*").alias("cnt"))
# where(functions.col('cnt') > 2).show()

In [56]:
df2.cache()

DataFrame[country: string, cnt: bigint]

In [57]:
df2.where(functions.col('cnt') > 2).show()

+-------+---+
|country|cnt|
+-------+---+
|  India|  4|
+-------+---+

