In [1]:
# Install Java & Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.4.1/spark-3.4.1-bin-hadoop3.tgz
!tar xf spark-3.4.1-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.4.1-bin-hadoop3"

# Initialize SparkSession
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("PySparkOnColab").getOrCreate()
spark


In [2]:
# Sample rows, one tuple per record in the order given by `columns`.
columns = ["id", "name", "country", "salary"]
data = [
    (1, "manish", "India", 10000),
    (2, "Ranish", "USA", 50000),
    (3, "rani", "UK", 5000),
    (4, "sohan", "UK", 25000),
    (5, "Mona", "India", 2000),
]

# Build the DataFrame from the rows and column names, then print it.
df = spark.createDataFrame(data, columns)
df.show()

+---+------+-------+------+
| id|  name|country|salary|
+---+------+-------+------+
|  1|manish|  India| 10000|
|  2|Ranish|    USA| 50000|
|  3|  rani|     UK|  5000|
|  4| sohan|     UK| 25000|
|  5|  Mona|  India|  2000|
+---+------+-------+------+



In [6]:
from pyspark.sql.functions import row_number
from pyspark.sql.window import Window

# Number the rows 1..n in ascending salary order. The window has an ordering
# but no partitioning, so all rows are numbered in a single sequence.
(
    df
    .withColumn("row_number", row_number().over(Window.orderBy("salary")))
    .show()
)


+---+------+-------+------+----------+
| id|  name|country|salary|row_number|
+---+------+-------+------+----------+
|  5|  Mona|  India|  2000|         1|
|  3|  rani|     UK|  5000|         2|
|  1|manish|  India| 10000|         3|
|  4| sohan|     UK| 25000|         4|
|  2|Ranish|    USA| 50000|         5|
+---+------+-------+------+----------+



In [7]:
# Two sample rows holding a date and a timestamp, both given as plain strings.
columns = ["date_column", "timestamp_column"]
data = [
    ("2022-03-15", "2022-03-16 12:34:56.789"),
    ("2022-03-01", "2022-03-16 01:34:56.798"),
]

# Rebinds `df`: from here on it refers to the date/timestamp frame.
df = spark.createDataFrame(data, columns)
df.show()

+-----------+--------------------+
|date_column|    timestamp_column|
+-----------+--------------------+
| 2022-03-15|2022-03-16 12:34:...|
| 2022-03-01|2022-03-16 01:34:...|
+-----------+--------------------+



In [9]:
# The wildcard import also supplies the functions that later cells
# (date_sub, datediff, year, dayofyear) call without importing them.
from pyspark.sql.functions import *

# Show each date next to the same date rendered with the yyyy/MM/dd pattern.
df.select(
    "date_column",
    date_format("date_column", "yyyy/MM/dd").alias("formatted_date"),
).show()

+-----------+--------------+
|date_column|formatted_date|
+-----------+--------------+
| 2022-03-15|    2022/03/15|
| 2022-03-01|    2022/03/01|
+-----------+--------------+



Date Add Function


In [10]:
# Import the one function this cell uses by name, so its origin is visible,
# instead of repeating the wildcard import.
from pyspark.sql.functions import date_add

# Show each date next to the date 10 days after it.
df.select(
    "date_column",
    date_add("date_column", 10).alias("date_after_10_days"),
).show()

+-----------+------------------+
|date_column|date_after_10_days|
+-----------+------------------+
| 2022-03-15|        2022-03-25|
| 2022-03-01|        2022-03-11|
+-----------+------------------+



Date Subtract


In [14]:
# Keep the original date and add a column holding the date 10 days earlier.
df1 = df.select(
    "date_column",
    date_sub("date_column", 10).alias("date_before_10_days"),
)

In [15]:
# Print df1: the original dates alongside the dates 10 days earlier.
df1.show()

+-----------+-------------------+
|date_column|date_before_10_days|
+-----------+-------------------+
| 2022-03-15|         2022-03-05|
| 2022-03-01|         2022-02-19|
+-----------+-------------------+



Date Difference

In [16]:
# Number of days from date_before_10_days up to date_column.
df1.select(
    "date_column",
    "date_before_10_days",
    datediff("date_column", "date_before_10_days").alias("days_difference"),
).show()

+-----------+-------------------+---------------+
|date_column|date_before_10_days|days_difference|
+-----------+-------------------+---------------+
| 2022-03-15|         2022-03-05|             10|
| 2022-03-01|         2022-02-19|             10|
+-----------+-------------------+---------------+



In [17]:
# Extract the year of each date; the derived column is left unaliased.
df1.select(
    "date_column",
    year("date_column"),
).show()

+-----------+-----------------+
|date_column|year(date_column)|
+-----------+-----------------+
| 2022-03-15|             2022|
| 2022-03-01|             2022|
+-----------+-----------------+



In [19]:
# Extract the day-of-year ordinal of each date; the derived column is left unaliased.
df1.select(
    "date_column",
    dayofyear("date_column"),
).show()

+-----------+----------------------+
|date_column|dayofyear(date_column)|
+-----------+----------------------+
| 2022-03-15|                    74|
| 2022-03-01|                    60|
+-----------+----------------------+



Explode


In [20]:
# Each row pairs a teacher name with a list of course names.
columns = ["teacher", "courses"]
data = [
    ("Manish", ["Java", "C", "C++"]),
    ("Arun Sir", ["Spark", "Java", "C++"]),
    ("Shyam", ["Sharp", "C"]),
]

# Rebinds `df`: from here on it refers to the teacher/courses frame.
df = spark.createDataFrame(data, columns)
df.show()


+--------+------------------+
| teacher|           courses|
+--------+------------------+
|  Manish|    [Java, C, C++]|
|Arun Sir|[Spark, Java, C++]|
|   Shyam|        [Sharp, C]|
+--------+------------------+



In [22]:
# Import the two functions this cell uses by name, so their origin is
# visible, instead of repeating the wildcard import.
from pyspark.sql.functions import col, explode

# explode() yields one output row per element of the `courses` array.
# The exploded column is not aliased, so it appears under the name "col".
df.select(
    "teacher",
    explode(col("courses")),
).show()

+--------+-----+
| teacher|  col|
+--------+-----+
|  Manish| Java|
|  Manish|    C|
|  Manish|  C++|
|Arun Sir|Spark|
|Arun Sir| Java|
|Arun Sir|  C++|
|   Shyam|Sharp|
|   Shyam|    C|
+--------+-----+

