<a href="https://colab.research.google.com/github/vsubu1/PySpark_Tutorial/blob/main/PySpark_Tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse the one already running.
spark = (
    SparkSession.builder
    .appName("abcd")
    .getOrCreate()
)


In [2]:
# Build a small (name, age) DataFrame from in-memory rows.
people = [("leo", 10), ("li", 18), ("hue", 56), ("bob", 10)]
df = spark.createDataFrame(people, schema=["name", "age"])

In [3]:
# Print the DataFrame as a text table (show()'s defaults spelled out:
# at most 20 rows, long cell values truncated).
df.show(n=20, truncate=True)

+----+---+
|name|age|
+----+---+
| leo| 10|
|  li| 18|
| hue| 56|
| bob| 10|
+----+---+



In [4]:
from pyspark.sql.functions import col, when

# Derive a categorical "life_stage" column from "age".
#
# The when() branches are evaluated in order and use contiguous upper bounds,
# so every non-null age gets exactly one label:
#   age < 13        -> "child"
#   13 <= age < 20  -> "teen"
#   otherwise       -> "adult"
# Bounds with a gap (e.g. "< 12" followed by "between 13 and 19") would let
# age 12 fall through to "adult".
df1 = df.withColumn(
    "life_stage",
    when(col("age") < 13, "child")
    .when(col("age") < 20, "teen")
    .otherwise("adult"),
)

df1.show()




+----+---+----------+
|name|age|life_stage|
+----+---+----------+
| leo| 10|     child|
|  li| 18|      teen|
| hue| 56|     adult|
| bob| 10|     child|
+----+---+----------+



In [5]:
# Keep only the rows whose life_stage is one of the listed values.
df1.where(col("life_stage").isin(["child", "adult"])).show()

+----+---+----------+
|name|age|life_stage|
+----+---+----------+
| leo| 10|     child|
| hue| 56|     adult|
| bob| 10|     child|
+----+---+----------+



In [14]:
# Keep only the rows where age equals 10.
df1.where(df1["age"] == 10).show()

+----+---+----------+
|name|age|life_stage|
+----+---+----------+
| leo| 10|     child|
| bob| 10|     child|
+----+---+----------+



In [6]:
# Aggregate over the whole DataFrame: average of the "age" column.
from pyspark.sql.functions import avg

df1.agg(avg("age")).show()

+--------+
|avg(age)|
+--------+
|    23.5|
+--------+



In [7]:
# Same average, expressed in Spark SQL. The {df1} placeholder is bound to the
# DataFrame passed as the df1 keyword argument.
spark.sql(
    "select avg(age) from {df1}",
    df1=df1,
).show()

+--------+
|avg(age)|
+--------+
|    23.5|
+--------+



In [8]:
# Total age per life_stage group.
df1.groupBy("life_stage").agg({"age": "sum"}).show()

+----------+--------+
|life_stage|sum(age)|
+----------+--------+
|     child|      20|
|      teen|      18|
|     adult|      56|
+----------+--------+



In [9]:
# Average age per life_stage, expressed in Spark SQL. The {df1} placeholder is
# bound to the DataFrame passed as the df1 keyword argument.
spark.sql(
    "select life_stage, avg(age) from {df1} group By life_stage",
    df1=df1,
).show()

+----------+--------+
|life_stage|avg(age)|
+----------+--------+
|     child|    10.0|
|      teen|    18.0|
|     adult|    56.0|
+----------+--------+



In [10]:
# Persist df1 as the table "testTable" so later cells can query it by name.
# mode("overwrite") keeps this cell re-runnable: with the default save mode,
# saveAsTable raises AnalysisException if testTable already exists from an
# earlier run.
df1.write.mode("overwrite").saveAsTable("testTable")

In [11]:
# Read every row back from the saved table via Spark SQL.
test_table_rows = spark.sql("select * from testTable")
test_table_rows.show()

+----+---+----------+
|name|age|life_stage|
+----+---+----------+
| hue| 56|     adult|
| bob| 10|     child|
| leo| 10|     child|
|  li| 18|      teen|
+----+---+----------+



In [12]:
# Row filter expressed as a SQL WHERE clause on the saved table.
adult_rows = spark.sql("select * from testTable where life_stage='adult'")
adult_rows.show()

+----+---+----------+
|name|age|life_stage|
+----+---+----------+
| hue| 56|     adult|
+----+---+----------+



In [96]:
# Load the pipe-delimited category CSV: the first line is the header and
# column types are inferred from the data.
text_file = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .option("sep", '|')
    .csv("file:/content/category.csv")
)

text_file.show()


+-----------+-----------+-------------------+
|CATEGORY_ID|       NAME|        LAST_UPDATE|
+-----------+-----------+-------------------+
|          1|     Action|2006-02-15 04:46:27|
|          2|  Animation|2006-02-15 04:46:27|
|          3|   Children|2006-02-15 04:46:27|
|          4|   Classics|2006-02-15 04:46:27|
|          5|     Comedy|2006-02-15 04:46:27|
|          6|Documentary|2006-02-15 04:46:27|
|          7|      Drama|2006-02-15 04:46:27|
|          8|     Family|2006-02-15 04:46:27|
|          9|    Foreign|2006-02-15 04:46:27|
|         10|      Games|2006-02-15 04:46:27|
|         11|     Horror|2006-02-15 04:46:27|
|         12|      Music|2006-02-15 04:46:27|
|         13|        New|2006-02-15 04:46:27|
|         14|     Sci-Fi|2006-02-15 04:46:27|
|         15|     Sports|2006-02-15 04:46:27|
|         16|     Travel|2006-02-15 04:46:27|
|         17|      Drama|2006-02-15 04:46:27|
|         18|     Family|2006-02-15 04:46:27|
|         19|    Foreign|2006-02-1

In [103]:
# prompt: find the number of occurrences of each word present in the text_file dataframe

from pyspark.sql.functions import explode, split, lower, regexp_replace

# Project just the NAME column. No splitting or exploding happens in this
# cell: each NAME value is counted as a whole.
words_df = text_file.select("NAME")
words_df.show()

# Count how many rows share each NAME value, then display the counts.
word_counts = words_df.groupBy(col("NAME")).count()
word_counts.show()

+-----------+
|       NAME|
+-----------+
|     Action|
|  Animation|
|   Children|
|   Classics|
|     Comedy|
|Documentary|
|      Drama|
|     Family|
|    Foreign|
|      Games|
|     Horror|
|      Music|
|        New|
|     Sci-Fi|
|     Sports|
|     Travel|
|      Drama|
|     Family|
|    Foreign|
|      Games|
+-----------+
only showing top 20 rows

+-----------+-----+
|       NAME|count|
+-----------+-----+
|    Foreign|    2|
|     Sports|    1|
|      Drama|    2|
|Documentary|    1|
|     Travel|    1|
|     Family|    2|
|      Games|    2|
|   Classics|    1|
|  Animation|    1|
|      Music|    2|
|     Horror|    2|
|        New|    1|
|     Comedy|    1|
|   Children|    1|
|     Action|    1|
|     Sci-Fi|    1|
+-----------+-----+



+-----------+-----+
|       word|count|
+-----------+-----+
|     travel|    1|
|      scifi|    1|
|documentary|    1|
|        new|    1|
|     action|    1|
|  animation|    1|
|    foreign|    1|
|     family|    1|
|     horror|    1|
|      games|    1|
|      music|    1|
|      drama|    1|
|     sports|    1|
|   children|    1|
|   classics|    1|
|     comedy|    1|
+-----------+-----+



In [97]:

# prompt: Find number of occurrences in file using explode function

# Step 1: strip every character that is not a letter or whitespace from NAME.
letters_only = regexp_replace(col("NAME"), "[^a-zA-Z\\s]", "")
words_df = text_file.select("CATEGORY_ID", letters_only.alias("word"))
words_df.show()

# Step 2: drop rows whose cleaned name is an empty string.
words_df = words_df.where(col("word") != "")
words_df.show()

# Step 3: split the cleaned name on runs of whitespace into an array column.
tokens = split(col("word"), "\\s+")
words_df = words_df.select("CATEGORY_ID", tokens.alias("word"))
words_df.show()

# Step 4: explode the array so each element gets its own row.
words_df = words_df.select("CATEGORY_ID", explode("word").alias("explode"))
words_df.show()

# Step 5: count the rows per exploded word.
words_df.groupBy("explode").count().show()





+-----------+-----------+
|CATEGORY_ID|       word|
+-----------+-----------+
|          1|     Action|
|          2|  Animation|
|          3|   Children|
|          4|   Classics|
|          5|     Comedy|
|          6|Documentary|
|          7|      Drama|
|          8|     Family|
|          9|    Foreign|
|         10|      Games|
|         11|     Horror|
|         12|      Music|
|         13|        New|
|         14|      SciFi|
|         15|     Sports|
|         16|     Travel|
|         17|      Drama|
|         18|     Family|
|         19|    Foreign|
|         20|      Games|
+-----------+-----------+
only showing top 20 rows

+-----------+-----------+
|CATEGORY_ID|       word|
+-----------+-----------+
|          1|     Action|
|          2|  Animation|
|          3|   Children|
|          4|   Classics|
|          5|     Comedy|
|          6|Documentary|
|          7|      Drama|
|          8|     Family|
|          9|    Foreign|
|         10|      Games|
|         11

In [113]:
# prompt: Find number of occurrences in file using explode function

# Start from the raw id/name pairs (no character cleanup in this variant).
words_df = text_file.select("CATEGORY_ID", "NAME")
words_df.show()

# Split NAME on runs of whitespace into an array column called "word".
name_tokens = split(col("NAME"), "\\s+")
words_df = words_df.select("CATEGORY_ID", name_tokens.alias("word"))
words_df.show()

# Explode the array so each element gets its own row.
words_df = words_df.select("CATEGORY_ID", explode("word").alias("explode"))
words_df.show()

# Count the rows per exploded word.
words_df.groupBy("explode").count().show()


+-----------+-----------+
|CATEGORY_ID|       NAME|
+-----------+-----------+
|          1|     Action|
|          2|  Animation|
|          3|   Children|
|          4|   Classics|
|          5|     Comedy|
|          6|Documentary|
|          7|      Drama|
|          8|     Family|
|          9|    Foreign|
|         10|      Games|
|         11|     Horror|
|         12|      Music|
|         13|        New|
|         14|     Sci-Fi|
|         15|     Sports|
|         16|     Travel|
|         17|      Drama|
|         18|     Family|
|         19|    Foreign|
|         20|      Games|
+-----------+-----------+
only showing top 20 rows

+-----------+-------------+
|CATEGORY_ID|         word|
+-----------+-------------+
|          1|     [Action]|
|          2|  [Animation]|
|          3|   [Children]|
|          4|   [Classics]|
|          5|     [Comedy]|
|          6|[Documentary]|
|          7|      [Drama]|
|          8|     [Family]|
|          9|    [Foreign]|
|         10| 