In [None]:
import pandas as pd
from pyspark.sql import SparkSession

In [None]:
# The file has a space after every comma (the Spark reads further down show
# ' Age' / ' Engineer'), so strip it at parse time instead of carrying it into
# the column names and string values.
df = pd.read_csv('test01.csv', skipinitialspace=True)
df.head()

Unnamed: 0,Name,Age,Profession
0,Alice,30,Engineer
1,Bob,25,Data Scientist
2,Charlie,35,Teacher
3,Diana,28,Doctor
4,Ethan,40,Artist


In [None]:
# ---- 1️⃣ Install Java 17 + Spark 3.5.1 ----
!apt-get install openjdk-17-jdk-headless -qq > /dev/null
!wget -q -O spark.tgz https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar -xzf spark.tgz -C /content/
!pip install -q pyspark==3.5.1 findspark

# ---- 2️⃣ Set environment variables ----
import os, findspark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"
os.environ["HADOOP_USER_NAME"] = "root"
findspark.init("/content/spark-3.5.1-bin-hadoop3")

# ---- 3️⃣ Create Spark session ----
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Demo01") \
    .master("local[*]") \
    .getOrCreate()

print("✅ Spark session started successfully!")
spark

✅ Spark session started successfully!


In [None]:
# Sanity check: report the running Spark version and display a tiny
# five-row range to confirm the session can execute a job.
print("Spark version:", spark.version)
spark.range(5).show()

Spark version: 3.5.1
+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



In [None]:
# Read with default options: no header handling is requested, so the header
# line comes back as a data row and the columns are named _c0, _c1, _c2
# (visible in the output of the next cell).
df_pyspark = spark.read.csv('test01.csv')

In [None]:
# Display the frame as read without header handling.
df_pyspark.show()

+-------+----+---------------+
|    _c0| _c1|            _c2|
+-------+----+---------------+
|   Name| Age|     Profession|
|  Alice|  30|       Engineer|
|    Bob|  25| Data Scientist|
|Charlie|  35|        Teacher|
|  Diana|  28|         Doctor|
|  Ethan|  40|         Artist|
|  Fiona|  32|         Lawyer|
| George|  29|      Architect|
| Hannah|  27|          Nurse|
|    Ian|  33|           Chef|
|   Jane|  31|         Writer|
+-------+----+---------------+



In [None]:
# Same file again, this time treating the first line as the header row.
df_pyspark = spark.read.csv('test01.csv', header=True)

In [None]:
# Check which Python class the reader returned.
type(df_pyspark)

In [None]:
# First row as a Row object. Note that the Age and Profession values keep the
# space that follows each comma in the file.
df_pyspark.head()

Row(Name='Alice',  Age=' 30',  Profession=' Engineer')

In [None]:
# Column names, types and nullability as Spark sees them.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |--  Age: string (nullable = true)
 |--  Profession: string (nullable = true)



In [None]:
# Project a single column; select() hands back another DataFrame, which the
# printed type confirms.
name_only = df_pyspark.select("Name")
name_only.show()
print(type(name_only))

+-------+
|   Name|
+-------+
|  Alice|
|    Bob|
|Charlie|
|  Diana|
|  Ethan|
|  Fiona|
| George|
| Hannah|
|    Ian|
|   Jane|
+-------+

<class 'pyspark.sql.dataframe.DataFrame'>


In [None]:
# (column name, type) pairs. Every column is a string here: the read above did
# not ask for schema inference.
df_pyspark.dtypes

[('Name', 'string'), (' Age', 'string'), (' Profession', 'string')]

In [None]:
# Summary statistics (count, mean, stddev, min, max) for each column.
df_pyspark.describe().show()

+-------+-----+-----------------+-----------+
|summary| Name|              Age| Profession|
+-------+-----+-----------------+-----------+
|  count|   10|               10|         10|
|   mean| NULL|             31.0|       NULL|
| stddev| NULL|4.320493798938574|       NULL|
|    min|Alice|               25|  Architect|
|    max| Jane|               40|     Writer|
+-------+-----+-----------------+-----------+



In [None]:
from pyspark.sql.functions import col

# ' Age' was read as a string column (see the dtypes output above), so cast it
# explicitly before the numeric comparison instead of relying on implicit
# string-to-number coercion. withColumn() returns a new frame; df_pyspark
# itself is not modified.
df_pyspark.withColumn(
    "Is_Adult",
    (col(" Age").cast("int") >= 21)
).show()

+-------+----+---------------+--------+
|   Name| Age|     Profession|Is_Adult|
+-------+----+---------------+--------+
|  Alice|  30|       Engineer|    true|
|    Bob|  25| Data Scientist|    true|
|Charlie|  35|        Teacher|    true|
|  Diana|  28|         Doctor|    true|
|  Ethan|  40|         Artist|    true|
|  Fiona|  32|         Lawyer|    true|
| George|  29|      Architect|    true|
| Hannah|  27|          Nurse|    true|
|    Ian|  33|           Chef|    true|
|   Jane|  31|         Writer|    true|
+-------+----+---------------+--------+



In [None]:
# The previous cell did not assign its result, so df_pyspark still has no
# Is_Adult column.
df_pyspark.show()

+-------+----+---------------+
|   Name| Age|     Profession|
+-------+----+---------------+
|  Alice|  30|       Engineer|
|    Bob|  25| Data Scientist|
|Charlie|  35|        Teacher|
|  Diana|  28|         Doctor|
|  Ethan|  40|         Artist|
|  Fiona|  32|         Lawyer|
| George|  29|      Architect|
| Hannah|  27|          Nurse|
|    Ian|  33|           Chef|
|   Jane|  31|         Writer|
+-------+----+---------------+



Handling missing values

In [None]:
# Switch to the dataset that contains gaps. header=True uses the first line as
# column names; inferSchema=True asks Spark to detect column types.
df_pyspark = spark.read.csv('test2.csv',header=True, inferSchema=True)

In [None]:
# Show the frame, including its NULL entries.
df_pyspark.show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [None]:
# how="all" only drops rows in which every column is NULL. No row qualifies
# here, so the output matches the unfiltered frame.
df_pyspark.na.drop(how="all").show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [None]:
# Keep only rows that have at least 2 non-null values. `thresh` overrides
# `how`, so the former how="any" argument had no effect and is removed to
# avoid suggesting that any single NULL drops the row.
df_pyspark.na.drop(thresh=2).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [None]:
# Restrict the NULL check to 'Experience': only rows missing that value are
# dropped, NULLs in other columns are kept.
df_pyspark.na.drop(how="any",subset=['Experience']).show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
|     NULL| 34|        10| 38000|
+---------+---+----------+------+



In [None]:
# filling the missing values
# 'Experience' and 'Salary' are numeric columns (the frame was read with
# inferSchema=True), and na.fill ignores subset columns whose type does not
# match the replacement value. The string 'Missing Value' therefore left every
# NULL in place; fill with a numeric value instead.
df_pyspark.na.fill(0, subset=['Experience', 'Salary']).show()

+---------+----+----------+------+
|     Name| age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         1| 15000|
|  Shubham|  23|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [None]:
from pyspark.ml.feature import Imputer

# Columns whose NULLs are replaced using the "mean" strategy; each result is
# written to a new '<column>_imputed' column next to the original.
numeric_columns = ['age', 'Experience', 'Salary']
imputer = Imputer(
    strategy="mean",
    inputCols=numeric_columns,
    outputCols=["{}_imputed".format(name) for name in numeric_columns],
)

In [None]:
# Fit the imputer on the frame, then apply the fitted model to the same frame
# and display the original columns alongside the *_imputed ones.
imputer_model = imputer.fit(df_pyspark)
imputer_model.transform(df_pyspark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         1| 15000|         21|                 1|         15000|
|  Shubham|  23|         2| 18000|         23|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         28|                 5|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 5|         25750|
+---------+----+----------+------+-----------+------------------+--------------+

In [None]:
# NOTE(review): a session was already created earlier in this notebook, so
# getOrCreate() presumably returns that existing session and the 'dataframe'
# app name may not take effect — confirm whether this cell is still needed.
spark = SparkSession.builder.appName('dataframe').getOrCreate()

In [None]:
# Re-read the first dataset, this time with header handling and type inference.
# NOTE(review): Age renders as 30.0 in the next output, i.e. it was not
# inferred as an integer — presumably because of the space after each comma.
# ignoreLeadingWhiteSpace=True may fix that, but it would also change the
# ' Profession' column name and values that a later cell relies on — confirm.
df_pyspark = spark.read.csv('test01.csv',header=True,inferSchema=True)

In [None]:
# Display the re-read frame with inferred types.
df_pyspark.show()

+-------+----+---------------+
|   Name| Age|     Profession|
+-------+----+---------------+
|  Alice|30.0|       Engineer|
|    Bob|25.0| Data Scientist|
|Charlie|35.0|        Teacher|
|  Diana|28.0|         Doctor|
|  Ethan|40.0|         Artist|
|  Fiona|32.0|         Lawyer|
| George|29.0|      Architect|
| Hannah|27.0|          Nurse|
|    Ian|33.0|           Chef|
|   Jane|31.0|         Writer|
+-------+----+---------------+



In [None]:
from pyspark.sql.functions import col

# Both the column name and its values start with a space in this file, hence
# the leading blanks in the two literals below.
is_engineer = col(" Profession") == " Engineer"
df_pyspark.where(is_engineer).select("Name").show()

+-----+
| Name|
+-----+
|Alice|
+-----+



In [None]:
# Load the per-department salary dataset (columns Name, Departments, salary)
# with header handling and type inference.
df_pyspark = spark.read.csv('test3.csv', header=True, inferSchema=True)

In [None]:
# Display the salary rows that the aggregations below operate on.
df_pyspark.show()

+---------+------------+------+
|     Name| Departments|salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [None]:
# GroupBy: total per person. sum() is called without column names; in this
# frame the only column it totals is 'salary' (see the output header).
df_pyspark.groupBy("Name").sum().show()

+---------+-----------+
|     Name|sum(salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [None]:
# Build the department grouping once and reuse it for the three aggregates:
# total salary, average salary, and number of rows per department.
by_department = df_pyspark.groupBy('Departments')
by_department.sum().show()
by_department.mean().show()
by_department.count().show()

+------------+-----------+
| Departments|sum(salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+

+------------+-----------+
| Departments|avg(salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [None]:
# Total salary over the whole frame. agg() already yields a one-row DataFrame
# holding the sum; the previous trailing .count().sum() turned that into a
# plain Python int (the row count) and then raised AttributeError on .sum().
# The key uses the column name exactly as it appears in the schema.
df_pyspark.agg({'salary': 'sum'}).show()

AttributeError: 'int' object has no attribute 'sum'

In [None]:
[]