In [0]:
# Create sample data with an array column containing nulls
data = [
    (1, ["Apple", "Banana", "Orange"]),
    (2, ["Grapes", None, "Watermelon"]),
    (3, ["Pineapple", "Mango", None]),
    (4,None)
]

# Create DataFrame with array column and null values
df = spark.createDataFrame(data, ["id", "fruits"])

In [0]:
display(df)
df.printSchema()


id,fruits
1,"List(Apple, Banana, Orange)"
2,"List(Grapes, null, Watermelon)"
3,"List(Pineapple, Mango, null)"
4,


root
 |-- id: long (nullable = true)
 |-- fruits: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [0]:
from pyspark.sql.functions import explode

display(df.select(df.id,explode(df.fruits)))

id,col
1,Apple
1,Banana
1,Orange
2,Grapes
2,
2,Watermelon
3,Pineapple
3,Mango
3,


In [0]:
from pyspark.sql.functions import explode_outer
display(df.select(df.id,explode_outer(df.fruits)))

id,col
1,Apple
1,Banana
1,Orange
2,Grapes
2,
2,Watermelon
3,Pineapple
3,Mango
3,
4,


In [0]:
data = [
    (1, {"name": "Alice", "age": 25}),
    (2, {"name": "Bob", "age": None}),
    (3, {"name": None, "age": 30}),
    (4,None)
]

# Create DataFrame with MapType column and null values
df2 = spark.createDataFrame(data, ["id", "person"])

In [0]:
display(df2)

id,person
1,"Map(name -> Alice, age -> 25)"
2,"Map(name -> Bob, age -> null)"
3,"Map(name -> null, age -> 30)"
4,


In [0]:
display(df2.select(df2.id,explode(df2.person)))

id,key,value
1,name,Alice
1,age,25
2,name,Bob
2,age,
3,name,
3,age,30


In [0]:
from pyspark.sql.functions import explode_outer
display(df2.select(df2.id,explode_outer(df2.person)))

id,key,value
1,name,Alice
1,age,25
2,name,Bob
2,age,
3,name,
3,age,30
4,,


In [0]:
from pyspark.sql.functions import posexplode
display(df.select(df.id,posexplode(df.fruits)))
display(df2.select(df2.id,posexplode(df2.person)))

id,pos,col
1,0,Apple
1,1,Banana
1,2,Orange
2,0,Grapes
2,1,
2,2,Watermelon
3,0,Pineapple
3,1,Mango
3,2,


id,pos,key,value
1,0,name,Alice
1,1,age,25
2,0,name,Bob
2,1,age,
3,0,name,
3,1,age,30


In [0]:
# Create sample data
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
    ("Dave", 40),
    ("Eve", 45)
]

# Create DataFrame
df3= spark.createDataFrame(data, ["name", "age"])

In [0]:
from pyspark.sql.functions import col, when

display(df3)

name,age
Alice,25
Bob,30
Charlie,35
Dave,40
Eve,45


In [0]:
from pyspark.sql.functions import col, when

display(df3.withColumn("age_status", when(col("age")<=30,'Young')
                    .when(col("age")>35,"milldle age")
                    .otherwise("Unknown")))

name,age,age_status
Alice,25,Young
Bob,30,Young
Charlie,35,Unknown
Dave,40,milldle age
Eve,45,milldle age


In [0]:

import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()

simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000) \
  ]

columns= ["employee_name","department","state","salary","age","bonus"]
dfunion = spark.createDataFrame(data = simpleData, schema = columns)
dfunion.printSchema()
dfunion.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
+-------------+----------+-----+------+---+-----+



In [0]:

simpleData2 = [("James","Sales","NY",90000,34,10000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns2= ["employee_name","department","state","salary","age","bonus"]

dfunion2 = spark.createDataFrame(data = simpleData2, schema = columns2)

dfunion2.printSchema()
dfunion2.show(truncate=False)


root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [0]:
uniondf=dfunion.union(dfunion2)
display(uniondf)

employee_name,department,state,salary,age,bonus
James,Sales,NY,90000,34,10000
Michael,Sales,NY,86000,56,20000
Robert,Sales,CA,81000,30,23000
Maria,Finance,CA,90000,24,23000
James,Sales,NY,90000,34,10000
Maria,Finance,CA,90000,24,23000
Jen,Finance,NY,79000,53,15000
Jeff,Marketing,CA,80000,25,18000
Kumar,Marketing,NY,91000,50,21000


In [0]:
uniondf=dfunion.union(dfunion2).distinct()
display(uniondf)

employee_name,department,state,salary,age,bonus
James,Sales,NY,90000,34,10000
Michael,Sales,NY,86000,56,20000
Robert,Sales,CA,81000,30,23000
Maria,Finance,CA,90000,24,23000
Jen,Finance,NY,79000,53,15000
Jeff,Marketing,CA,80000,25,18000
Kumar,Marketing,NY,91000,50,21000


In [0]:

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr
#Create spark session
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]
dfpivot = spark.createDataFrame(data = data, schema = columns)
dfpivot.printSchema()
dfpivot.show(truncate=False)


root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
|Banana |1000  |USA    |
|Carrots|1500  |USA    |
|Beans  |1600  |USA    |
|Orange |2000  |USA    |
|Orange |2000  |USA    |
|Banana |400   |China  |
|Carrots|1200  |China  |
|Beans  |1500  |China  |
|Orange |4000  |China  |
|Banana |2000  |Canada |
|Carrots|2000  |Canada |
|Beans  |2000  |Mexico |
+-------+------+-------+



In [0]:
pivotdf=dfpivot.groupBy('product').pivot("country").sum('Amount')
display(pivotdf)

product,Canada,China,Mexico,USA
Orange,,4000,,4000
Beans,,1500,2000.0,1600
Banana,2000.0,400,,1000
Carrots,2000.0,1200,,1500


In [0]:

from pyspark.sql.functions import expr
unpivotExpr = "stack(3, 'Canada', Canada, 'China', China, 'Mexico', Mexico) as (Country,Total)"
unPivotDF = pivotdf.select("Product", expr(unpivotExpr)) \
    .where("Total is not null")
unPivotDF.show(truncate=False)
unPivotDF.show()

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
|Orange |China  |4000 |
|Beans  |China  |1500 |
|Beans  |Mexico |2000 |
|Banana |Canada |2000 |
|Banana |China  |400  |
|Carrots|Canada |2000 |
|Carrots|China  |1200 |
+-------+-------+-----+

+-------+-------+-----+
|Product|Country|Total|
+-------+-------+-----+
| Orange|  China| 4000|
|  Beans|  China| 1500|
|  Beans| Mexico| 2000|
| Banana| Canada| 2000|
| Banana|  China|  400|
|Carrots| Canada| 2000|
|Carrots|  China| 1200|
+-------+-------+-----+



In [0]:
dbutils.fs.mkdirs("dbfs:/dbfs/FileStore/tables/interview_practice")

Out[20]: True

In [0]:
dfmodes=spark.read.csv("dbfs:/FileStore/croourtrecords.csv",header=True,mode="DROPMALFORMED")
display(dfmodes)

id,name,age
1,John,25
2,Alice,30
3,Bob,35
4,Sarah,InvalidAge
5,Michael,40


In [0]:
dfmodes = spark.read.format("csv") \
    .option("mode", "DROPMALFORMED") \
    .option("header", "true") \
    .load("dbfs:/FileStore/croourtrecords.csv")

display(dfmodes)

id,name,age
1,John,25
2,Alice,30
3,Bob,35
4,Sarah,InvalidAge
5,Michael,40


In [0]:
dfmodes.orderBy('age').show()

+---+-------+----------+
| id|   name|       age|
+---+-------+----------+
|  1|   John|        25|
|  2|  Alice|        30|
|  3|    Bob|        35|
|  5|Michael|        40|
|  4|  Sarah|InvalidAge|
+---+-------+----------+



In [0]:
data = [("John", 25, "USA"),
        ("Jane", 30, "USA"),
        ("John", 25, "USA"),
        ("Alice", 35, "Canada"),
        ("Bob", 30, "USA")]

# Create the DataFrame
dfduplicates = spark.createDataFrame(data, ["Name", "Age", "Country"])

# Display the original DataFrame

dfduplicates.show()

+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
| John| 25|    USA|
| Jane| 30|    USA|
| John| 25|    USA|
|Alice| 35| Canada|
|  Bob| 30|    USA|
+-----+---+-------+



In [0]:
dfduplicates.distinct().show()

+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
| John| 25|    USA|
| Jane| 30|    USA|
|Alice| 35| Canada|
|  Bob| 30|    USA|
+-----+---+-------+



In [0]:
dfduplicates.dropDuplicates(['Country']).show()

+-----+---+-------+
| Name|Age|Country|
+-----+---+-------+
|Alice| 35| Canada|
| John| 25|    USA|
+-----+---+-------+



In [0]:
data = [("John", "USA", 100),
        ("Jane", "USA", 200),
        ("John", "Canada", 150),
        ("Alice", "USA", 300),
        ("Bob", "Canada", 250)]

# Create the DataFrame
dfgroupby = spark.createDataFrame(data, ["Name", "Country", "Sales"])

In [0]:
display(dfgroupby.groupBy('Country').sum('Sales'))

Country,sum(Sales)
USA,600
Canada,400


In [0]:
display(dfgroupby.groupBy('Country').min('Sales'))

Country,min(Sales)
USA,100
Canada,150


In [0]:
display(dfgroupby.groupBy('Country').max('Sales'))

Country,max(Sales)
USA,300
Canada,250


In [0]:
display(dfgroupby.groupBy('Country').count())

Country,count
USA,3
Canada,2


In [0]:
from pyspark.sql.functions import count,min,max

dfgroupby.groupBy('Country').agg(count('*').alias("count"),\
                                         min('Sales').alias("min"),\
                                         max('sales').alias("max")).show()

+-------+-----+---+---+
|Country|count|min|max|
+-------+-----+---+---+
|    USA|    3|100|300|
| Canada|    2|150|250|
+-------+-----+---+---+



In [0]:
# Create the first DataFrame
data1 = [("John", 25),
         ("Jane", 30),
         ("Alice", 28)]

df1 = spark.createDataFrame(data1, ["Name", "Age"])

# Create the second DataFrame
data2 = [("Bob", 1000),
         ("Mary", 2000),
         ("Peter",3000)]

df2 = spark.createDataFrame(data2, ["Name", "Salary"])

In [0]:
df1.show()
df2.show()

+-----+---+
| Name|Age|
+-----+---+
| John| 25|
| Jane| 30|
|Alice| 28|
+-----+---+

+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  1000|
| Mary|  2000|
|Peter|  3000|
+-----+------+



In [0]:
df1.unionByName(df2).show()

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4465714466998865>:1[0m
[0;32m----> 1[0m [43mdf1[49m[38;5;241;43m.[39;49m[43munionByName[49m[43m([49m[43mdf2[49m[43m)[49m[38;5;241m.[39mshow()

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kwargs)[0m
[1;32m     46[0m start [38;5;241m=[39m time[38;5;241m.[39mperf_counter()
[1;32m     47[0m [38;5;28;01mtry[39;00m:
[0;32m---> 48[0m     res [38;5;241m=[39m [43mfunc[49m[43m([49m[38;5;241;43m*[39;49m[43margs[49m[43m,[49m[43m [49m[38;5;241;43m*[39;49m[38;5;241;43m*[39;49m[43mkwargs[49m[43m)[49m
[1;32m     49[0m     logger[38;5;241m.[39mlog_success(
[1;32m     50[0m         module_name, class_name, function_name, time[38;5;241m.[39mperf_counter() [38;5;2

In [0]:
df1.unionByName(df2,allowMissingColumns=True).show()

+-----+----+------+
| Name| Age|Salary|
+-----+----+------+
| John|  25|  null|
| Jane|  30|  null|
|Alice|  28|  null|
|  Bob|null|  1000|
| Mary|null|  2000|
|Peter|null|  3000|
+-----+----+------+



In [0]:
from pyspark.sql.functions import col
df2.select('Name','Salary').show()
df2.select(df2.Name).show()
df2.select(df2['Name']).show()
df2.select(['Name','Salary']).show()
df2.select('*').show()
df2.select([col for col in df2.columns]).show()


+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  1000|
| Mary|  2000|
|Peter|  3000|
+-----+------+

+-----+
| Name|
+-----+
|  Bob|
| Mary|
|Peter|
+-----+

+-----+
| Name|
+-----+
|  Bob|
| Mary|
|Peter|
+-----+

+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  1000|
| Mary|  2000|
|Peter|  3000|
+-----+------+

+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  1000|
| Mary|  2000|
|Peter|  3000|
+-----+------+



In [0]:
df2.select([col for col in df2.columns]).show()

+-----+------+
| Name|Salary|
+-----+------+
|  Bob|  1000|
| Mary|  2000|
|Peter|  3000|
+-----+------+



In [0]:
data1=[(1,'varun',2000,2),(2,'teja',3000,1),(3,'kamal',4000,4)]
schema1=['id','Name','salary','dep']

data2=[(1,'IT'),(2,'HR'),(3,'Payroll')]
schema2=['id','depname']

empdf1=spark.createDataFrame(data1,schema1)
depdf1=spark.createDataFrame(data2,schema2)
empdf1.show()
depdf1.show()

+---+-----+------+---+
| id| Name|salary|dep|
+---+-----+------+---+
|  1|varun|  2000|  2|
|  2| teja|  3000|  1|
|  3|kamal|  4000|  4|
+---+-----+------+---+

+---+-------+
| id|depname|
+---+-------+
|  1|     IT|
|  2|     HR|
|  3|Payroll|
+---+-------+



In [0]:
depdf1 = depdf1.withColumn("new_id", col("id"))
display(depdf1)


id,depname,new_id
1,IT,1
2,HR,2
3,Payroll,3


In [0]:
depdf1=depdf1.drop('new_id')
display(depdf1)

id,depname
1,IT
2,HR
3,Payroll


In [0]:
depdf1 = depdf1.withColumnRenamed("id", "dep_id")

display(depdf1)

dep_id,depname
1,IT
2,HR
3,Payroll


In [0]:
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'inner').show()

+---+-----+------+---+------+-------+
| id| Name|salary|dep|dep_id|depname|
+---+-----+------+---+------+-------+
|  2| teja|  3000|  1|     1|     IT|
|  1|varun|  2000|  2|     2|     HR|
+---+-----+------+---+------+-------+



In [0]:
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'inner').show()
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'left').show()
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'right').show()
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'full_outer').show()
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'left_semi').show()
empdf1.join(depdf1,empdf1.dep==depdf1.dep_id,'left_anti').show()

+---+-----+------+---+------+-------+
| id| Name|salary|dep|dep_id|depname|
+---+-----+------+---+------+-------+
|  2| teja|  3000|  1|     1|     IT|
|  1|varun|  2000|  2|     2|     HR|
+---+-----+------+---+------+-------+

+---+-----+------+---+------+-------+
| id| Name|salary|dep|dep_id|depname|
+---+-----+------+---+------+-------+
|  1|varun|  2000|  2|     2|     HR|
|  2| teja|  3000|  1|     1|     IT|
|  3|kamal|  4000|  4|  null|   null|
+---+-----+------+---+------+-------+

+----+-----+------+----+------+-------+
|  id| Name|salary| dep|dep_id|depname|
+----+-----+------+----+------+-------+
|   2| teja|  3000|   1|     1|     IT|
|   1|varun|  2000|   2|     2|     HR|
|null| null|  null|null|     3|Payroll|
+----+-----+------+----+------+-------+

+----+-----+------+----+------+-------+
|  id| Name|salary| dep|dep_id|depname|
+----+-----+------+----+------+-------+
|   2| teja|  3000|   1|     1|     IT|
|   1|varun|  2000|   2|     2|     HR|
|null| null|  null|null

In [0]:
columns = ["id", "name", "age", "salary", "department", "city", "state", "country", "phone", "email"]

# Create a list of data rows
data = [
    (1, "John", 30, None, "IT", "New York", "NY", "USA", None, "john@example.com"),
    (2, "Alice", None, 5000, "HR", "London", None, "UK", "1234567890", "alice@example.com"),
    (3, "Bob", 45, 8000, "Finance", "Sydney", "NSW", None, "9876543210", None),
    # Add more rows as needed
]

# Create the DataFrame
dffillna = spark.createDataFrame(data, columns)


In [0]:
display(dffillna)
dffillna.printSchema()

id,name,age,salary,department,city,state,country,phone,email
1,John,30.0,,IT,New York,NY,USA,,john@example.com
2,Alice,,5000.0,HR,London,,UK,1234567890.0,alice@example.com
3,Bob,45.0,8000.0,Finance,Sydney,NSW,,9876543210.0,


root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- salary: long (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- email: string (nullable = true)



In [0]:
dffillna.fillna("Unknown").show()


+---+-----+----+------+----------+--------+-------+-------+----------+-----------------+
| id| name| age|salary|department|    city|  state|country|     phone|            email|
+---+-----+----+------+----------+--------+-------+-------+----------+-----------------+
|  1| John|  30|  null|        IT|New York|     NY|    USA|   Unknown| john@example.com|
|  2|Alice|null|  5000|        HR|  London|Unknown|     UK|1234567890|alice@example.com|
|  3|  Bob|  45|  8000|   Finance|  Sydney|    NSW|Unknown|9876543210|          Unknown|
+---+-----+----+------+----------+--------+-------+-------+----------+-----------------+

+---+-----+----+------+----------+--------+-----+-------+----------+-----------------+
| id| name| age|salary|department|    city|state|country|     phone|            email|
+---+-----+----+------+----------+--------+-----+-------+----------+-----------------+
|  1| John|  30|  null|        IT|New York|   NY|    USA|      null| john@example.com|
|  2|Alice|null|  5000|     

In [0]:
dffillna.fillna({"salary": 0}).show()


dffillna.fill('unknown',['age','state']).show()

+---+-----+----+------+----------+--------+-----+-------+----------+-----------------+
| id| name| age|salary|department|    city|state|country|     phone|            email|
+---+-----+----+------+----------+--------+-----+-------+----------+-----------------+
|  1| John|  30|     0|        IT|New York|   NY|    USA|      null| john@example.com|
|  2|Alice|null|  5000|        HR|  London| null|     UK|1234567890|alice@example.com|
|  3|  Bob|  45|  8000|   Finance|  Sydney|  NSW|   null|9876543210|             null|
+---+-----+----+------+----------+--------+-----+-------+----------+-----------------+

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3378, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<command-290257265153432>", line 4, in <module>
    dffillna.fill('unknown',['age','state']).show()
  File "/databricks/spark/python/pyspark/instrumentation_utils.py", line 48, in wrapper
    res = func(*args, **kwargs)
  File "/databricks/spark/python/pyspark/sql/dataframe.py", line 2964, in __getattr__
    raise AttributeError(
AttributeError: 'DataFrame' object has no attribute 'fill'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 1997, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "/databricks/python/lib/python3.9/site-packages/IPython/core/ultratb.py", line 1112, in structured_traceback
    retur



In [0]:
dffillna.na.fill('unkown',['state','phone']).show()

+---+-----+----+------+----------+--------+------+-------+----------+-----------------+
| id| name| age|salary|department|    city| state|country|     phone|            email|
+---+-----+----+------+----------+--------+------+-------+----------+-----------------+
|  1| John|  30|  null|        IT|New York|    NY|    USA|    unkown| john@example.com|
|  2|Alice|null|  5000|        HR|  London|unkown|     UK|1234567890|alice@example.com|
|  3|  Bob|  45|  8000|   Finance|  Sydney|   NSW|   null|9876543210|             null|
+---+-----+----+------+----------+--------+------+-------+----------+-----------------+



In [0]:
listrows=dffillna.collect()
print(listrows)
print(listrows[0])

[Row(id=1, name='John', age=30, salary=None, department='IT', city='New York', state='NY', country='USA', phone=None, email='john@example.com'), Row(id=2, name='Alice', age=None, salary=5000, department='HR', city='London', state=None, country='UK', phone='1234567890', email='alice@example.com'), Row(id=3, name='Bob', age=45, salary=8000, department='Finance', city='Sydney', state='NSW', country=None, phone='9876543210', email=None)]
Row(id=1, name='John', age=30, salary=None, department='IT', city='New York', state='NY', country='USA', phone=None, email='john@example.com')


In [0]:
print(listrows[0][0])


1


In [0]:
display(depdf1)

dep_id,depname
1,IT
2,HR
3,Payroll


In [0]:
from pyspark.sql.functions import upper

def convertToUpper(depdf1):
    return depdf1.withColumn('depname',upper(depdf1.depname))

In [0]:
depdf1.show()

+------+-------+
|dep_id|depname|
+------+-------+
|     1|     IT|
|     2|     HR|
|     3|Payroll|
+------+-------+

