## Creating a DataFrame
A DataFrame is a distributed collection of data organized into rows and named columns, much like a table in a relational database.


In [0]:

# Build a flat (non-nested) DataFrame from in-memory Python tuples.

# Column names, in the same order as the values inside each row tuple.
# Only names are given here, so Spark infers every column's type from the data.
columns = ["firstname", "middlename", "lastname", "dob", "gender", "salary"]

# One tuple per row; note that middlename is an empty string for some rows.
data = [
    ('Aarav', 'Kumar', 'Patel', '1993-08-14', 'M', 5500),
    ('Diya', 'Rani', 'Sharma', '1998-03-22', 'F', 6200),
    ('Karan', '', 'Mehta', '1985-11-09', 'M', 7200),
    ('Isha', 'Vikram', 'Nair', '1990-07-30', 'F', 6800),
    ('Rahul', 'Dev', 'Singh', '2001-01-12', 'M', 4800),
    ('Priya', '', 'Menon', '1995-05-04', 'F', 7000),
]

df = spark.createDataFrame(data, columns)

# Print the column names and their inferred types as a tree
df.printSchema()


In [0]:
# show() is a Spark DataFrame method that prints the data as a plain-text table directly to the console.
# With no arguments it prints only the first 20 rows and truncates string values longer than 20 characters.

df.show() 

In [0]:
# display() is a Databricks notebook built-in (it is not part of PySpark).
# It renders the DataFrame in an interactive table UI instead of plain text.

display(df)

## Creating DataFrames with a nested schema

In [0]:
# Build a DataFrame whose first column is itself a struct (a nested record).

from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Each row is (name, dob, gender, salary); name is an inner
# (firstname, middlename, lastname) tuple that fills the nested struct.
dataDF = [
    (('Aarav', 'Kumar', 'Patel'), '1993-08-14', 'M', 5500),
    (('Diya', 'Rani', ''), '1998-03-22', 'F', 6200),
    (('Karan', '', 'Mehta'), '1985-11-09', 'M', 7200),
    (('Isha', 'Vikram', 'Nair'), '1990-07-30', 'F', 6800),
    (('Rahul', 'Dev', 'Singh'), '2001-01-12', 'M', 4800),
]

# Explicit schema: 'name' is a struct of three nullable string fields,
# followed by three nullable top-level columns.
schema = StructType([
    StructField('name', StructType([
        StructField(part, StringType(), True)
        for part in ('firstname', 'middlename', 'lastname')
    ])),
    StructField('dob', StringType(), True),
    StructField('gender', StringType(), True),
    StructField('salary', IntegerType(), True),
])

# NOTE: this rebinds df, replacing the flat DataFrame created earlier in the notebook.
df = spark.createDataFrame(dataDF, schema)
df.printSchema()

# truncate=False prints full cell values instead of cutting long strings short
df.show(truncate=False)
