# This notebook explains how to create DataFrames using PySpark

### Init Spark session

In [None]:
from notebook_data.common import init_spark

# Create the Spark handles used by every cell below.
# NOTE(review): `init_spark.create()` is a project helper not shown in this notebook;
# presumably it returns a (SparkSession, SparkContext) pair — confirm in notebook_data.common.
spark, sc = init_spark.create()

### Verify the Spark session

In [None]:
# Bare expression: the notebook renders the session object as this cell's output.
spark

In [None]:
# Read the laptop price CSV; the "header" option makes Spark take the column
# names from the first line of the file.
# NOTE(review): `df` is assigned again by the cached read in the next code cell.
df = (
    spark.read
    .option("header", True)
    .csv("w05/data/laptop_prices.csv")
)

## Create a Spark DataFrame from a CSV file
1. Option: enable `header` so the first line is used as the column names
2. Cache the DataFrame before use

In [None]:
# Read the CSV with the header option enabled and mark the resulting
# DataFrame for caching so that later cells can reuse it.
df = (
    spark.read.option("header", True)
    .csv("w05/data/laptop_prices.csv")
    .cache()
)

### Show the first 10 rows of the DataFrame

In [None]:
# Print the first 10 rows of the DataFrame as a text table.
df.show(n=10)

### Count the number of rows in the DataFrame

In [None]:
# Number of rows in `df`; shown as the cell output.
df.count()

### Check whether the DataFrame is cached

In [None]:
# `df` was created with `.cache()`, so this is expected to display True.
df.is_cached

### Explore the schema

In [None]:
# Print the column names and data types of `df` as a tree.
# NOTE(review): the read above sets only the "header" option, so no explicit
# schema was supplied — check the printed types before relying on them.
df.printSchema()

## Create a new DataFrame from an existing one

In [None]:
# Derive a new DataFrame holding only the rows whose Company column equals 'Apple'.
# The condition is passed as a SQL expression string.
df_apple = df.filter(condition="Company = 'Apple'")

### Explain the DataFrame to see how it will be executed

In [None]:
# Print the execution plan for `df_apple`; extended output is switched off.
df_apple.explain(False)

In [None]:
# Print the first 10 rows of the filtered DataFrame.
df_apple.show(n=10)

In [None]:
# Number of rows that passed the Company = 'Apple' filter.
df_apple.count()

In [None]:
# `.cache()` was called on `df`, not on the derived `df_apple`.
# NOTE(review): expected to display False for that reason — confirm by running.
df_apple.is_cached

## Create a DataFrame from a sequence / list
We need to prepare:
1. A list of items
2. A schema for the DataFrame

In [None]:
# 1000 single-field rows: (0,), (1,), ..., (999,).
# zip() over a single iterable yields 1-tuples, one per element.
l = list(zip(range(1000)))

The schema is given as a string: a comma-separated list of column definitions, each in the following format:
`<column name> <data type>`
<br />
For example:
- `id int, name string, phone_numbers array<int>`

In [None]:
# Build a DataFrame from the in-memory list `l`, with one integer column named "id"
# described by a schema string.
df_l = spark.createDataFrame(data=l, schema='id int')

In [None]:
# Print the execution plan for `df_l` with extended output switched on.
df_l.explain(True)

In [None]:
# Print the first 10 rows of the list-backed DataFrame.
df_l.show(n=10)

In [None]:
# `df_l` was created without a `.cache()` call.
# NOTE(review): expected to display False for that reason — confirm by running.
df_l.is_cached

### Another example with a different style of schema definition

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, LongType
import random

In [None]:
# Schema for the people records, built from (column name, data type) pairs.
# Every column is declared nullable (third StructField argument is True).
schema = StructType([
    StructField(column_name, data_type, True)
    for column_name, data_type in [
        ("id", IntegerType()),
        ("name", StringType()),
        ("phone_number", ArrayType(LongType())),
        ("age", IntegerType()),
        ("country", StringType()),
    ]
])

In [None]:
# Display the schema serialized as a JSON string.
schema.json()

In [None]:
# Display the schema in its compact single-line string form.
schema.simpleString()

In [None]:
# Generate 1000 synthetic person records as tuples in the column order
# (id, name, phone_number, age, country).
#
# A dedicated, seeded generator keeps the data identical on every
# Restart & Run All without touching the global `random` state.
rng = random.Random(42)

names = ["Alice", "Bob", "Charlie", "David", "Eva", "Frank", "Grace", "Hannah", "Ian", "Jack"]
countries = ["USA", "Canada", "UK", "Germany", "France"]  # built once, not on every iteration

data = []
# `person_id` (not `id`) so the builtin id() is not shadowed in the kernel namespace.
for person_id in range(1, 1001):  # ids run from 1 to 1000
    name = rng.choice(names)
    # 1 to 3 ten-digit phone numbers; the values exceed 32 bits, so they need a 64-bit column type
    phone_number = [rng.randint(1000000000, 9999999999) for _ in range(rng.randint(1, 3))]
    age = rng.randint(18, 70)  # age between 18 and 70, inclusive
    country = rng.choice(countries)
    data.append((person_id, name, phone_number, age, country))

In [None]:
# Preview only the first 10 records; displaying all 1000 tuples would bloat
# the notebook output and bury the narrative.
data[:10]

In [None]:
# Build a DataFrame from the generated records using the explicit StructType schema.
df_people = spark.createDataFrame(data=data, schema=schema)

In [None]:
# Print the first 10 rows of the people DataFrame.
df_people.show(n=10)

In [None]:
# Print the column names and data types of `df_people` as a tree; `df_people` was
# created with the explicit `schema` object.
df_people.printSchema()

In [None]:
# Print the execution plan for `df_people` with extended output switched on.
df_people.explain(True)

In [None]:
# `df_people` was created without a `.cache()` call.
# NOTE(review): expected to display False for that reason — confirm by running.
df_people.is_cached