In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=dbcd08a3b6abd940b09f05efe2c10c7e53499d0e912a0d5cc4ac64df3e80f56d
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [2]:
from pyspark.sql import SparkSession
# Create a Spark session
spark = SparkSession.builder \
.appName("DataIngestion") \
.getOrCreate()

upload csv and json files


In [5]:
csv_file_path = "/content/sample_data/people.csv"

# Now you can read it with PySpark

df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_csv.show()

+----+----+-------+
|Name| Age| Gender|
+----+----+-------+
|John|  28|   Male|
|Jane|  32| Female|
+----+----+-------+



In [6]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the schema for the JSON file
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])


# load the complex json file with the correct path
json_file_path = "/content/sample_data/sample.json"

df_json_complex = spark.read.schema(schema).json(json_file_path)

with open(json_file_path, "r") as f:
  data = f.read()
  print(data)

[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


Temp View and Global Temp View


In [7]:
import pandas as pd
data = {
    "name": ["John", "Jane", "Mike", "Emily"],
    "age": [28, 32, 45, 23],
    "gender": ["Male", "Female", "Male", "Female"],
    "city": ["New York", "San Francisco", "Los Angeles", "Chicago"]
}

df = pd.DataFrame(data)

csv_file_path = "/content/sample_people.csv"
df.to_csv(csv_file_path, index=False)

print("csv file is created ")

csv file is created 


In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("CreateViewExample").getOrCreate()
df_people = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(csv_file_path)
df_people.show()

+-----+---+------+-------------+
| name|age|gender|         city|
+-----+---+------+-------------+
| John| 28|  Male|     New York|
| Jane| 32|Female|San Francisco|
| Mike| 45|  Male|  Los Angeles|
|Emily| 23|Female|      Chicago|
+-----+---+------+-------------+



In [9]:
df_people.createOrReplaceTempView("people_temp_view")

In [10]:
#run an sql query on the view
result_temp_view = spark.sql("SELECT name, age, gender, city FROM people_temp_view WHERE age > 30")
result_temp_view.show()

+----+---+------+-------------+
|name|age|gender|         city|
+----+---+------+-------------+
|Jane| 32|Female|San Francisco|
|Mike| 45|  Male|  Los Angeles|
+----+---+------+-------------+



In [11]:
df_people.createOrReplaceGlobalTempView("people_global_view")
result_global_view = spark.sql("SELECT name, age, city FROM global_temp.people_global_view WHERE age < 30")
result_global_view.show()

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
| John| 28|New York|
|Emily| 23| Chicago|
+-----+---+--------+



In [12]:
spark.catalog.listTables()

spark.catalog.dropTempView("people_temp_view")
spark.catalog.dropGlobalTempView("people_global_view")

True