In [1]:
!pip install pyspark
from pyspark.sql import SparkSession

# Create Spark session
spark = SparkSession.builder.appName("FileTypesExample").getOrCreate()




In [4]:
# Read the plain-text file (example.txt must already be uploaded at this path).
# The text source yields one row per line in a single string column "value".
txt_df = spark.read.format("text").load("/example.txt")

# Schema as a tree
txt_df.printSchema()

# Full rows, without truncating long lines
txt_df.show(truncate=False)

# (column name, type) pairs
print("TXT Schema:", txt_df.dtypes)


root
 |-- value: string (nullable = true)

+---------------------------------------------------------------------------+
|value                                                                      |
+---------------------------------------------------------------------------+
|Spark SQL was first released in Spark 1.0 (May, 2014).                     |
|Initial committed by Michael Armbrust & Reynold Xin from Databricks.       |
|Spark introduces a programming module for structured data processing called|
|Spark SQL.                                                                 |
+---------------------------------------------------------------------------+

TXT Schema: [('value', 'string')]


In [5]:
# Read the CSV file (example.csv must already be uploaded at this path).
# header=True takes column names from the first line; inferSchema=True lets
# Spark derive column types from the data instead of reading all as strings.
csv_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/example.csv")
)

# Schema as a tree
csv_df.printSchema()

# First rows of the table
csv_df.show()

# (column name, type) pairs
print("CSV Schema:", csv_df.dtypes)


root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- City: string (nullable = true)

+-------+---+------+-----------+
|   Name|Age|Gender|       City|
+-------+---+------+-----------+
|  Alice| 34|  Male|   New York|
|    Bob| 45|  Male|Los Angeles|
|Shreeja| 29|Female|  Bengaluru|
|Lavanya| 25|Female|       Pune|
+-------+---+------+-----------+

CSV Schema: [('Name', 'string'), ('Age', 'int'), ('Gender', 'string'), ('City', 'string')]


In [15]:
# Create a small sample file with one JSON object per line, which is the
# layout the JSON read in the following cell consumes.
with open("/employee.json", "w") as f:
    f.writelines(
        (
            '{"name": "Alice", "age": 30, "city": "New York"}\n',
            '{"name": "Bob", "age": 25, "city": "Los Angeles"}\n',
        )
    )

In [16]:
# Read the sample JSON file; the schema is inferred from the records.
json_df = spark.read.format("json").load("/employee.json")

# Schema as a tree
json_df.printSchema()

# Rows of the table
json_df.show()

# (column name, type) pairs
print("JSON Schema:", json_df.dtypes)


root
 |-- age: long (nullable = true)
 |-- city: string (nullable = true)
 |-- name: string (nullable = true)

+---+-----------+-----+
|age|       city| name|
+---+-----------+-----+
| 30|   New York|Alice|
| 25|Los Angeles|  Bob|
+---+-----------+-----+

JSON Schema: [('age', 'bigint'), ('city', 'string'), ('name', 'string')]
