In [1]:
data = """OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.10,5000
CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.20,-1500
CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.00,3000"""

with open("superstore.csv", "w") as file:
    file.write(data)


PART 1: Pandas DataFrame Operations

In [2]:
#1
import pandas as pd

# Read the CSV written by the first cell into a pandas DataFrame
df = pd.read_csv("superstore.csv")

# 2
# Report, in order: the column dtypes, the leading rows, and the (rows, columns) shape.
# print() is called with two positional arguments each time, exactly as before,
# so the separating space after each heading is preserved.
for heading, value in (
    ("Schema:\n", df.dtypes),
    ("\nFirst 5 rows:\n", df.head()),
    ("\nShape:", df.shape),
):
    print(heading, value)


Schema:
 OrderID         object
OrderDate       object
Customer        object
Segment         object
Region          object
Product         object
Category        object
SubCategory     object
Quantity         int64
UnitPrice        int64
Discount       float64
Profit           int64
dtype: object

First 5 rows:
    OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3 

In [3]:
#3
# Keep only the customer, product and profit columns (all rows)
wanted_columns = ['Customer', 'Product', 'Profit']
selected = df[wanted_columns]
print("Selected Columns:\n", selected)


Selected Columns:
   Customer   Product  Profit
0     Ravi    Laptop    5000
1    Priya   Printer    1800
2     Amit  Notebook     150
3    Anita     Table   -1500
4    Divya     Phone    3000


In [4]:
#4
# Build each condition as a named boolean mask, then keep rows where both hold
high_profit = df['Profit'] > 2000
no_discount = df['Discount'] == 0.00
filtered = df[high_profit & no_discount]
print("Filtered Data (Profit > 2000 and Discount == 0):\n", filtered)


Filtered Data (Profit > 2000 and Discount == 0):
    OrderID   OrderDate Customer   Segment Region Product    Category  \
4  CA-1005  2023-02-05    Divya  Consumer  South   Phone  Technology   

  SubCategory  Quantity  UnitPrice  Discount  Profit  
4      Phones         2      20000       0.0    3000  


In [5]:
#5
# Order the rows from highest to lowest Profit; df itself is left untouched
sorted_df = df.sort_values(
    by=['Profit'],
    ascending=[False],
)
print("Sorted by Profit (Descending):\n", sorted_df)


Sorted by Profit (Descending):
    OrderID   OrderDate Customer      Segment Region   Product  \
0  CA-1001  2023-01-15     Ravi     Consumer  South    Laptop   
4  CA-1005  2023-02-05    Divya     Consumer  South     Phone   
1  CA-1002  2023-02-20    Priya    Corporate  North   Printer   
2  CA-1003  2023-01-25     Amit     Consumer   East  Notebook   
3  CA-1004  2023-03-01    Anita  Home Office   West     Table   

          Category  SubCategory  Quantity  UnitPrice  Discount  Profit  
0       Technology    Computers         1      55000      0.10    5000  
4       Technology       Phones         2      20000      0.00    3000  
1       Technology  Peripherals         2      12000      0.15    1800  
2  Office Supplies        Paper         3        200      0.05     150  
3        Furniture       Tables         1      18000      0.20   -1500  


In [6]:
#6
# Per category: total Profit and mean Discount.
# Named aggregation keeps the output column names 'Profit' and 'Discount';
# reset_index() turns the Category group key back into an ordinary column.
by_category = df.groupby('Category')
grouped = by_category.agg(
    Profit=('Profit', 'sum'),
    Discount=('Discount', 'mean'),
).reset_index()

print("GroupBy Category → Total Profit & Avg Discount:\n", grouped)


GroupBy Category → Total Profit & Avg Discount:
           Category  Profit  Discount
0        Furniture   -1500  0.200000
1  Office Supplies     150  0.050000
2       Technology    9800  0.083333


In [7]:
#7
# Derive the line total (units ordered x price per unit) as a new column on df
quantity, unit_price = df['Quantity'], df['UnitPrice']
df['TotalPrice'] = quantity * unit_price

price_columns = ['Quantity', 'UnitPrice', 'TotalPrice']
print("New Column TotalPrice:\n", df[price_columns])


New Column TotalPrice:
    Quantity  UnitPrice  TotalPrice
0         1      55000       55000
1         2      12000       24000
2         3        200         600
3         1      18000       18000
4         2      20000       40000


In [8]:
#8
# Remove the SubCategory column.
# errors='ignore' keeps this cell re-runnable: because df is rebound here, a second
# execution would otherwise raise KeyError since SubCategory is already gone.
df = df.drop(columns=['SubCategory'], errors='ignore')
print("Columns after dropping SubCategory:\n", df.columns)


Columns after dropping SubCategory:
 Index(['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product',
       'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit',
       'TotalPrice'],
      dtype='object')


In [11]:
#9
# Replace any missing Discount with a default of 0.10.
# Every row of the sample CSV written in the first cell carries a discount,
# so for this data the column comes out unchanged.
default_discount = 0.10
df['Discount'] = df['Discount'].fillna(value=default_discount)


In [12]:
#10
def classify(row):
    """Label a row by its 'Profit' value.

    Returns 'High' when Profit is above 4000, 'Medium' when it is above 0
    (up to and including 4000), and 'Low' otherwise.
    """
    profit = row['Profit']
    if profit > 4000:
        return 'High'
    # Anything left is at most 4000: positive profit is Medium, the rest is Low.
    return 'Medium' if profit > 0 else 'Low'

# Run classify once per row (axis='columns' is the same as axis=1) and store the label
df['ProfitLevel'] = df.apply(classify, axis='columns')

level_columns = ['Profit', 'ProfitLevel']
print("Profit Classification:\n", df[level_columns])


Profit Classification:
    Profit ProfitLevel
0    5000        High
1    1800      Medium
2     150      Medium
3   -1500         Low
4    3000      Medium


In [13]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the pyspark import in the next cell resolves against this install.
%pip install pyspark




PART 2: PySpark DataFrame Operations

In [14]:
#1
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Superstore")
    .getOrCreate()
)


In [15]:
#2
# Load the CSV: first line is the header, column types are inferred from the data
df_spark = spark.read.csv(
    "superstore.csv",
    header=True,
    inferSchema=True,
)

# Inspect the inferred schema, then the first 5 rows
df_spark.printSchema()
df_spark.show(5)


root
 |-- OrderID: string (nullable = true)
 |-- OrderDate: date (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- SubCategory: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: integer (nullable = true)

+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer|    Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+-----------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1001|2023-01-15|    Ravi|   Consumer| South|  Laptop|     Technology|  Computers|       1|    55000|     0.1|  5000|
|CA-1002|2023-02-20|   Priya|  

In [16]:
#3
# Project three columns, exposing Customer under the name "Client"
client_column = df_spark["Customer"].alias("Client")
df_selected = df_spark.select(client_column, "Product", "Profit")
df_selected.show()


+------+--------+------+
|Client| Product|Profit|
+------+--------+------+
|  Ravi|  Laptop|  5000|
| Priya| Printer|  1800|
|  Amit|Notebook|   150|
| Anita|   Table| -1500|
| Divya|   Phone|  3000|
+------+--------+------+



In [17]:
#4
# Consumer-segment orders whose profit is below 1000
is_consumer = col("Segment") == "Consumer"
low_profit = col("Profit") < 1000
filtered = df_spark.filter(is_consumer & low_profit)
filtered.show()


+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|OrderID| OrderDate|Customer| Segment|Region| Product|       Category|SubCategory|Quantity|UnitPrice|Discount|Profit|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+
|CA-1003|2023-01-25|    Amit|Consumer|  East|Notebook|Office Supplies|      Paper|       3|      200|    0.05|   150|
+-------+----------+--------+--------+------+--------+---------------+-----------+--------+---------+--------+------+



In [18]:
#5
from pyspark.sql.functions import avg

# Mean profit per region, reported under the column name "AverageProfit"
(
    df_spark
    .groupBy("Region")
    .agg(avg(col("Profit")).alias("AverageProfit"))
    .show()
)


+------+-------------+
|Region|AverageProfit|
+------+-------------+
| South|       4000.0|
|  East|        150.0|
|  West|      -1500.0|
| North|       1800.0|
+------+-------------+



In [19]:
#6
# Add the line total (Quantity x UnitPrice) as a new TotalPrice column
line_total = df_spark["Quantity"] * df_spark["UnitPrice"]
df_spark = df_spark.withColumn("TotalPrice", line_total)

df_spark.select("Quantity", "UnitPrice", "TotalPrice").show()


+--------+---------+----------+
|Quantity|UnitPrice|TotalPrice|
+--------+---------+----------+
|       1|    55000|     55000|
|       2|    12000|     24000|
|       3|      200|       600|
|       1|    18000|     18000|
|       2|    20000|     40000|
+--------+---------+----------+



In [20]:
#7
from pyspark.sql.functions import when

# Bucket each order by profit. The branches are tried in order:
#   Profit > 2000  -> "High"
#   Profit <= 0    -> "Loss"
#   anything else  -> "Medium"
profit = col("Profit")
profit_level = (
    when(profit > 2000, "High")
    .when(profit <= 0, "Loss")
    .otherwise("Medium")
)
df_spark = df_spark.withColumn("ProfitLevel", profit_level)

df_spark.select("Profit", "ProfitLevel").show()


+------+-----------+
|Profit|ProfitLevel|
+------+-----------+
|  5000|       High|
|  1800|     Medium|
|   150|     Medium|
| -1500|       Loss|
|  3000|       High|
+------+-----------+



In [21]:
#8
# Remove the SubCategory column and list what is left
column_to_remove = "SubCategory"
df_spark = df_spark.drop(column_to_remove)
print("Remaining Columns:\n", df_spark.columns)


Remaining Columns:
 ['OrderID', 'OrderDate', 'Customer', 'Segment', 'Region', 'Product', 'Category', 'Quantity', 'UnitPrice', 'Discount', 'Profit', 'TotalPrice', 'ProfitLevel']


In [23]:
#9
# Replace null Discount values with 0.10; other columns are not touched
df_spark = df_spark.fillna(0.10, subset=['Discount'])


In [24]:
#10
from pyspark.sql.functions import to_date, year, month

# Normalise OrderDate with the yyyy-MM-dd pattern, then derive the calendar year
# and month from it. Each withColumn sees the columns produced by the one before,
# so the chain gives the same result as three separate assignments.
df_spark = (
    df_spark
    .withColumn("OrderDate", to_date("OrderDate", "yyyy-MM-dd"))
    .withColumn("OrderYear", year("OrderDate"))
    .withColumn("OrderMonth", month("OrderDate"))
)

df_spark.select("OrderDate", "OrderYear", "OrderMonth").show()


+----------+---------+----------+
| OrderDate|OrderYear|OrderMonth|
+----------+---------+----------+
|2023-01-15|     2023|         1|
|2023-02-20|     2023|         2|
|2023-01-25|     2023|         1|
|2023-03-01|     2023|         3|
|2023-02-05|     2023|         2|
+----------+---------+----------+



PART 3: Dask DataFrame Operations

In [25]:
#1
# %pip installs into the running kernel's environment. The "dataframe" extra pulls in
# the dependencies that dask.dataframe (imported in the next cell) needs beyond core dask.
%pip install "dask[dataframe]"




In [26]:
#2
import dask.dataframe as dd

# Open the same CSV as a Dask DataFrame
csv_path = "superstore.csv"
df_dask = dd.read_csv(csv_path)

# Preview the leading rows (5 is head()'s default), returned like a pandas frame
df_dask.head(5)


Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
0,CA-1001,2023-01-15,Ravi,Consumer,South,Laptop,Technology,Computers,1,55000,0.1,5000
1,CA-1002,2023-02-20,Priya,Corporate,North,Printer,Technology,Peripherals,2,12000,0.15,1800
2,CA-1003,2023-01-25,Amit,Consumer,East,Notebook,Office Supplies,Paper,3,200,0.05,150
3,CA-1004,2023-03-01,Anita,Home Office,West,Table,Furniture,Tables,1,18000,0.2,-1500
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


In [27]:
#3
# Describe the per-category mean discount first, then call compute() to evaluate it
discount_by_category = df_dask.groupby("Category")["Discount"].mean()
avg_discount = discount_by_category.compute()
print("Average Discount by Category:\n", avg_discount)


Average Discount by Category:
 Category
Furniture          0.200000
Office Supplies    0.050000
Technology         0.083333
Name: Discount, dtype: float64


In [28]:
# Orders of more than one unit that also earned more than 2000 in profit
multi_unit = df_dask["Quantity"] > 1
high_profit = df_dask["Profit"] > 2000
filtered_dask = df_dask[multi_unit & high_profit]

# Evaluate the filter so the matching rows are displayed
filtered_dask.compute()


Unnamed: 0,OrderID,OrderDate,Customer,Segment,Region,Product,Category,SubCategory,Quantity,UnitPrice,Discount,Profit
4,CA-1005,2023-02-05,Divya,Consumer,South,Phone,Technology,Phones,2,20000,0.0,3000


In [29]:
# Export the filtered rows as a single CSV file, without the index column
filtered_dask.to_csv(
    "filtered_superstore.csv",
    index=False,
    single_file=True,
)


['/content/filtered_superstore.csv']

PART 4: JSON Handling

In [30]:
import json

# Flat description of each order: (OrderID, Name, Segment, Region, Profit)
_order_fields = [
    ("CA-1001", "Ravi", "Consumer", "South", 5000),
    ("CA-1002", "Priya", "Corporate", "North", 1800),
]

# Expand every tuple into the nested record layout: customer attributes live
# under "Customer", region and profit under "Details".
data = [
    {
        "OrderID": order_id,
        "Customer": {"Name": name, "Segment": segment},
        "Details": {"Region": region, "Profit": profit},
    }
    for order_id, name, segment, region, profit in _order_fields
]

# Write the records to orders.json, pretty-printed with a 4-space indent
with open("orders.json", mode="w") as json_out:
    json.dump(data, json_out, indent=4)


In [31]:
# orders.json holds one pretty-printed array spread over many lines,
# so the reader is switched to multi-line mode
df_json = spark.read.json("orders.json", multiLine=True)

# Show the inferred schema, including the nested Customer / Details structs
df_json.printSchema()


root
 |-- Customer: struct (nullable = true)
 |    |-- Name: string (nullable = true)
 |    |-- Segment: string (nullable = true)
 |-- Details: struct (nullable = true)
 |    |-- Profit: long (nullable = true)
 |    |-- Region: string (nullable = true)
 |-- OrderID: string (nullable = true)



In [32]:
# Flatten the nested structs: dotted paths reach into Customer and Details
flattened_fields = [
    "OrderID",
    "Customer.Name",
    "Customer.Segment",
    "Details.Region",
    "Details.Profit",
]
df_json.select(*flattened_fields).show()


+-------+-----+---------+------+------+
|OrderID| Name|  Segment|Region|Profit|
+-------+-----+---------+------+------+
|CA-1001| Ravi| Consumer| South|  5000|
|CA-1002|Priya|Corporate| North|  1800|
+-------+-----+---------+------+------+

