In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=cf06b19155612431f54b5220faec1cdf4fd461bdec8c42a7adbea8b4433c0a56
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
#initialize sparkSession
spark =  SparkSession.builder\
    .appName("Pyspark notebook example")\
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x792c90adf0d0>


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
#Initialize SparkSession
spark = SparkSession.builder \
.appName("PySpark DataFrame Example") \
.getOrCreate()
#Sample data representing employees
data = [
("John Doe", "Engineering", 75000),
("Jane Smith", "Marketing", 60000),
("Sam Brown", "Engineering", 80000),
("Emily Davis", "HR", 50000),
("Michael Johnson", "Marketing", 70000),
]
#Define schema for DataFrame
columns= ["Name", "Department", "Salary"]
#Create DataFrame
df = spark.createDataFrame(data, schema=columns)
#Show the DataFrame
df.show()

+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|     Jane Smith|  Marketing| 60000|
|      Sam Brown|Engineering| 80000|
|    Emily Davis|         HR| 50000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:

# Filter: Select employees with a salary greater than 65,000
high_salary_df = df.filter (col ("Salary") > 65000)
print("Employees with Salary > 65,000:")
high_salary_df.show()

Employees with Salary > 65,000:
+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|      Sam Brown|Engineering| 80000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:
# Group by Department and calculate the average salary
avg_salary_df = df.groupBy("Department").avg("Salary")
print("Average Salary by Department:")
avg_salary_df.show()

Average Salary by Department:
+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    77500.0|
|  Marketing|    65000.0|
|         HR|    50000.0|
+-----------+-----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
#Initialize SparkSession
spark = SparkSession.builder\
.appName("Customer Transactions Analysis") \
.getOrCreate ()
# Sample data for customers
customers = [
(1, "Ravi", "Mumbai"),
(2, "Priya", "Delhi"),
(3, "Vijay", "Bangalore"),
(4, "Anita", "Chennai"),
(5, "Raj", "Hyderabad"),
]
# Sample data for transactions
transactions = [
(1, 1, 10000.50),
(2, 2, 20000.75),
(3, 1, 15000.25),
(4, 3, 30000.00),
(5, 2, 40000.50),
(6, 4, 25000.00),
(7, 5, 18000.75),
(8, 1, 5000.00),
]

#Define schema for DataFrames
customer_columns = ["CustomerID", "Name", "City"]
transaction_columns = ["TransactionID", "CustomerID", "Amount"]
#Create DataFrames
customer_df = spark.createDataFrame (customers, schema=customer_columns)
transaction_df = spark.createDataFrame (transactions, schema=transaction_columns)

#Show the DataFrames
print("Customers DataFrame:")
customer_df.show()
print("Transactions DataFrame:")
transaction_df.show()

Customers DataFrame:
+----------+-----+---------+
|CustomerID| Name|     City|
+----------+-----+---------+
|         1| Ravi|   Mumbai|
|         2|Priya|    Delhi|
|         3|Vijay|Bangalore|
|         4|Anita|  Chennai|
|         5|  Raj|Hyderabad|
+----------+-----+---------+

Transactions DataFrame:
+-------------+----------+--------+
|TransactionID|CustomerID|  Amount|
+-------------+----------+--------+
|            1|         1| 10000.5|
|            2|         2|20000.75|
|            3|         1|15000.25|
|            4|         3| 30000.0|
|            5|         2| 40000.5|
|            6|         4| 25000.0|
|            7|         5|18000.75|
|            8|         1|  5000.0|
+-------------+----------+--------+



In [None]:
#Join the DataFrames on CustomerID
customer_transactions_df = customer_df.join(transaction_df, on="CustomerID")
print("Customer Transactions DataFrame:")
customer_transactions_df.show()

#Calculate the total amount spent by each customer
total_spent_df = customer_transactions_df.groupBy("Name").sum("Amount").withColumnRenamed ("sum (Amount)", "TotalSpent")
print("Total Amount Spent by Each Customer:")
total_spent_df.show()

## Find customers who have spent more than ₹30,000
big_spenders_df = total_spent_df.filter (col ("TotalSpent") > 30000)
print("Customers Who Spent More Than ₹30,000:")
big_spenders_df.show()

## Count the number of transactions per customer
transactions_count_df = customer_transactions_df.groupBy("Name").count().withColumnRenamed ("count", "TransactionCount")
print("Number of Transactions Per Customer:")
transactions_count_df.show()

## Sort customers by total amount spent in descending order
sorted_spenders_df = total_spent_df.orderBy (col ("TotalSpent").desc())
print("Customers Sorted by Total Spent (Descending):")
sorted_spenders_df.show()

Customer Transactions DataFrame:
+----------+-----+---------+-------------+--------+
|CustomerID| Name|     City|TransactionID|  Amount|
+----------+-----+---------+-------------+--------+
|         1| Ravi|   Mumbai|            1| 10000.5|
|         1| Ravi|   Mumbai|            3|15000.25|
|         1| Ravi|   Mumbai|            8|  5000.0|
|         2|Priya|    Delhi|            2|20000.75|
|         2|Priya|    Delhi|            5| 40000.5|
|         3|Vijay|Bangalore|            4| 30000.0|
|         4|Anita|  Chennai|            6| 25000.0|
|         5|  Raj|Hyderabad|            7|18000.75|
+----------+-----+---------+-------------+--------+

Total Amount Spent by Each Customer:
+-----+-----------+
| Name|sum(Amount)|
+-----+-----------+
| Ravi|   30000.75|
|Priya|   60001.25|
|Vijay|    30000.0|
|Anita|    25000.0|
|  Raj|   18000.75|
+-----+-----------+



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `TotalSpent` cannot be resolved. Did you mean one of the following? [`Name`, `sum(Amount)`].;
'Filter ('TotalSpent > 30000)
+- Aggregate [Name#55], [Name#55, sum(Amount#62) AS sum(Amount)#226]
   +- Project [CustomerID#54L, Name#55, City#56, TransactionID#60L, Amount#62]
      +- Join Inner, (CustomerID#54L = CustomerID#61L)
         :- LogicalRDD [CustomerID#54L, Name#55, City#56], false
         +- LogicalRDD [TransactionID#60L, CustomerID#61L, Amount#62], false


In [None]:
#### **Exercise: Product Sales Analysis**
#### **Step 1: Create DataFrames**

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Product Sales Analysis") \
    .getOrCreate()

# Sample data for products
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sample data for sales transactions
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

#Define schema for DataFrames
product_columns = ["ProductID", "Name", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]
#Create DataFrames
product_df = spark.createDataFrame (products, schema=product_columns)
transaction_columns = ["TransactionID", "CustomerID", "Amount"]
#Create DataFrames
product_df = spark.createDataFrame (products, schema=product_columns)
sales_df = spark.createDataFrame (sales, schema=sales_columns)

#Show the DataFrames
print("Products DataFrame:")
product_df.show()
print("Sales DataFrame:")
sales_df.show()

Products DataFrame:
+---------+----------+-----------+-----+
|ProductID|      Name|   Category|Price|
+---------+----------+-----------+-----+
|        1|    Laptop|Electronics|50000|
|        2|Smartphone|Electronics|30000|
|        3|     Table|  Furniture|15000|
|        4|     Chair|  Furniture| 5000|
|        5|Headphones|Electronics| 2000|
+---------+----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+



In [None]:
#### **Step 2: Perform the Following Tasks**
#1. **Join the DataFrames:**

product_sales_df = product_df.join(sales_df, on="ProductID")
print("Product Sales DataFrame:")
product_sales_df.show()

Product Sales DataFrame:
+---------+----------+-----------+-----+------+--------+
|ProductID|      Name|   Category|Price|SaleID|Quantity|
+---------+----------+-----------+-----+------+--------+
|        1|    Laptop|Electronics|50000|     1|       2|
|        1|    Laptop|Electronics|50000|     4|       1|
|        2|Smartphone|Electronics|30000|     2|       1|
|        2|Smartphone|Electronics|30000|     6|       2|
|        3|     Table|  Furniture|15000|     3|       3|
|        3|     Table|  Furniture|15000|     8|       1|
|        4|     Chair|  Furniture| 5000|     5|       5|
|        5|Headphones|Electronics| 2000|     7|      10|
+---------+----------+-----------+-----+------+--------+



In [None]:
#2. **Calculate Total Sales Value:**
combined_df = product_sales_df.withColumn("TotalSalesValue", col("Price") * col("Quantity"))
print("Combined DataFrame with Total Sales Value:")
combined_df.show()

Combined DataFrame with Total Sales Value:
+---------+----------+-----------+-----+------+--------+---------------+
|ProductID|      Name|   Category|Price|SaleID|Quantity|TotalSalesValue|
+---------+----------+-----------+-----+------+--------+---------------+
|        1|    Laptop|Electronics|50000|     1|       2|         100000|
|        1|    Laptop|Electronics|50000|     4|       1|          50000|
|        2|Smartphone|Electronics|30000|     2|       1|          30000|
|        2|Smartphone|Electronics|30000|     6|       2|          60000|
|        3|     Table|  Furniture|15000|     3|       3|          45000|
|        3|     Table|  Furniture|15000|     8|       1|          15000|
|        4|     Chair|  Furniture| 5000|     5|       5|          25000|
|        5|Headphones|Electronics| 2000|     7|      10|          20000|
+---------+----------+-----------+-----+------+--------+---------------+



In [None]:
#3. **Find the Total Sales for Each Product Category:**
category_sales_df = combined_df.groupBy("Category").agg({"TotalSalesValue": "sum"}).withColumnRenamed("sum(TotalSalesValue)", "TotalCategorySales")
print("Total Sales for Each Product Category:")
category_sales_df.show()

Total Sales for Each Product Category:
+-----------+------------------+
|   Category|TotalCategorySales|
+-----------+------------------+
|Electronics|            260000|
|  Furniture|             85000|
+-----------+------------------+



In [None]:
#4. **Identify the Top-Selling Product:**
product_sales_df = combined_df.groupBy("ProductID", "Name").agg({"TotalSalesValue": "sum"}).withColumnRenamed("sum(TotalSalesValue)", "TotalProductSales")
top_selling_product_df = product_sales_df.orderBy(col("TotalProductSales").desc()).limit(1)
print("Top-Selling Product:")
top_selling_product_df.show()



Top-Selling Product:
+---------+------+-----------------+
|ProductID|  Name|TotalProductSales|
+---------+------+-----------------+
|        1|Laptop|           150000|
+---------+------+-----------------+



In [None]:
#5. **Sort the Products by Total Sales Value:**
sorted_products_df = combined_df.orderBy(col("TotalSalesValue").desc())
print("Products Sorted by Total Sales Value:")
sorted_products_df.show()

Products Sorted by Total Sales Value:
+---------+----------+-----------+-----+------+--------+---------------+
|ProductID|      Name|   Category|Price|SaleID|Quantity|TotalSalesValue|
+---------+----------+-----------+-----+------+--------+---------------+
|        1|    Laptop|Electronics|50000|     1|       2|         100000|
|        2|Smartphone|Electronics|30000|     6|       2|          60000|
|        1|    Laptop|Electronics|50000|     4|       1|          50000|
|        3|     Table|  Furniture|15000|     3|       3|          45000|
|        2|Smartphone|Electronics|30000|     2|       1|          30000|
|        4|     Chair|  Furniture| 5000|     5|       5|          25000|
|        5|Headphones|Electronics| 2000|     7|      10|          20000|
|        3|     Table|  Furniture|15000|     8|       1|          15000|
+---------+----------+-----------+-----+------+--------+---------------+



In [None]:
#6. **Count the Number of Sales for Each Product:**
sales_count_df = sales_df.groupBy("ProductID").count().withColumnRenamed("count", "NumberOfSales")
print("Number of Sales for Each Product:")
sales_count_df.show()


Number of Sales for Each Product:
+---------+-------------+
|ProductID|NumberOfSales|
+---------+-------------+
|        1|            2|
|        3|            2|
|        2|            2|
|        5|            1|
|        4|            1|
+---------+-------------+



In [None]:
#7. **Filter the Products with Total Sales Value Greater Than ₹50,000:**
product_sales_df = product_sales_df.withColumnRenamed("sum(TotalSalesValue)", "TotalProductSales")
filtered_products_df = product_sales_df.filter(col("TotalProductSales") > 50000)
print("Products with Total Sales Value > ₹50,000:")
filtered_products_df.show()

Products with Total Sales Value > ₹50,000:
+---------+----------+-----------------+
|ProductID|      Name|TotalProductSales|
+---------+----------+-----------------+
|        1|    Laptop|           150000|
|        2|Smartphone|            90000|
|        3|     Table|            60000|
+---------+----------+-----------------+



In [None]:
### **Exercise: Analyzing a Sample Sales Dataset Using PySpark**
### **Part 1: Dataset Preparation**

#### **Step 1: Generate the Sample Sales Dataset**
import pandas as pd
from datetime import datetime

# Sample sales data
data = {
       "TransactionID": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
       "CustomerID": [101, 102, 103, 101, 104, 102, 103, 104, 101, 105],
       "ProductID": [501, 502, 501, 503, 504, 502, 503, 504, 501, 505],
       "Quantity": [2, 1, 4, 3, 1, 2, 5, 1, 2, 1],
       "Price": [150.0, 250.0, 150.0, 300.0, 450.0, 250.0, 300.0, 450.0, 150.0, 550.0],
       "Date": [
           datetime(2024, 9, 1),
           datetime(2024, 9, 1),
           datetime(2024, 9, 2),
           datetime(2024, 9, 2),
           datetime(2024, 9, 3),
           datetime(2024, 9, 3),
           datetime(2024, 9, 4),
           datetime(2024, 9, 4),
           datetime(2024, 9, 5),
           datetime(2024, 9, 5)
       ]
   }

# Create a DataFrame
pandas_df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
pandas_df.to_csv('sales_data.csv', index=False)

print("Sample sales dataset has been created and saved as 'sales_data.csv'.")

#2. **Verify the Dataset:**
pandas_df = pd.read_csv('sales_data.csv')
print(pandas_df)


Sample sales dataset has been created and saved as 'sales_data.csv'.
   TransactionID  CustomerID  ProductID  Quantity  Price        Date
0              1         101        501         2  150.0  2024-09-01
1              2         102        502         1  250.0  2024-09-01
2              3         103        501         4  150.0  2024-09-02
3              4         101        503         3  300.0  2024-09-02
4              5         104        504         1  450.0  2024-09-03
5              6         102        502         2  250.0  2024-09-03
6              7         103        503         5  300.0  2024-09-04
7              8         104        504         1  450.0  2024-09-04
8              9         101        501         2  150.0  2024-09-05
9             10         105        505         1  550.0  2024-09-05


In [None]:
#### **Step 2: Load the Dataset into PySpark**

#1. **Initialize the SparkSession:**
spark = SparkSession.builder \
    .appName("Sales Dataset Analysis") \
    .getOrCreate()

#2. **Load the CSV File into a PySpark DataFrame:**
df = spark.read.csv('sales_data.csv', header=True, inferSchema=True)
#Display the first few rows
df.show(5)

+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows



In [None]:
#### **Step 3: Explore the Data**
#1. **Print the Schema:**
df.printSchema()

#2. **Show the First Few Rows:**
df.show(5)

#3. **Get Summary Statistics:**
df.describe("Quantity", "Price").show()

root
 |-- TransactionID: integer (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- ProductID: integer (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Date: date (nullable = true)

+-------------+----------+---------+--------+-----+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|
+-------------+----------+---------+--------+-----+----------+
|            1|       101|      501|       2|150.0|2024-09-01|
|            2|       102|      502|       1|250.0|2024-09-01|
|            3|       103|      501|       4|150.0|2024-09-02|
|            4|       101|      503|       3|300.0|2024-09-02|
|            5|       104|      504|       1|450.0|2024-09-03|
+-------------+----------+---------+--------+-----+----------+
only showing top 5 rows

+-------+-----------------+-----------------+
|summary|         Quantity|            Price|
+-------+-----------------+-----------------+
|  count|            

In [None]:
#### **Step 4: Perform Data Transformations and Analysis**
#1. **Calculate the Total Sales Value for Each Transaction:**
df = df.withColumn("TotalSales", col("Quantity") * col("Price"))
df.show()

#2. **Group By ProductID and Calculate Total Sales Per Product:**
df.groupBy("ProductID").sum("TotalSales").alias("TotalProductSales").show()

#3. **Identify the Top-Selling Product:**
from pyspark.sql.functions import desc
df.groupBy("ProductID").sum("TotalSales").alias("TotalProductSales").orderBy(desc("sum(TotalSales)")).show(1)

#4. **Calculate the Total Sales by Date:**
df.groupBy("Date").sum("TotalSales").alias("TotalSalesByDate").orderBy("Date").show()

#5. **Filter High-Value Transactions:**
df.filter(col("TotalSales") > 500).show()


+-------------+----------+---------+--------+-----+----------+---------------+----------+
|TransactionID|CustomerID|ProductID|Quantity|Price|      Date|TotalSalesValue|TotalSales|
+-------------+----------+---------+--------+-----+----------+---------------+----------+
|            1|       101|      501|       2|150.0|2024-09-01|          300.0|     300.0|
|            2|       102|      502|       1|250.0|2024-09-01|          250.0|     250.0|
|            3|       103|      501|       4|150.0|2024-09-02|          600.0|     600.0|
|            4|       101|      503|       3|300.0|2024-09-02|          900.0|     900.0|
|            5|       104|      504|       1|450.0|2024-09-03|          450.0|     450.0|
|            6|       102|      502|       2|250.0|2024-09-03|          500.0|     500.0|
|            7|       103|      503|       5|300.0|2024-09-04|         1500.0|    1500.0|
|            8|       104|      504|       1|450.0|2024-09-04|          450.0|     450.0|
|         

In [None]:
### **Additional Challenge (Optional):**
#1. **Identify Repeat Customers:**
df.groupBy("CustomerID").count().filter(col("count") > 1).show()

#2. **Calculate the Average Sale Price Per Product:**
df.groupBy("ProductID").avg("Price").alias("AvgPricePerProduct").show()



+----------+-----+
|CustomerID|count|
+----------+-----+
|       101|    3|
|       103|    2|
|       102|    2|
|       104|    2|
+----------+-----+

+---------+----------+
|ProductID|avg(Price)|
+---------+----------+
|      501|     150.0|
|      504|     450.0|
|      502|     250.0|
|      505|     550.0|
|      503|     300.0|
+---------+----------+



In [None]:
from pyspark.sql import SparkSession
# Initialize SparkSession
spark = SparkSession.builder \
.appName("RDD Transformation Example") \
.getOrCreate()

# Get the SparkContext from the SparkSession
sc = spark.sparkContext
print("Spark Session Created")

data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)

# Print the original RDD
print("Original RDD:", rdd.collect())

rdd2 =  rdd.map(lambda x: x* 2)

#Print the transformed RDD
print("RDD after map transformation (x2):", rdd2.collect())



Spark Session Created
Original RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
RDD after map transformation (x2): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


In [None]:
rdd3 = rdd2.filter(lambda x: x % 2 == 0)

#Print the filtered RDD
print("RDD after filter transformation (even numbers):", rdd3.collect())

RDD after filter transformation (even numbers): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


In [None]:
sentences = ["Hello world", "PySpark is great", "RDD transformations"]
rdd4 = sc.parallelize (sentences)
words_rdd = rdd4.flatMap(lambda sentence: sentence.split(" "))

# Print the flatMapped RDD
print("RDD after flatMap transformation (split into words):", words_rdd.collect())

RDD after flatMap transformation (split into words): ['Hello', 'world', 'PySpark', 'is', 'great', 'RDD', 'transformations']


In [None]:
results = rdd3.collect()
print(results)

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]


In [None]:
count =  rdd3.count()
print(f"Number of Elements: {count}")


Number of Elements: 10


In [None]:
total_sum = rdd.reduce(lambda x, y: x + y)
print(f"Total Sum: {total_sum}")

Total Sum: 55


In [None]:
### **Exercise: Working with Key-Value Pair RDDs in PySpark**
### **Step 1: Initialize Spark Context**

#1. **Initialize SparkSession and SparkContext:**
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("Key-Value Pair RDD Transformations") \
.getOrCreate()
sc = spark.sparkContext
print("Spark Session Created")



Spark Session Created


In [None]:
### **Step 2: Create and Explore the RDD**
#2. **Task 1: Create an RDD from the Sales Data**
sales_data = [
    ("ProductA", 100),
    ("ProductB", 150),
    ("ProductA", 200),
    ("ProductC", 300),
    ("ProductB", 250),
    ("ProductC", 100)
]

sales_rdd = sc.parallelize(sales_data)

# Print the first few elements of the RDD
print(sales_rdd.collect())



[('ProductA', 100), ('ProductB', 150), ('ProductA', 200), ('ProductC', 300), ('ProductB', 250), ('ProductC', 100)]


In [None]:
### **Step 3: Grouping and Aggregating Data**

#3. **Task 2: Group Data by Product Name**
grouped_sales_rdd = sales_rdd.groupByKey()

# To see the grouped data
grouped_sales = grouped_sales_rdd.mapValues(list).collect()
print("grouped sales:")
print(grouped_sales)

#4. **Task 3: Calculate Total Sales by Product**
total_sales_rdd = sales_rdd.reduceByKey(lambda a, b: a + b)

# Print the total sales for each product
print("Total Sales by Product:")
print(total_sales_rdd.collect())

#5. **Task 4: Sort Products by Total Sales**
sorted_sales_rdd = total_sales_rdd.sortBy(lambda x: x[1], ascending=False)

# Print the sorted list of products along with their sales amounts
print("Sorted Products by Total Sales:")
print(sorted_sales_rdd.collect())




grouped sales:
[('ProductA', [100, 200]), ('ProductB', [150, 250]), ('ProductC', [300, 100])]
Total Sales by Product:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]
Sorted Products by Total Sales:
[('ProductB', 400), ('ProductC', 400), ('ProductA', 300)]


In [None]:
### **Step 4: Additional Transformations**

#6. **Task 5: Filter Products with High Sales**
high_sales_rdd = total_sales_rdd.filter(lambda x: x[1] > 200)

# Print the products with high sales
print("Products with High Sales:")
print(high_sales_rdd.collect())

#7. **Task 6: Combine Regional Sales Data**
regional_sales_data = [
    ("ProductA", 50),
    ("ProductC", 150)
]

regional_sales_rdd = sc.parallelize(regional_sales_data)

combined_rdd = sales_rdd.union(regional_sales_rdd)
combined_total_sales_rdd = combined_rdd.reduceByKey(lambda a, b: a + b)

# Print the combined sales data
print("Combined Sales Data:")
print(combined_total_sales_rdd.collect())


Products with High Sales:
[('ProductA', 300), ('ProductB', 400), ('ProductC', 400)]
Combined Sales Data:
[('ProductA', 350), ('ProductC', 550), ('ProductB', 400)]


In [None]:
### **Step 5: Perform Actions on the RDD**

#8. **Task 7: Count the Number of Distinct Products**
distinct_products_count = sales_rdd.map(lambda x: x[0]).distinct().count()

# Print the count of distinct products
print("Number of Distinct Products:", distinct_products_count)

#9. **Task 8: Identify the Product with Maximum Sales**
total_sales_rdd = sales_rdd.reduceByKey(lambda x, y: x + y)
max_sales_product = total_sales_rdd.reduce(lambda a, b: a if a[1] > b[1] else b)

print(f"Product with maximum sales: {max_sales_product[0]} with sales amount: {max_sales_product[1]}")


Number of Distinct Products: 3
Product with maximum sales: ProductC with sales amount: 400


In [None]:
### **Challenge Task: Calculate the Average Sales per Product**

#10. *Challenge Task:**
#Calculate the average sales amount per product using the key-value pair RDD.
average_sales_per_product = total_sales_rdd.mapValues(lambda x: x / distinct_products_count).collect()

# Print the average sales per product
print("Average Sales per Product:")
print(average_sales_per_product)




Average Sales per Product:
[('ProductA', 100.0), ('ProductB', 133.33333333333334), ('ProductC', 133.33333333333334)]


In [None]:
# @title
#Create a DataFrames
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder.appName("DataFrameOperations").getOrCreate()

# Sample data
data = [
    (1, 'John Doe', 'New York', 28),
    (2, 'Jane Smith', 'Los Angeles', 34),
    (3, 'Sam Brown', 'Chicago', 22),
    (4, 'Lisa Ray', 'Houston', 45)
]

# Creating a DataFrame
columns = ['CustomerID', 'Name', 'City', 'Age']
df = spark.createDataFrame(data, columns)
df.show()

#Selecting, Renaming, Filtering Data in a Pandas DataFrame
# Selecting a single column
df.select('Name').show()

# Selecting multiple columns
df.select('Name', 'City').show()

# Renaming a column
df.withColumnRenamed('City', 'Location').show()

# Filtering data
df.filter(col('Age') > 30).show()

+----------+----------+-----------+---+
|CustomerID|      Name|       City|Age|
+----------+----------+-----------+---+
|         1|  John Doe|   New York| 28|
|         2|Jane Smith|Los Angeles| 34|
|         3| Sam Brown|    Chicago| 22|
|         4|  Lisa Ray|    Houston| 45|
+----------+----------+-----------+---+

+----------+
|      Name|
+----------+
|  John Doe|
|Jane Smith|
| Sam Brown|
|  Lisa Ray|
+----------+

+----------+-----------+
|      Name|       City|
+----------+-----------+
|  John Doe|   New York|
|Jane Smith|Los Angeles|
| Sam Brown|    Chicago|
|  Lisa Ray|    Houston|
+----------+-----------+

+----------+----------+-----------+---+
|CustomerID|      Name|   Location|Age|
+----------+----------+-----------+---+
|         1|  John Doe|   New York| 28|
|         2|Jane Smith|Los Angeles| 34|
|         3| Sam Brown|    Chicago| 22|
|         4|  Lisa Ray|    Houston| 45|
+----------+----------+-----------+---+

+----------+----------+-----------+---+
|CustomerID|

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Employee Data Analysis") \
    .getOrCreate()

# Sample employee data
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, 'Shalini', 'IT', 90000),
    (4, 'Sneha', 'HR', 50000),
    (5, 'Rahul', 'Finance', 60000),
    (6, 'Amit', 'IT', 55000)
]

# Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Create DataFrame
employee_df = spark.createDataFrame(data, columns)

# Show the DataFrame
employee_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+



In [None]:
#1. **Task 1: Filter Employees by Salary**
filtered_df = employee_df.filter(col('Salary') > 60000)
print("Employees with Salary > 60000:")
filtered_df.show()

#2. **Task 2: Calculate the Average Salary by Department**
#from pyspark.sql.functions import avg
avg_salary_df = employee_df.groupBy('Department').avg('Salary').alias('AverageSalary')
print("Average Salary by Department:")
avg_salary_df.show()

#3. **Task 3: Sort Employees by Salary**
sorted_salary_df = employee_df.orderBy(col("Salary").desc())
print("Employees sorted by salary:")
sorted_salary_df.show()

#4. **Task 4: Add a Bonus Column**
#from pyspark.sql.functions import expr
bonus_df = employee_df.withColumn("Bonus", col("Salary") * 0.10)
print("Employee DataFrame with Bonus Column:")
bonus_df.show()




Employees with Salary > 60000:
+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
+----------+------------+----------+------+

Average Salary by Department:
+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+

Employees sorted by salary:
+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
|         4|       Sneha|       

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
#Initialize Spark session
spark = SparkSession.builder\
.appName("Employee Data Handling") \
.getOrCreate()

#Sample employee data with null values
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, None, 'IT', 90000),
    (4, 'Sneha', 'HR', None),
    (5, 'Rahul', None, 60000),
    (6, 'Amit', 'IT', 55000)
]

#Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

#Create DataFrame
employee_df = spark.createDataFrame (data, columns)

# Show the DataFrame
employee_df.show()

#Fill null values in 'EmployeeName' and 'Department' with 'Unknown'
filled_df = employee_df.fillna({'EmployeeName': 'Unknown', 'Department': 'Unknown'})
filled_df.show()

#Drop rows where 'Salary' is null
dropped_null_salary_df= employee_df.dropna (subset=['Salary'])
dropped_null_salary_df.show()

#Fill null values in 'Salary' with 50000
salary_filled_df = employee_df.fillna({'Salary': 50000})
salary_filled_df.show()

#Check for null values in the entire DataFrame
null_counts = employee_df.select([col(c).isNull().alias(c) for c in employee_df.columns]).show()

#Replace all null values in the DataFrame with N/A
na_filled_df = employee_df.na.fill('N/A')
na_filled_df.show()


+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Unknown|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|   Unknown| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+-----

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
#Initialize a Spark session
spark = SparkSession.builder \
.appName ("Advanced DataFrame Operations") \
.getOrCreate()
# Create two sample DataFrames
datal = [
    (1, 'Arjun', 'IT', 75000, '2022-01-15'),
    (2, 'Vijay', 'Finance', 85000, '2022-03-12'),
    (3, 'Shalini', 'IT', 90000, '2021-06-30')
]
data2 = [
    (4, 'Sneha', 'HR', 50000, '2022-05-01'),
    (5, 'Rahul', 'Finance', 60000, '2022-08-20'),
    (6, 'Amit', 'IT', 55000, '2021-12-15')
]

#Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary', 'JoiningDate']

#Create DataFrames
employee_df1 = spark.createDataFrame (datal, columns)
employee_df2 =  spark.createDataFrame (data2, columns)

#Show the DataFrames
employee_df1.show()
employee_df2.show()

# Union of two DataFrames (removes duplicates)
union_df = employee_df1.union (employee_df2).dropDuplicates()
print("Union of DataFrames (without duplicates):")
union_df.show()

# Union of two DataFrames (includes duplicates)
union_all_df = employee_df1.union(employee_df2)
print("Union of DataFrames (with duplicates):")
union_all_df.show()

from pyspark.sql.window import Window
from pyspark.sql.functions import rank

#Define a window specification to rank employees by salary within each department
window_spec =  Window.partitionBy("Department").orderBy(col("Salary").desc())

#Add a rank column to the DataFrame
ranked_df  = union_all_df.withColumn("Rank", rank().over(window_spec))
print("DataFrame with ranks:")
ranked_df.show()

from pyspark.sql.functions import sum

#Define a window specification for cumulative sum of salaries within each department
window_spec_sum =  Window.partitionBy("Department").orderBy("JoiningDate").rowsBetween(Window.unboundedPreceding, Window. currentRow)

# Calculate the running total of salaries
running_total_df = union_all_df.withColumn ("RunningTotal", sum(col ("Salary")).over(window_spec_sum))
print("DataFrame with running total:")
running_total_df.show()

# Convert JoiningDate from string to date type
date_converted_df =  union_all_df.withColumn("JoiningDate",F.to_date(col("JoiningDate"), "yyyy-MM-dd").cast("date"))
print("DataFrame with converted JoiningDate:")
date_converted_df.show()

# Replace invalid dates with null
#date_converted_df = date_converted_df.fillna({'JoiningDate': None})
#date_converted_df.show()

# Calculate the number of years since joining
experience_df = date_converted_df.withColumn("YearsOfExperience", F.round (F.datediff (F.current_date(), col ("JoiningDate")) /
365, 2))
print("DataFrame with years of experience:")
experience_df.show()

#Add a new column for next evaluation date (one year after joining)
eval_date_df =  date_converted_df.withColumn("NextEvaluationDate", F.date_add(col("JoiningDate"), 365))
print("DataFrame with next evaluation date:")
eval_date_df.show()

#Calculate average salary per department
avg_salary_df = union_all_df.groupBy("Department").agg(F.avg("Salary").alias("AverageSalary"))
print("Average salary per department:")
avg_salary_df.show()

# Calculate the total number of employees
total_employees_df = union_all_df.agg (F.count("EmployeeID").alias ("TotalEmployees"))
print("Total number of employees:")
total_employees_df.show()

# Convert employee names to uppercase
upper_name_df = union_all_df.withColumn("EmployeeNameUpper", F.upper(col("EmployeeName")))
print("DataFrame with employee names in uppercase:")
upper_name_df.show()



+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
+----------+------------+----------+------+-----------+

+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+

Union of DataFrames (without duplicates):
+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|   

In [None]:
### Data Setup:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Advanced DataFrame Operations - Different Dataset") \
    .getOrCreate()

# Create two sample DataFrames for Product Sales
data1 = [
    (1, 'Product A', 'Electronics', 1200, '2022-05-10'),
    (2, 'Product B', 'Clothing', 500, '2022-07-15'),
    (3, 'Product C', 'Electronics', 1800, '2021-11-05')
]

data2 = [
    (4, 'Product D', 'Furniture', 3000, '2022-03-25'),
    (5, 'Product E', 'Clothing', 800, '2022-09-12'),
    (6, 'Product F', 'Electronics', 1500, '2021-10-19')
]

# Define schema (columns)
columns = ['ProductID', 'ProductName', 'Category', 'Price', 'SaleDate']

# Create DataFrames
sales_df1 = spark.createDataFrame(data1, columns)
sales_df2 = spark.createDataFrame(data2, columns)

#show the dataframes
sales_df1.show()
sales_df2.show()

### Tasks:

#1. **Union of DataFrames (removing duplicates)**:
union_df = sales_df1.union(sales_df2).dropDuplicates()
print("Union of DataFrames (without duplicates):")
union_df.show()

#2. **Union of DataFrames (including duplicates)**:
union_all_df = sales_df1.union(sales_df2)
print("Union of DataFrames (with duplicates):")
union_all_df.show()

#3. **Rank products by price within their category**:
window_spec = Window.partitionBy("Category").orderBy(F.desc("Price"))
ranked_df = union_df.withColumn("Rank", F.row_number().over(window_spec))
print("DataFrame with ranks:")
ranked_df.show()

#4. **Calculate cumulative price per category**:
cumulative_df = union_df.withColumn("CumulativePrice", F.sum("Price").over(window_spec))
print("DataFrame with cumulative price:")
cumulative_df.show()

#5. **Convert `SaleDate` from string to date type**:
converted_df = union_df.withColumn("SaleDate", F.to_date("SaleDate", "yyyy-MM-dd"))
print("DataFrame with converted SaleDate:")
converted_df.show()

#6. **Calculate the number of days since each sale**:
days_since_sale_df = converted_df.withColumn("DaysSinceSale", F.datediff(F.current_date(), "SaleDate"))
print("DataFrame with days since sale:")
days_since_sale_df.show()

#7. **Add a column for the next sale deadline**:
next_sale_deadline_df = converted_df.withColumn("NextSaleDeadline", F.date_add("SaleDate", 30))
print("DataFrame with next sale deadline:")
next_sale_deadline_df.show()

#8. **Calculate total revenue and average price per category**:
revenue_df = union_df.groupBy("Category").agg(
    F.sum("Price").alias("TotalRevenue"),
    F.avg("Price").alias("AveragePrice")
)
print("Total revenue and average price per category:")
revenue_df.show()

#9. **Convert all product names to lowercase**:
lower_case_df = union_df.withColumn("ProductNameLower", F.lower("ProductName"))
print("DataFrame with product names in lowercase:")
lower_case_df.show()













+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
|        2|  Product B|   Clothing|  500|2022-07-15|
|        3|  Product C|Electronics| 1800|2021-11-05|
+---------+-----------+-----------+-----+----------+

+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        4|  Product D|  Furniture| 3000|2022-03-25|
|        5|  Product E|   Clothing|  800|2022-09-12|
|        6|  Product F|Electronics| 1500|2021-10-19|
+---------+-----------+-----------+-----+----------+

Union of DataFrames (without duplicates):
+---------+-----------+-----------+-----+----------+
|ProductID|ProductName|   Category|Price|  SaleDate|
+---------+-----------+-----------+-----+----------+
|        1|  Product A|Electronics| 1200|2022-05-10|
| 

In [3]:
from pyspark.sql import SparkSession

#create a spark seesion
spark= SparkSession.builder \
.appName("DtaIntegestion") \
.getOrCreate()

In [6]:
csv_file_path = "/content/sample_data/people.csv.txt"
#Now you can read it with PySpark
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_csv.show()

+----+---+------+
|Name|Age|Gender|
+----+---+------+
|John| 28|  Male|
|Jane| 32|Female|
+----+---+------+



In [8]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the schema for the JSON file
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])

#Load the complex 350N file with the correct path
json_file_path = "/content/sample_data/sample.json.txt"

#Read the JSON file with schema
df_json_complex = spark.read.schema(schema).json(json_file_path)

#Read the file as text to inspect its contents
with open(json_file_path, 'r') as f:
  data = f.read()
  print(data)


[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


In [9]:
import pandas as pd

#Create a sample DataFrame
data = {
    "name": ["John", "Jane", "Mike", "Emily"],
    "age": [28, 32, 45, 231],
    "gender": ["Male", "Female", "Male", "Female"],
    "city": ["New York", "San Francisco", "Los Angeles", "Chicago"]
}

df = pd.DataFrame(data)

#Save the DataFrame to a CSV file in the Colal environment
csv_file_path="/content/sample_people.csv"
df.to_csv (csv_file_path, index=False)

#Confirm the file has been created
print (f"CSV file created at: {csv_file_path}")

from pyspark.sql import SparkSession

#Initialize Spark Session
spark = SparkSession.builder.appName("CreateViewExample").getOrCreate ()

#Load the CSV file into a PySpark DataFrame
df_people = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(csv_file_path)

#Show the DataFrame
df_people.show()


CSV file created at: /content/sample_people.csv
+-----+---+------+-------------+
| name|age|gender|         city|
+-----+---+------+-------------+
| John| 28|  Male|     New York|
| Jane| 32|Female|San Francisco|
| Mike| 45|  Male|  Los Angeles|
|Emily|231|Female|      Chicago|
+-----+---+------+-------------+



In [11]:
#Create a temporary view
df_people.createOrReplaceTempView("people_temp_view")

#Run an SQL query on the view
result_temp_view = spark.sql ("SELECT name, age, gender, city FROM people_temp_view WHERE age > 30")

# Show the result
result_temp_view.show()

#Create a global temporary view
df_people.createOrReplaceGlobalTempView("people_global_view")

#Query the global temporary view
result_global_view = spark.sql ("SELECT name, age, city FROM global_temp.people_global_view WHERE age < 30")

# Show the result
result_global_view.show()

#List all temporary views and tables
spark.catalog.listTables ()

#Drop the local temporary view
spark.catalog.dropTempView("people_temp_view")

#Drop the global temporary view
spark.catalog.dropGlobalTempView("people_global_view")

+-----+---+------+-------------+
| name|age|gender|         city|
+-----+---+------+-------------+
| Jane| 32|Female|San Francisco|
| Mike| 45|  Male|  Los Angeles|
|Emily|231|Female|      Chicago|
+-----+---+------+-------------+

+----+---+--------+
|name|age|    city|
+----+---+--------+
|John| 28|New York|
+----+---+--------+



True

In [None]:
# Create a new database in Spark SQL
spark.sql("CREATE DATABASE IF NOT EXISTS my_database")

# Use the created database
spark.sql("USE my_database")

# Verify that the database is being used
spark.sql("SHOW DATABASES").show()

In [2]:
import pandas as pd

# Create a sample CSV data
data = {
    "name": ["John", "Jane", "Mike", "Emily", "Alex"],
    "age": [28, 32, 45, 23, 36],
    "gender": ["Male", "Female", "Male", "Female", "Male"],
    "salary": [60000, 72000, 84000, 52000, 67000]
}

df = pd.DataFrame(data)

# Save the DataFrame as a CSV file
csv_file_path = "/content/sample_people.csv"
df.to_csv(csv_file_path, index=False)

# Confirm the CSV file is created
print(f"CSV file created at: {csv_file_path}")

from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("Employee Salary ETL").getOrCreate()

# Load CSV data
employee_df = spark.read.csv("sample_people.csv", header=True, inferSchema=True)
employee_df.show()

# Filter employees with age >= 30
filtered_df = employee_df.filter(employee_df.age >= 30)
print("Filtered DataFrame:")
filtered_df.show()

# Add a new column 'salary_with_bonus' (10% bonus on current salary)
from pyspark.sql.functions import col

transformed_df = filtered_df.withColumn("salary_with_bonus", col("salary") * 1.10)
print("Transformed DataFrame:")
transformed_df.show()

# Calculate average salary by gender
avg_salary_by_gender = transformed_df.groupBy("gender").avg("salary")
print("Average Salary by Gender:")
avg_salary_by_gender.show()

# Save the transformed data in Parquet format
transformed_df.write.parquet("transformed_sample_people.parquet")






CSV file created at: /content/sample_people.csv
+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+

Filtered DataFrame:
+----+---+------+------+
|name|age|gender|salary|
+----+---+------+------+
|Jane| 32|Female| 72000|
|Mike| 45|  Male| 84000|
|Alex| 36|  Male| 67000|
+----+---+------+------+

Transformed DataFrame:
+----+---+------+------+-----------------+
|name|age|gender|salary|salary_with_bonus|
+----+---+------+------+-----------------+
|Jane| 32|Female| 72000|          79200.0|
|Mike| 45|  Male| 84000|92400.00000000001|
|Alex| 36|  Male| 67000|          73700.0|
+----+---+------+------+-----------------+

Average Salary by Gender:
+------+-----------+
|gender|avg(salary)|
+------+-----------+
|Female|    72000.0|
|  Male|    75500.0|
+------+-----------+



In [6]:
#Full refresh: Load the entire dataset
df_sales = spark.read.format("CSV") \
.option("header", "true")\
.option("inferSchema", "true") \
.load("/content/sample_data/sales_data.csv.txt")


#Apply transformations (if necessary)
df_transformed  = df_sales.withColumn("total_sales", df_sales["quantity"] * df_sales["price"])

#Full refresh: Partition the data by 'date' and overwrite the existing data
output_path = "/content/sample_data/partitioned_data"
df_transformed.write.partitionBy("date").mode("overwrite").parquet(output_path)

#Verify partitioned data
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [7]:
#Full refresh: Load the entire dataset
df_sales = spark.read.format("CSV") \
.option("header", "true")\
.option("inferSchema", "true") \
.load("/content/sample_data/sales_data.csv.txt")


#Apply transformations (if necessary)
df_transformed  = df_sales.withColumn("total_sales", df_sales["quantity"] * df_sales["price"])

#Full refresh: Partition the data by 'date' and overwrite the existing data
output_path = "/content/sample_data/partitioned_data"
df_transformed.write.partitionBy("date").mode("append").parquet(output_path)

#Verify partitioned data
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|   

In [9]:
from pyspark.sql import functions as F

#Incremental load: Define the last ETL run timestamp (this should be tracked externally)
last_etl_run = '2024-09-01 00:00:00'

#Load only new or updated records since the last ETL run
df_incremental = spark.read.format("csv") \
.option("header", "true") \
.option("InferSchema", "true") \
.load("/content/sample_data/sales_data.csv.txt") \
.filter(F.col("updated_at") > last_etl_run)

#Apply transformations (if necessary)
df_transformed_incremental = df_incremental.withColumn("total_sales", df_incremental ["quantity"] * df_incremental ["price"])

#Incremental load: Append the new data to the existing partitioned dataset
output_path="/content/sample_data/partitioned_data"
df_transformed_incremental.write.partitionBy("date").mode("append").parquet (output_path)

#Verify partitioned data after incremental load
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()


+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|   

In [None]:
#widgets
# Install ipywidgets in Colab or Jupyter if needed
!pip install ipywidgets

In [22]:
from pyspark.sql import SparkSession
import ipywidgets as widgets
from IPython.display import display

# Step 1: Initialize a Spark session
spark = SparkSession.builder.appName("PySpark with Widgets Example").getOrCreate()

# Step 2: Create a simple DataFrame
data = [
    ("John", 28, "Male", 60000),
    ("Jane", 32, "Female", 72000),
    ("Mike", 45, "Male", 84000),
    ("Emily", 23, "Female", 52000),
    ("Alex", 36, "Male", 67000)
]

df = spark.createDataFrame(data, ["name", "age", "gender", "salary"])

# Show the DataFrame
df.show()

#step 3: create widgets
#Dropdown widget to select column for filtering
column_dropdown = widgets.Dropdown (
    options=["age", "salary"],
    value="age",
    description="Filter By:",
)

#Slider widget to choose a value for filtering
slider = widgets.IntSlider (
    value = 30,
    min=20,
    max=100,
    step=5,
    description="Threshold:",
    continuous_update=False
)

#Button to trigger filtering
button = widgets.Button(description="Apply Filter")

#Output area to show the results
output = widgets.Output()

#Display the widgets
display(column_dropdown, slider, button, output)

#Step 4: Define a function to apply the filtering based on widget inputs
def apply_filter(b):
  column = column_dropdown.value
  threshold = slider.value

# Clear previous output
output.clear_output()

#Filter the DataFrame based on widget values
df_filtered = df.filter(df[column] > threshold)

#Show the filtered DataFrame
with output:
  print (f"Filtering by {column} > {threshold}")
  df_filtered.show()

#Step 5: Attach the function to the button click event
button.on_click(apply_filter)

+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



Dropdown(description='Filter By:', options=('age', 'salary'), value='age')

IntSlider(value=30, continuous_update=False, description='Threshold:', min=20, step=5)

Button(description='Apply Filter', style=ButtonStyle())

Output()

NameError: name 'column' is not defined

In [2]:
### **Exercise: PySpark Data Transformations on Movie Data**

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_date, avg

# Initialize Spark Session
spark = SparkSession.builder.appName("MovieDataTransformations").getOrCreate()

### **Tasks**:

#1. **Load the Dataset**:
file_path = "/content/sample_data/movies.csv.txt"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the DataFrame
print("Initial DataFrame:")
df.show()

# *2. Filter Movies by Genre*
sci_fi_movies = df.filter(df.genre == "Sci-Fi")
print("Movies in the 'Sci-Fi' genre:")
sci_fi_movies.show()

# *3. Top-Rated Movies*
top_rated_movies = df.orderBy(col("rating").desc()).limit(3)
print("Top 3 highest-rated movies:")
top_rated_movies.show()

# *4. Movies Released After 2010*
df = df.withColumn("date", to_date(col("date"), "yyyy-MM-dd"))
movies_after_2010 = df.filter(year(col("date")) > 2010)
print("Movies released after 2010:")
movies_after_2010.show()

# *5. Calculate Average Box Office Collection by Genre*
avg_box_office_by_genre = df.groupBy("genre").agg(avg("box_office").alias("average_box_office"))
print("Average box office collection by genre:")
avg_box_office_by_genre.show()

# *6. Add a New Column for Box Office in Billions*
df = df.withColumn("box_office_in_billions", col("box_office") / 1e9)
print("DataFrame with box office in billions:")
df.show()

# *7. Sort Movies by Box Office Collection*
sorted_movies_by_box_office = df.orderBy(col("box_office").desc())
print("Movies sorted by box office collection (descending):")
sorted_movies_by_box_office.show()

# *8. Count the Number of Movies per Genre*
count_movies_per_genre = df.groupBy("genre").count()
print("Number of movies per genre:")
count_movies_per_genre.show()

# Stop Spark Session
spark.stop()

Initial DataFrame:
+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_office|      date|
+--------+-----------------+---------+------+----------+----------+
|       1|        Inception|   Sci-Fi|   8.8| 830000000|2010-07-16|
|       2|  The Dark Knight|   Action|   9.0|1004000000|2008-07-18|
|       3|     Interstellar|   Sci-Fi|   8.6| 677000000|2014-11-07|
|       4|Avengers: Endgame|   Action|   8.4|2797000000|2019-04-26|
|       5|    The Lion King|Animation|   8.5|1657000000|1994-06-15|
|       6|      Toy Story 4|Animation|   7.8|1073000000|2019-06-21|
|       7|        Frozen II|Animation|   7.0|1450000000|2019-11-22|
|       8|            Joker|    Drama|   8.5|1074000000|2019-10-04|
|       9|         Parasite|    Drama|   8.6| 258000000|2019-05-30|
+--------+-----------------+---------+------+----------+----------+

Movies in the 'Sci-Fi' genre:
+--------+------------+------+------+----------+----------+
|movie