# 1. Implement a PySpark Script for Monte Carlo Simulations

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import random

# Create a Spark session
spark = SparkSession.builder.appName("MonteCarloSimulation").getOrCreate()

# Number of simulations
num_samples = 1000000

# Function to generate random points and count how many fall inside the unit circle
def monte_carlo_simulation(num_samples):
    inside_circle = 0
    for _ in range(num_samples):
        x = random.uniform(0, 1)
        y = random.uniform(0, 1)
        if x**2 + y**2 <= 1:
            inside_circle += 1
    return inside_circle

# Create an RDD with the number of simulations per partition
num_partitions = 10
samples_per_partition = num_samples // num_partitions
rdd = spark.sparkContext.parallelize(range(num_partitions))

# Run Monte Carlo simulations in parallel
results = rdd.map(lambda _: monte_carlo_simulation(samples_per_partition)).collect()

# Calculate π
total_inside_circle = sum(results)
estimated_pi = (total_inside_circle / num_samples) * 4

print(f"Estimated value of π: {estimated_pi}")

# Stop the Spark session
spark.stop()


24/10/07 10:02:32 WARN Utils: Your hostname, PGLab2 resolves to a loopback address: 127.0.1.1; using 172.16.57.83 instead (on interface enp1s0)
24/10/07 10:02:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/07 10:02:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/10/07 10:02:32 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

Estimated value of π: 3.139152


# 2. Define and Apply Probability Distributions in PySpark

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import numpy as np

# Create a Spark session
spark = SparkSession.builder.appName("MonteCarloWithDistributions").getOrCreate()

# Function to generate random parameters and run the simulation
def monte_carlo_with_distributions(num_samples):
    inside_circle = 0
    for _ in range(num_samples):
        # Generate random numbers with normal distribution for x and y
        x = np.random.normal(0.5, 0.1)  # Mean = 0.5, Std = 0.1
        y = np.random.normal(0.5, 0.1)
        
        # Check if the point is inside the unit circle
        if 0 <= x <= 1 and 0 <= y <= 1 and (x - 0.5)**2 + (y - 0.5)**2 <= (0.5)**2:
            inside_circle += 1
    return inside_circle

# Number of simulations
num_samples = 1000000
num_partitions = 10
samples_per_partition = num_samples // num_partitions

# Create an RDD with the number of simulations per partition
rdd = spark.sparkContext.parallelize(range(num_partitions))

# Run Monte Carlo simulations in parallel
results = rdd.map(lambda _: monte_carlo_with_distributions(samples_per_partition)).collect()

# Calculate π
total_inside_circle = sum(results)
estimated_pi = (total_inside_circle / num_samples) * 4

print(f"Estimated value of π with probability distributions: {estimated_pi}")

# Stop the Spark session
spark.stop()


24/10/07 10:03:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
                                                                                

Estimated value of π with probability distributions: 4.0
