<!-- use this command in cmd - spark-shell -->

In [5]:
from pyspark.sql import SparkSession
import seaborn as sns

# Create a SparkSession
spark = SparkSession.builder \
    .appName("TitanicAnalysis") \
    .getOrCreate()

# Load the Titanic dataset
titanic_df = spark.createDataFrame(sns.load_dataset("titanic"))

titanic_df = titanic_df.fillna({'Age': 0})

# Filter data for male passengers who died and remove null values from Age column
male_deceased = titanic_df.filter((titanic_df["Sex"] == "male") & (titanic_df["Survived"] == 0) & titanic_df["Age"].isNotNull())

# Check if there are any male passengers who died
male_deceased_count = male_deceased.count()

if male_deceased_count > 0:
    # Calculate the average age of male passengers who died
# Calculate the average age of male passengers who died
    male_deceased_age_avg = male_deceased.agg({"Age": "avg"}).collect()[0][0]
    print("Number of male passengers who died:", male_deceased_count)
    print("Average age of male passengers who died:", male_deceased_age_avg)
else:
    print("No male passengers found who died in the dataset.")
    
female_deceased_by_class = titanic_df.filter((titanic_df["Sex"] == "female") & (titanic_df["Survived"] == 0)).groupBy("Pclass").count()

# Display results
print("Number of deceased passengers in each class among females:")
female_deceased_by_class.show()
# Stop SparkSession
spark.stop()


Number of male passengers who died: 468
Average age of male passengers who died: 24.321581196581196
Number of deceased passengers in each class among females:
+------+-----+
|Pclass|count|
+------+-----+
|     3|   72|
|     2|    6|
|     1|    3|
+------+-----+



In [None]:
from pyspark.sql import SparkSession: Imports the SparkSession class from the pyspark.sql module. SparkSession is the entry point to Spark SQL functionality.
import seaborn as sns: Imports the Seaborn library, which is used for data visualization.
spark = SparkSession.builder \ .appName("TitanicAnalysis") \ .getOrCreate(): Creates a SparkSession named "TitanicAnalysis". appName sets the name of the application.
titanic_df = spark.createDataFrame(sns.load_dataset("titanic")): Loads the Titanic dataset using Seaborn's load_dataset function and converts it into a Spark DataFrame using createDataFrame.
titanic_df = titanic_df.fillna({'Age': 0}): Fills missing values in the "Age" column with 0.
male_deceased = titanic_df.filter((titanic_df["Sex"] == "male") & (titanic_df["Survived"] == 0) & titanic_df["Age"].isNotNull()): Filters the DataFrame to select male passengers who did not survive and whose age is not null.
male_deceased_count = male_deceased.count(): Counts the number of male passengers who did not survive.
if male_deceased_count > 0:: Checks if there are male passengers who did not survive.
male_deceased_age_avg = male_deceased.agg({"Age": "avg"}).collect()[0][0]: Calculates the average age of male passengers who did not survive using the agg function with the avg aggregate function.
print("Number of male passengers who died:", male_deceased_count): Prints the number of male passengers who did not survive.
print("Average age of male passengers who died:", male_deceased_age_avg): Prints the average age of male passengers who did not survive.
female_deceased_by_class = titanic_df.filter((titanic_df["Sex"] == "female") & (titanic_df["Survived"] == 0)).groupBy("Pclass").count(): Filters the DataFrame to select female passengers who did not survive, groups them by passenger class, and counts the number of passengers in each class.
print("Number of deceased passengers in each class among females:"): Prints a message.
female_deceased_by_class.show(): Displays the number of deceased passengers in each class among females.
spark.stop(): Stops the SparkSession, releasing the resources associated with it.
In summary, this code performs basic analysis on the Titanic dataset using PySpark. It calculates the number and average age of male passengers who did not survive, as well as the number of deceased passengers in each class among females.