# Worksheet 3 - Exploratory Data Analysis (Part II)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Build a small Titanic-like sample, written as one tuple per passenger
# (values listed in the same order as `column_names`).
column_names = ['Name', 'Age', 'Sex', 'Fare', 'Pclass', 'Survived', 'Embarked']
passenger_rows = [
    ('John Smith', 22, 'male', 7.25, 3, 0, 'S'),
    ('Jane Doe', 38, 'female', 71.83, 1, 1, 'C'),
    ('Emily Brown', 26, 'female', 8.05, 3, 1, 'S'),
    ('Michael Lee', 35, 'male', 53.1, 1, 1, 'S'),
    ('Anna Johnson', 17, 'female', 12.0, 2, 0, 'C'),
    ('Chris Evans', 45, 'male', 30.0, 2, 0, 'S'),
    ('Sophia Patel', 28, 'female', 23.45, 3, 1, 'S'),
    ('Tom Clark', np.nan, 'male', 80.0, 1, 1, 'C'),  # age is missing (NaN)
]

# Transpose the rows into the column-oriented dict that pd.DataFrame consumes.
data = {
    name: list(values)
    for name, values in zip(column_names, zip(*passenger_rows))
}

df = pd.DataFrame(data)
df

## Warm-Up Exercises: Sorting and Subsetting

In [None]:
# Problem 1 - Sorting
# Warm-up: keep only the Fare column. The list selector keeps the result a
# one-column DataFrame rather than a Series.
fare = df.loc[:, ['Fare']]
fare.head()

In [None]:
# Two-column view: passenger class alongside age.
class_age_columns = ['Pclass', 'Age']
class_age = df[class_age_columns]
class_age.head()

In [None]:
# Two-column view: survival flag alongside sex.
survived_gender = df.loc[:, ['Survived', 'Sex']]
survived_gender.head()

In [None]:
# Order passengers from youngest to oldest. The defaults are spelled out:
# ascending order, with rows whose Age is missing placed at the end.
sorted_by_age = df.sort_values('Age', ascending=True, na_position='last')
sorted_by_age

In [None]:
# Visualization: one bar per passenger showing the fare paid.
fig, ax = plt.subplots(figsize=(8, 4))
ax.bar(df['Name'], df['Fare'], color='skyblue')
ax.set_title('Fare by Passenger')
ax.set_xlabel('Passenger Name')
ax.set_ylabel('Fare')
# Tilt the names and anchor them at their right edge so they stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

### Problem 2 - Subsetting

In [None]:
# Passengers who paid a fare above 50.
fare_gt_50 = df[df['Fare'] > 50]
# `fare_gt_100` is kept only as a backward-compatible alias: its name does
# not match the cutoff of 50 that the filter applies.
fare_gt_100 = fare_gt_50
fare_gt_50

In [None]:
# Rows for first-class passengers only.
in_first_class = df['Pclass'].eq(1)
first_class = df.loc[in_first_class]
first_class

In [None]:
# Female passengers younger than 18. A missing Age compares as False, so
# passengers without an age are never selected here.
is_under_18 = df['Age'].lt(18)
is_female = df['Sex'].eq('female')
female_under_18 = df.loc[is_under_18 & is_female]
female_under_18

In [None]:
# Passengers whose embarkation port code is 'C' or 'S'.
wanted_ports = ['C', 'S']
boarded_at_wanted_port = df['Embarked'].isin(wanted_ports)
embarked_c_or_s = df.loc[boarded_at_wanted_port]
embarked_c_or_s

In [None]:
# Passengers travelling in first or second class.
upper_classes = [1, 2]
in_upper_class = df['Pclass'].isin(upper_classes)
first_second_class = df.loc[in_upper_class]
first_second_class

In [None]:
# Visualization: Passenger Class Distribution
# Pclass is a discrete label, so count the passengers in each class and draw
# one bar per class. A 3-bin histogram would treat the label as a continuous
# value: its bars sit between the class numbers, the x-axis gets fractional
# ticks, and the bins no longer line up with classes if one class is absent.
class_counts = df['Pclass'].value_counts().sort_index()

plt.figure(figsize=(6,4))
plt.bar(class_counts.index.astype(str), class_counts.values, color='orange', edgecolor='black')
plt.title('Passenger Class Distribution')
plt.xlabel('Class')
plt.ylabel('Count')
plt.show()

## EDA Practice Exercise - 1

In [None]:
# Impute missing ages with the median of the known ages so the ratio below
# is defined for every passenger.
median_age = df['Age'].median()
df['Age'] = df['Age'].fillna(median_age)

# Fare paid per year of age; keep passengers above 5, largest ratio first.
# NOTE(review): with the sample data defined at the top of this notebook the
# largest ratio appears to be about 2.86, so this filter returns an empty
# frame - confirm that 5 is the intended threshold.
df['fare_per_year'] = df['Fare'].div(df['Age'])
high_fare_age = df.loc[df['fare_per_year'].gt(5)]
high_fare_age_srt = high_fare_age.sort_values('fare_per_year', ascending=False)
result1 = high_fare_age_srt.loc[:, ['Name', 'fare_per_year']]
result1

In [None]:
# Visualization: scatter of fare against age, one point per passenger.
fig, ax = plt.subplots(figsize=(6, 4))
ax.scatter(df['Age'], df['Fare'], color='purple')
ax.set_title('Fare vs Age')
ax.set_xlabel('Age')
ax.set_ylabel('Fare')
ax.grid(True)
plt.show()

In [None]:
# Adult males ranked by fare per class.
# fare_per_class is the fare divided by the numeric class label.
df['fare_per_class'] = df['Fare'].div(df['Pclass'])

# Adult means 18 or older.
is_adult_male = df['Sex'].eq('male') & df['Age'].ge(18)
adult_males = df.loc[is_adult_male]
adult_males_srt = adult_males.sort_values('fare_per_class', ascending=False)
result2 = adult_males_srt.loc[:, ['Name', 'Age', 'fare_per_class']]
result2

In [None]:
# Visualization: Adult Males Fare per Class
plt.figure(figsize=(7,4))
plt.bar(result2['Name'], result2['fare_per_class'], color='green')
plt.title('Adult Males: Fare per Class')
plt.xlabel('Passenger Name')
plt.ylabel('Fare per Class')
# Anchor the rotated labels at their right edge so each name ends under its
# own bar (same tick settings as the "Fare by Passenger" chart).
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## Group-by Method Practice Exercise

In [None]:
# Percent of total fare revenue by class
# Sum the fares per class in a single grouped pass instead of one hand-written
# filter per class. Reindexing pins the order to 1st, 2nd, 3rd (the order the
# labels are listed in the pie chart) and yields 0 for a class with no passengers.
fare_by_class = df.groupby('Pclass')['Fare'].sum().reindex([1, 2, 3], fill_value=0.0)

# Plain Python floats keep the displayed list free of NumPy scalar wrappers.
total_fare = float(df['Fare'].sum())
first_fare, second_fare, third_fare = fare_by_class.tolist()

fare_list = [first_fare, second_fare, third_fare]
percent_fare = [round((x / total_fare) * 100, 2) for x in fare_list]
percent_fare

In [None]:
# Visualization: share of fare revenue contributed by each class.
class_labels = ['1st Class', '2nd Class', '3rd Class']
slice_colors = ['gold', 'lightblue', 'lightgreen']

fig, ax = plt.subplots(figsize=(5, 5))
ax.pie(percent_fare, labels=class_labels, autopct='%1.1f%%', colors=slice_colors)
ax.set_title('Fare Revenue by Class')
plt.show()

In [None]:
# Percent of passengers by age group
def categorize_age(age):
    """Map a numeric age to an age-group label.

    Parameters
    ----------
    age : int or float
        Age in years. May be missing (NaN or None).

    Returns
    -------
    str
        'child' for ages below 18, 'adult' for ages from 18 up to (but not
        including) 65, 'senior' for 65 and above, and 'unknown' when the
        age is missing.
    """
    # NaN fails every `<` comparison, so without this guard a missing age
    # would fall through to the last branch and be labelled 'senior'.
    if pd.isna(age):
        return 'unknown'
    if age < 18:
        return 'child'
    if age < 65:
        return 'adult'
    return 'senior'

# Label every passenger with an age group, then express each group's head
# count as a percentage of all passengers.
df['age_group'] = df['Age'].map(categorize_age)
age_counts = df['age_group'].value_counts()
total_passengers = len(df)
percent_by_age = age_counts.div(total_passengers).mul(100)
percent_by_age

In [None]:
# Visualization: percentage of passengers falling in each age group.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(percent_by_age.index, percent_by_age.values, color='teal')
ax.set_title('Passenger Distribution by Age Group')
ax.set_xlabel('Age Group')
ax.set_ylabel('Percentage')
plt.show()

