# Download train.csv from the Kaggle Titanic dataset:
https://www.kaggle.com/competitions/titanic/overview

In [1]:
# Pandas Practice Exercises with Titanic Dataset
import pandas as pd
import numpy as np

In [2]:
# Read the Titanic training data (train.csv, resolved relative to the working
# directory) into a DataFrame and show it as the cell output.
df = pd.read_csv(filepath_or_buffer="train.csv")
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:

# Exercise 1: Display the first 5 rows of the dataset
# head() is given the row count explicitly rather than relying on its default.
first_five_rows = df.head(5)
# Assert: exactly five rows must have been selected
assert len(first_five_rows) == 5, (
    f"Expected 5 rows, but got {first_five_rows.shape[0]}"
)

In [4]:
# Exercise 2: Calculate the mean age of passengers
mean_age = df['Age'].mean()
# Assert: the answer must match the reference value within float tolerance
expected_mean_age = df['Age'].mean()
assert np.isclose(mean_age, expected_mean_age), (
    f"Expected {expected_mean_age}, but got {mean_age}"
)


In [5]:

# Exercise 3: Find the number of missing values in the 'Cabin' column
# isna() flags each missing entry; summing the boolean flags counts them.
missing_cabin = df['Cabin'].isna().sum()
# Assert: the count must equal the reference count
expected_missing_cabin = df['Cabin'].isnull().sum()
assert missing_cabin == expected_missing_cabin, (
    f"Expected {expected_missing_cabin}, but got {missing_cabin}"
)


In [6]:

# Exercise 4: Filter the DataFrame to include only female passengers
# Boolean-mask selection keeps the rows whose 'Sex' value is exactly 'female'.
female_df = df[df['Sex'] == 'female']  # Filter DataFrame
# Assert
# unique() returns a NumPy array, and comparing an array to a list with `==` is
# element-wise. Using that result as the assert condition raises "truth value
# of an array ... is ambiguous" as soon as more than one distinct value is
# present. Converting to a plain list first makes the comparison a single bool,
# so a wrong filter fails with the intended message instead.
assert female_df['Sex'].unique().tolist() == ['female'], "DataFrame contains non-female passengers"


In [7]:

# Exercise 5: Calculate the survival rate (percentage) of passengers
# The mean of the 'Survived' column is scaled by 100 to express it as a percentage.
survival_rate = df['Survived'].mean() * 100
# Assert: the answer must match the reference percentage within float tolerance
expected_survival_rate = (df['Survived'].mean()) * 100
assert np.isclose(survival_rate, expected_survival_rate), (
    f"Expected {expected_survival_rate}, but got {survival_rate}"
)


In [8]:

# Exercise 6: Create a new column 'FamilySize' as the sum of 'SibSp' and 'Parch'
# Note: this adds the column to `df` itself, so later cells see it.
df['FamilySize'] = df['SibSp'].add(df['Parch'])
# Assert: the stored column must equal the row-wise sum of the two inputs
expected_family_size = df['SibSp'] + df['Parch']
assert df['FamilySize'].equals(expected_family_size), (
    "Column 'FamilySize' not created correctly"
)


In [9]:

# Exercise 7: Find the most common embarkation point (i.e., the value that appears most frequently in 'Embarked')
# mode() returns a Series of the most frequent value(s); the entry labelled 0 is taken.
most_common_embarkation = df['Embarked'].mode().loc[0]
# Assert: the answer must equal the reference value
expected_common_embarkation = df['Embarked'].mode()[0]
assert most_common_embarkation == expected_common_embarkation, (
    f"Expected {expected_common_embarkation}, but got {most_common_embarkation}"
)


In [10]:

# Exercise 8: Calculate the mean fare for each class (Pclass)
# Group rows by class, average the fare within each group, and store the
# result as a {Pclass: mean fare} dict.
mean_fare_by_class = (
    df.groupby(by='Pclass')['Fare']
    .mean()
    .to_dict()
)
# Assert: the dict must equal the reference dict
expected_mean_fare_by_class = df.groupby('Pclass')['Fare'].mean().to_dict()
assert mean_fare_by_class == expected_mean_fare_by_class, (
    f"Expected {expected_mean_fare_by_class}, but got {mean_fare_by_class}"
)


In [11]:

# Exercise 9: Sort the DataFrame by 'Age' in ascending order
# sort_values returns a new frame; `df` itself keeps its original row order.
sorted_by_age_df = df.sort_values('Age', ascending=True)
# Assert: the passenger names must appear in the same order as in the reference sort
expected_sorted_names_by_age = df.sort_values(by='Age')['Name'].tolist()
assert sorted_by_age_df['Name'].tolist() == expected_sorted_names_by_age, (
    "DataFrame not sorted correctly by age"
)


In [12]:

# Exercise 10: Reset the index of the sorted DataFrame
# drop=True discards the old index instead of keeping it as an extra column.
reset_sorted_by_age_df = sorted_by_age_df.reset_index(drop=True)  # Reset index
# Assert
# The expected index is derived from the frame that was actually reset, not
# from `df`: Exercise 15 later appends a row to `df`, so measuring `df` here
# would make this check fail when the cell is re-run after that one.
assert reset_sorted_by_age_df.index.tolist() == list(range(sorted_by_age_df.shape[0])), f"Index not reset correctly"


In [13]:

# Exercise 11: Find the unique values in the 'Pclass' column
unique_pclass_values = df['Pclass'].unique()
# Assert: array_equal compares the two arrays as a whole and yields a single bool
expected_unique_pclass_values = df['Pclass'].unique()
assert np.array_equal(unique_pclass_values, expected_unique_pclass_values), (
    f"Expected {expected_unique_pclass_values}, but got {unique_pclass_values}"
)


In [14]:
# Exercise 12: Filter passengers older than 30
# Row selection through .loc with a boolean mask; the comparison is strict (> 30).
older_than_30_df = df.loc[df['Age'] > 30]
# Assert: the youngest remaining passenger must still be older than 30
assert older_than_30_df['Age'].min() > 30, (
    "DataFrame contains passengers 30 or younger"
)

In [15]:
# Exercise 13: Create a pivot table with 'Pclass' as rows and 'Survived' as values, calculating the mean survival rate per class
pivot_table_survival = df.pivot_table(index='Pclass', values='Survived', aggfunc='mean')  # Create pivot table
# Expected result: the 'Survived' column of the reference pivot table as a {Pclass: rate} dict
expected_pivot_table_survival = df.pivot_table(values='Survived', index='Pclass', aggfunc='mean').to_dict()['Survived']
# Assert
# The failure message reports the same dict form that is being compared, so the
# "expected" and "got" sides can be read against each other directly.
assert pivot_table_survival['Survived'].to_dict() == expected_pivot_table_survival, f"Expected {expected_pivot_table_survival}, but got {pivot_table_survival['Survived'].to_dict()}"

In [16]:
# Exercise 14: Calculate the standard deviation of the 'Fare' column
fare_std = df['Fare'].std()
# Assert: the answer must match the reference value within float tolerance
expected_fare_std = df['Fare'].std()
assert np.isclose(fare_std, expected_fare_std), (
    f"Expected {expected_fare_std}, but got {fare_std}"
)

In [17]:
# Exercise 15: Add a row for a new passenger with the following details:
# Name: 'Test Passenger', Pclass: 3, Age: 25, SibSp: 0, Parch: 0, Ticket: '0000', Fare: 7.25, Cabin: 'E50', Embarked: 'S'
# 'Sex' and 'Survived' are not part of the spec above; 'male' and 0 are filled in here.
new_passenger = pd.DataFrame([{
    'PassengerId': df['PassengerId'].max() + 1,  # next id after the current maximum
    'Survived': 0,
    'Pclass': 3,
    'Name': 'Test Passenger',
    'Sex': 'male',
    'Age': 25,
    'SibSp': 0,
    'Parch': 0,
    'Ticket': '0000',
    'Fare': 7.25,
    'Cabin': 'E50',
    'Embarked': 'S'
}])
# 'FamilySize' was added to `df` in Exercise 6 but is not in the dict above.
# Without a value here, concat fills the new row's 'FamilySize' with NaN, which
# also turns the whole integer column into float. The column is only populated
# when `df` already has it, so this cell does not create it on its own.
if 'FamilySize' in df.columns:
    new_passenger['FamilySize'] = new_passenger['SibSp'] + new_passenger['Parch']
# NOTE: this cell rebinds `df` and is not idempotent: each re-run appends one more row.
df = pd.concat([df, new_passenger], ignore_index=True)  # Add new passenger
# Assert
assert df.iloc[-1]['Name'] == 'Test Passenger', "New passenger not added correctly"


In [18]:


# Exercise 17: Filter the DataFrame for passengers who paid a fare greater than 100
# Row selection through .loc with a boolean mask; the comparison is strict (> 100).
high_fare_df = df.loc[df['Fare'] > 100]
# Assert: the result must be non-empty and its cheapest fare must exceed 100
assert not high_fare_df.empty and high_fare_df['Fare'].min() > 100, (
    "DataFrame contains passengers who paid 100 or less"
)

# Exercise 18: Calculate the average age of passengers for each embarkation point
# Group rows by 'Embarked', average 'Age' within each group, and store the
# result as an {embarkation point: mean age} dict.
average_age_by_embarkation = (
    df.groupby(by='Embarked')['Age']
    .mean()
    .to_dict()
)
# Assert: the dict must equal the reference dict
expected_average_age_by_embarkation = df.groupby('Embarked')['Age'].mean().to_dict()
assert average_age_by_embarkation == expected_average_age_by_embarkation, (
    f"Expected {expected_average_age_by_embarkation}, but got {average_age_by_embarkation}"
)

# Exercise 19: Find the number of passengers in each class
# value_counts() tallies the rows per distinct 'Pclass' value.
passenger_count_by_class = (
    df['Pclass']
    .value_counts()
    .to_dict()
)
# Assert: the dict must equal the reference dict
expected_passenger_count_by_class = df['Pclass'].value_counts().to_dict()
assert passenger_count_by_class == expected_passenger_count_by_class, (
    f"Expected {expected_passenger_count_by_class}, but got {passenger_count_by_class}"
)

# Exercise 20: Create a new column 'AgeGroup' categorizing passengers as 'Child' (<18), 'Adult' (18-60), 'Senior' (>60)
def categorize_age(age):
    """Map a passenger age to an age-group label.

    Args:
        age: Age in years; may be missing (None or NaN).

    Returns:
        'Child' for ages below 18, 'Adult' for ages 18 through 60 inclusive,
        'Senior' for ages above 60, and 'Unknown' when the age is missing.
    """
    # A NaN age fails both comparisons below, so without this guard it would
    # fall through to the final branch and be labelled 'Senior'.
    if pd.isna(age):
        return 'Unknown'
    if age < 18:
        return 'Child'
    elif age <= 60:
        return 'Adult'
    else:
        return 'Senior'

# Label every passenger by running categorize_age over the 'Age' column
# element by element; the result is stored on `df` as a new column.
df['AgeGroup'] = df['Age'].map(categorize_age)
# Assert: the stored labels must match a fresh pass of categorize_age over 'Age'
expected_age_groups = df['Age'].apply(categorize_age).tolist()
assert df['AgeGroup'].tolist() == expected_age_groups, (
    "Column 'AgeGroup' not created correctly"
)

print("All exercises completed successfully.")

All exercises completed successfully.
