In [1]:
import pandas as pd

In [2]:
# 2. LOADING THE DATASET
# We'll use the Titanic dataset from Stanford University
data_url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(data_url)


In [3]:
# Print dataset information
print("=== DATASET INFORMATION ===")
print("Dataset Name: Titanic Passenger Data")
print(f"Source: {data_url}")
print("This dataset contains information about passengers aboard the Titanic")
print("It includes details like survival status, age, passenger class, gender, etc.")
print(f"Total records: {len(df)} passengers\n")

=== DATASET INFORMATION ===
Dataset Name: Titanic Passenger Data
Source: https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv
This dataset contains information about passengers aboard the Titanic
It includes details like survival status, age, passenger class, gender, etc.
Total records: 887 passengers



In [5]:
# 3. INITIAL DATA EXPLORATION
print("=== FIRST FEW ROWS ===")
print("Let's look at the first 3 rows to understand the data structure:")
print(df.head(3))
print("\n")

=== FIRST FEW ROWS ===
Let's look at the first 3 rows to understand the data structure:
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                        0   7.9250  




In [6]:
# 4. DATA PREPROCESSING
print("=== DATA QUALITY CHECK ===")
print("Checking for missing values in each column:")
print(df.isnull().sum())

print("\nBasic statistics for numerical columns:")
print("This shows count, mean, standard deviation, min/max values etc.")
print(df.describe())

print("\nChecking data types of each column:")
print(df.dtypes)

print(f"\nDataset shape (rows, columns): {df.shape}\n")

=== DATA QUALITY CHECK ===
Checking for missing values in each column:
Survived                   0
Pclass                     0
Name                       0
Sex                        0
Age                        0
Siblings/Spouses Aboard    0
Parents/Children Aboard    0
Fare                       0
dtype: int64

Basic statistics for numerical columns:
This shows count, mean, standard deviation, min/max values etc.
         Survived      Pclass         Age  Siblings/Spouses Aboard  \
count  887.000000  887.000000  887.000000               887.000000   
mean     0.385569    2.305524   29.471443                 0.525366   
std      0.487004    0.836662   14.121908                 1.104669   
min      0.000000    1.000000    0.420000                 0.000000   
25%      0.000000    2.000000   20.250000                 0.000000   
50%      0.000000    3.000000   28.000000                 0.000000   
75%      1.000000    3.000000   38.000000                 1.000000   
max      1.000000  

In [10]:
# 5. DATA CLEANING AND FORMATTING
print("=== DATA CLEANING ===")
# Convert passenger class to categorical type
print("Converting 'Pclass' (Passenger Class) to categorical data type...")
df['Pclass'] = df['Pclass'].astype('category')

# Handle missing age values
print("\nHandling missing age values:")
print(f"Number of missing age values before: {df['Age'].isnull().sum()}")
print("Filling missing ages with median age...")
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)
print(f"Median age used: {median_age} years")
print(f"Number of missing age values after: {df['Age'].isnull().sum()}\n")


=== DATA CLEANING ===
Converting 'Pclass' (Passenger Class) to categorical data type...

Handling missing age values:
Number of missing age values before: 0
Filling missing ages with median age...
Median age used: 28.0 years
Number of missing age values after: 0



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)


In [11]:
# 6. CATEGORICAL TO NUMERICAL CONVERSION
print("=== DATA TRANSFORMATION ===")
# Convert gender to numerical values
print("Converting 'Sex' column to numerical values (female=0, male=1)...")
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

# Create dummy variables for passenger class
print("\nCreating dummy variables for passenger class...")
print("This creates separate columns for each class (1st, 2nd, 3rd)")
pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
df = pd.concat([df, pclass_dummies], axis=1)

=== DATA TRANSFORMATION ===
Converting 'Sex' column to numerical values (female=0, male=1)...

Creating dummy variables for passenger class...
This creates separate columns for each class (1st, 2nd, 3rd)


In [12]:
# 7. FINAL DATA CHECK
print("\n=== FINAL DATA CHECK ===")
print("First 3 rows after all transformations:")
print(df.head(3))

print("\nUpdated data types:")
print(df.dtypes)

print("\n=== PROCESSING COMPLETE ===")
print("The dataset is now cleaned and ready for analysis!")
print(f"Final shape: {df.shape}")


=== FINAL DATA CHECK ===
First 3 rows after all transformations:
   Survived Pclass                                               Name  Sex  \
0         0      3                             Mr. Owen Harris Braund    1   
1         1      1  Mrs. John Bradley (Florence Briggs Thayer) Cum...    0   
2         1      3                              Miss. Laina Heikkinen    0   

    Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  Class_1  \
0  22.0                        1                        0   7.2500    False   
1  38.0                        1                        0  71.2833     True   
2  26.0                        0                        0   7.9250    False   

   Class_2  Class_3  
0    False     True  
1    False    False  
2    False     True  

Updated data types:
Survived                      int64
Pclass                     category
Name                         object
Sex                           int64
Age                         float64
Siblings/Spouse

In [13]:
# 1. IMPORTING LIBRARIES
# We need pandas for data manipulation and analysis
import pandas as pd

# 2. LOADING THE DATASET
# We'll use the Titanic dataset from Stanford University
data_url = "https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv"

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(data_url)

# Print dataset information
print("=== DATASET INFORMATION ===")
print("Dataset Name: Titanic Passenger Data")
print(f"Source: {data_url}")
print("This dataset contains information about passengers aboard the Titanic")
print("It includes details like survival status, age, passenger class, gender, etc.")
print(f"Total records: {len(df)} passengers\n")

# 3. INITIAL DATA EXPLORATION
print("=== FIRST FEW ROWS ===")
print("Let's look at the first 3 rows to understand the data structure:")
print(df.head(3))
print("\n")

# 4. DATA PREPROCESSING
print("=== DATA QUALITY CHECK ===")
print("Checking for missing values in each column:")
print(df.isnull().sum())

print("\nBasic statistics for numerical columns:")
print("This shows count, mean, standard deviation, min/max values etc.")
print(df.describe())

print("\nChecking data types of each column:")
print(df.dtypes)

print(f"\nDataset shape (rows, columns): {df.shape}\n")

# 5. DATA CLEANING AND FORMATTING
print("=== DATA CLEANING ===")
# Convert passenger class to categorical type
print("Converting 'Pclass' (Passenger Class) to categorical data type...")
df['Pclass'] = df['Pclass'].astype('category')

# Handle missing age values
print("\nHandling missing age values:")
print(f"Number of missing age values before: {df['Age'].isnull().sum()}")
print("Filling missing ages with median age...")
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)
print(f"Median age used: {median_age:.1f} years")
print(f"Number of missing age values after: {df['Age'].isnull().sum()}\n")

# 6. CATEGORICAL TO NUMERICAL CONVERSION
print("=== DATA TRANSFORMATION ===")
# Convert gender to numerical values
print("Converting 'Sex' column to numerical values (female=0, male=1)...")
df['Sex'] = df['Sex'].map({'female': 0, 'male': 1})

# Create dummy variables for passenger class
print("\nCreating dummy variables for passenger class...")
print("This creates separate columns for each class (1st, 2nd, 3rd)")
pclass_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
df = pd.concat([df, pclass_dummies], axis=1)

# 7. FINAL DATA CHECK
print("\n=== FINAL DATA CHECK ===")
print("First 3 rows after all transformations:")
print(df.head(3))

print("\nUpdated data types:")
print(df.dtypes)

print("\n=== PROCESSING COMPLETE ===")
print("The dataset is now cleaned and ready for analysis!")
print(f"Final shape: {df.shape}")

=== DATASET INFORMATION ===
Dataset Name: Titanic Passenger Data
Source: https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv
This dataset contains information about passengers aboard the Titanic
It includes details like survival status, age, passenger class, gender, etc.
Total records: 887 passengers

=== FIRST FEW ROWS ===
Let's look at the first 3 rows to understand the data structure:
   Survived  Pclass                                               Name  \
0         0       3                             Mr. Owen Harris Braund   
1         1       1  Mrs. John Bradley (Florence Briggs Thayer) Cum...   
2         1       3                              Miss. Laina Heikkinen   

      Sex   Age  Siblings/Spouses Aboard  Parents/Children Aboard     Fare  
0    male  22.0                        1                        0   7.2500  
1  female  38.0                        1                        0  71.2833  
2  female  26.0                        0                

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(median_age, inplace=True)
