Perform the following operations using Python on a suitable data set:
counting the unique values in the data, inspecting the format (data type) of
each column, converting variable data types (e.g., from long to short and
vice versa), identifying missing values, and filling in the missing values.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the passenger records into a DataFrame.
# 'Titanic.csv' is resolved relative to the current working directory.
df = pd.read_csv('Titanic.csv')

# Banner, then the first five rows as a quick sanity check of the load.
print("--- Data Loaded ---", df.head(), sep="\n")

--- Data Loaded ---
   PassengerId  Survived  Pclass  \
0          892         0       3   
1          893         1       3   
2          894         0       2   
3          895         0       3   
4          896         1       3   

                                           Name     Sex   Age  SibSp  Parch  \
0                              Kelly, Mr. James    male  34.5      0      0   
1              Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                     Myles, Mr. Thomas Francis    male  62.0      0      0   
3                              Wirz, Mr. Albert    male  27.0      0      0   
4  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   

    Ticket     Fare Cabin Embarked  
0   330911   7.8292   NaN        Q  
1   363272   7.0000   NaN        S  
2   240276   9.6875   NaN        Q  
3   315154   8.6625   NaN        S  
4  3101298  12.2875   NaN        S  


In [3]:
# Report the storage format (dtype) pandas inferred for every column.
print("--- Column Data Types ---", df.dtypes, sep="\n")

--- Column Data Types ---
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [4]:
# Frequency of each distinct value in one categorical column ('Pclass').
print(
    "--- Count of Unique Values in 'Pclass' ---",
    df['Pclass'].value_counts(),
    sep="\n",
)

# How many distinct values each column of the frame holds.
print(
    "\n--- Number of Unique Values per Column ---",
    df.nunique(),
    sep="\n",
)

--- Count of Unique Values in 'Pclass' ---
Pclass
3    218
1    107
2     93
Name: count, dtype: int64

--- Number of Unique Values per Column ---
PassengerId    418
Survived         2
Pclass           3
Name           418
Sex              2
Age             79
SibSp            7
Parch            8
Ticket         363
Fare           169
Cabin           76
Embarked         3
dtype: int64


In [5]:
# Per-column tally of missing (NaN/Null) entries: flag each cell, then sum
# the flags down every column.
missing_values = df.isna().sum()
print("--- Missing Values Count ---", missing_values, sep="\n")

--- Missing Values Count ---
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [6]:
# Numeric gaps are imputed from each column's own statistic:
# the mean for Age and the median for Fare.
age_mean = df['Age'].mean()
fare_median = df['Fare'].median()

df['Age'] = df['Age'].fillna(value=age_mean)
df['Fare'] = df['Fare'].fillna(value=fare_median)

# Cabin is categorical, so its gaps receive a placeholder string instead.
df['Cabin'] = df['Cabin'].fillna(value='Unknown')

# Re-count the gaps per column to confirm the fills took effect.
print("--- Missing Values After Filling ---", df.isna().sum(), sep="\n")

--- Missing Values After Filling ---
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [7]:
# Example: Convert 'PassengerId' from int64 (Long) to int16 (Short) to save memory
print(f"Original 'PassengerId' Type: {df['PassengerId'].dtype}")

# Narrowing an integer column with astype() is not range-checked: a value
# outside the int16 limits would be stored as a wrapped-around number instead
# of raising. Verify that every id fits before converting.
int16_limits = np.iinfo('int16')
if not df['PassengerId'].between(int16_limits.min, int16_limits.max).all():
    raise ValueError(
        "'PassengerId' contains values outside the int16 range "
        f"[{int16_limits.min}, {int16_limits.max}]; keep a wider integer dtype."
    )

# Converting to int16
df['PassengerId'] = df['PassengerId'].astype('int16')

print(f"Converted 'PassengerId' Type: {df['PassengerId'].dtype}")

# Example: Convert 'Survived' (0/1) to Boolean
print(f"Original 'Survived' Type: {df['Survived'].dtype}")
# 0 becomes False; any non-zero value becomes True.
df['Survived'] = df['Survived'].astype('bool')
print(f"Converted 'Survived' Type: {df['Survived'].dtype}")

Original 'PassengerId' Type: int64
Converted 'PassengerId' Type: int16
Original 'Survived' Type: int64
Converted 'Survived' Type: bool
