In [17]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('C:/Users/HP/Downloads/titanic.csv')  # Replace with your dataset path
print("Initial Dataset Shape:", df.shape)
df.head()



Initial Dataset Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#step 1
#Identifying Missing Data

This step checks for missing data in each column. 
Identifying where missing values exist is crucial for 
deciding on an appropriate handling strategy.

In [21]:
# Check for missing values in each column
missing_data = df.isnull().sum()
print("Missing Values per Column:\n", missing_data)

# Display columns with missing values
missing_columns = missing_data[missing_data > 0]
print("Columns with missing values:\n", missing_columns)

Missing Values per Column:
 PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
Columns with missing values:
 Age         177
Cabin       687
Embarked      2
dtype: int64


step 2:Handling Missing Data

In [22]:
# Impute missing values for 'Age' with mean
df['Age'].fillna(df['Age'].mean(), inplace=True)

# Impute missing values for 'Embarked' with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

# Drop 'Cabin' column due to a large number of missing values
if 'Cabin' in df.columns:
    df.drop(columns=['Cabin'], inplace=True)

# Check for missing data after imputation
print("Missing Values After Imputation:\n", df.isnull().sum())


Missing Values After Imputation:
 PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


Explanation: Here, we impute missing values for Age with the mean and for Embarked with the mode. The Cabin column, if it has many missing values, is dropped. After imputation, we recheck for any remaining missing data.

In [25]:
#step 3
#Removing Duplicate Records

In [26]:
# Check for duplicates
duplicates = df.duplicated().sum()
print("Number of Duplicate Records:", duplicates)

# Remove duplicate records
df.drop_duplicates(inplace=True)
print("Dataset Shape After Removing Duplicates:", df.shape)


Number of Duplicate Records: 0
Dataset Shape After Removing Duplicates: (891, 11)


Explanation: We check for duplicate rows in the dataset and remove them if any exist. Duplicates can lead to biased analysis and need to be removed.

In [28]:
#step 4
#Detecting and Handling Outliers


In [29]:
# Detect and handle outliers in 'Age' using the IQR method
Q1 = df['Age'].quantile(0.25)
Q3 = df['Age'].quantile(0.75)
IQR = Q3 - Q1

# Define the bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
df = df[(df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)]
print("Dataset Shape After Removing Outliers:", df.shape)


Dataset Shape After Removing Outliers: (825, 11)


Explanation: Here, we use the IQR method to identify outliers in the Age column. Outliers are any values outside 1.5 times the IQR above the upper quartile or below the lower quartile. These rows are then removed.

In [30]:
#step 5
#Normalizing or Scaling Features


In [32]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler  # Add this line

# Load dataset
df = pd.read_csv(r'C:\Users\HP\Downloads\titanic.csv')  # Ensure this path is correct

# Normalizing 'Fare' and 'Age' columns using MinMaxScaler
scaler = MinMaxScaler()
df[['Fare', 'Age']] = scaler.fit_transform(df[['Fare', 'Age']])

# Display the first few rows to show normalization results
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex       Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  0.271174      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  0.472229      1   
2                             Heikkinen, Miss. Laina  female  0.321438      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  0.434531      1   
4                           Allen, Mr. William Henry    male  0.434531      0   

   Parch            Ticket      Fare Cabin Embarked  
0      0         A/5 21171  0.014151   NaN        S  
1      0          PC 17599  0.139136   C85        C  
2      0  STON/O2. 3101282  0.015469   NaN        S  
3      0            113803  0.103644  C123        S  
4      0            373450

Explanation: To standardize the values, we scale the Fare and Age columns using Min-Max scaling, which maps the values between 0 and 1.

In [33]:
print("Final Dataset Shape:", df.shape)
df.head()


Final Dataset Shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,0.271174,1,0,A/5 21171,0.014151,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,0.472229,1,0,PC 17599,0.139136,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,0.321438,0,0,STON/O2. 3101282,0.015469,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,0.434531,1,0,113803,0.103644,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,0.434531,0,0,373450,0.015713,,S
