# 1. Data Understanding 

In [5]:
import pandas as pd 
from seaborn import load_dataset

In [6]:
# load dataset 
data = load_dataset('titanic')

In [10]:
# Get basic information about dataset 
print("data set info:")
print(data.info()) 
#datatypes,missing values ,etc
    

data set info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB
None


In [12]:
# Display summary statics for numerical columns 
print("\n Summary statics")
print(data.describe())


 Summary statics
         survived      pclass         age       sibsp       parch        fare
count  891.000000  891.000000  714.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.699118    0.523008    0.381594   32.204208
std      0.486592    0.836071   14.526497    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.420000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   20.125000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   28.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   38.000000    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200


In [13]:
# Display the first few rows of the dataset 
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# 2. Data Cleaning

In [16]:
# check for missing values in dataset 
missing_values = data.isnull().sum()
print("Missing values in Each Column:")
print(missing_values[missing_values > 0])


Missing values in Each Column:
age            177
embarked         2
deck           688
embark_town      2
dtype: int64


In [23]:
# Impute missing values un 'age' with the median 
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
data['age'] = imputer.fit_transform(data[['age']])

# Drops rows where 'embarked' has missing values 
data.dropna(subset=['embarked'],inplace=True)
data.dropna(subset=['deck'],inplace=True)



# verify that missing values are handled 
print("Missing values after imputaion ")
print(data.isnull().sum())

Missing values after imputaion 
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64


In [26]:
#  Removing Duplicates
#remove duplicate rows 
before = data.shape[0]
data = data.drop_duplicates()
after = data.shape[0]
print(f"Removed {before-after} duplicate rows")


Removed 0 duplicate rows


In [27]:
# Outlier Detection and Treatment (using IQR method)
# Deteact outlier using the IQR method for the age column 
Q1 = data['age'].quantile(0.25)
Q3 = data['age'].quantile(0.75)
IQR = Q3 - Q1

# Filter oout outliers 
outliers = data[(data['age']< (Q1 - 1.5 *IQR)) | data['age'] > (Q3 + 1.5*IQR)]
print(f"Number of Outlier detected, {len(outliers)} ")

# IQR (INTER QUARTLE RANGE ) - Measures the spread of the middle 50% of data.


Number of Outlier detected, 0 


# 3.data Transformation 

In [31]:
# 1. Scaling and noamalization 

In [32]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler, RobustScaler 

# Min-Max Scaling for 'age'
min_max_scaler = MinMaxScaler()
data['age_scaled'] = min_max_scaler.fit_transform(data[['age']])

In [35]:
# Standarization of fare 
standard_scaler = StandardScaler()
data['fare_standarized'] = standard_scaler.fit_transform(data[['fare']])


In [36]:
# Robust Scaling to reduce the effect of outliers on 'fare'
robust_scaler = RobustScaler()
data['fare_robust'] = robust_scaler.fit_transform(data[['fare']])

In [38]:
# 2. Encoding categorical data

In [39]:
# Label Encoding
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
LE = LabelEncoder()
data['sex_encoded'] = LE.fit_transform(data['sex']) # Turns male into 1 and Female :0

In [42]:
# One Hot Encoding
embarked_dummies = pd.get_dummies(data['embarked'],prefix='embarked')
data = pd.concat([data,embarked_dummies],axis=1)
# Display the data 
print(data[['sex','sex_encoded','embarked_C','embarked_Q','embarked_S']].head())

       sex  sex_encoded  embarked_C  embarked_C  embarked_Q  embarked_Q  \
1   female            0        True        True       False       False   
3   female            0       False       False       False       False   
6     male            1       False       False       False       False   
10  female            0       False       False       False       False   
11  female            0       False       False       False       False   

    embarked_S  embarked_S  
1        False       False  
3         True        True  
6         True        True  
10        True        True  
11        True        True  


In [44]:
# 3. feature Transformation 


In [49]:
from sklearn.preprocessing import FunctionTransformer

# Log tranform on 'fare' to reduce skewnesss
log_transformer = FunctionTransformer(np.log1p,validate=True)


# left 


# 4. Feature Engineering 

In [53]:
# 1.create a new feature 
data['family_size'] = data['sibsp'] + data['parch'] + 1 

print(data[['sibsp','parch','family_size']].head())

    sibsp  parch  family_size
1       1      0            2
3       1      0            2
6       0      0            1
10      1      1            3
11      0      0            1


In [54]:
# 2.Extracting features from text
data['title'] = data['name'].str.extract('([A-Za-z]+)\\.',expand=False)


# count teh occurecnce of each title


KeyError: 'name'

In [55]:
data

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,...,embarked_C,embarked_Q,embarked_S,embarked_C.1,embarked_Q.1,embarked_S.1,fare_log,age_reciprocal,age_sqrt,family_size
1,1,1,female,38.0,1,0,71.2833,C,First,woman,...,True,False,False,True,False,False,4.280593,0.026316,6.164414,2
3,1,1,female,35.0,1,0,53.1000,S,First,woman,...,False,False,True,False,False,True,3.990834,0.028571,5.916080,2
6,0,1,male,54.0,0,0,51.8625,S,First,man,...,False,False,True,False,False,True,3.967694,0.018519,7.348469,1
10,1,3,female,4.0,1,1,16.7000,S,Third,child,...,False,False,True,False,False,True,2.873565,0.249999,2.000000,3
11,1,1,female,58.0,0,0,26.5500,S,First,woman,...,False,False,True,False,False,True,3.316003,0.017241,7.615773,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,First,woman,...,False,False,True,False,False,True,3.980694,0.021277,6.855655,3
872,0,1,male,33.0,0,0,5.0000,S,First,man,...,False,False,True,False,False,True,1.791759,0.030303,5.744563,1
879,1,1,female,56.0,0,1,83.1583,C,First,woman,...,True,False,False,True,False,False,4.432700,0.017857,7.483315,2
887,1,1,female,19.0,0,0,30.0000,S,First,woman,...,False,False,True,False,False,True,3.433987,0.052632,4.358899,1
