<a href="https://colab.research.google.com/github/zahid-bracu/feature-engineering/blob/main/Part_6_1_Pipleline_Titanic_Dataset_with_out_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Load Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

#### Load Dataset

In [2]:
# seaborn bundles a loader for sample datasets, Titanic among them
import seaborn as sns

# Load the Titanic passenger table into a DataFrame
df = sns.load_dataset(name='titanic')

# Preview the leading rows to see which columns are available
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


#### Drop Unnecessary Columns

In [3]:
# Columns removed before modelling. Judging by the preview above, several of them
# restate information other columns already carry (e.g. 'class' vs 'pclass',
# 'alive' vs 'survived', 'embark_town' vs 'embarked').
redundant_columns = ['class', 'who', 'adult_male', 'deck', 'embark_town', 'alive', 'alone']
df = df.drop(columns=redundant_columns)

#### View Columns

In [4]:
# Confirm which columns remain after the drop
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


#### Check Null Values in Columns

In [5]:
# Count the missing entries in each column
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

#### Fill Null Values in the Age Column

In [6]:
# Column whose gaps are filled with its own mean
column_name = 'age'

# Series.mean() skips NaN by default, so this is the mean of the known ages.
# NOTE(review): the mean is taken over the whole dataset, before the train/test
# split further down, so held-out rows influence the imputed value.
mean_value = df[column_name].mean()

# Assign the filled Series back to the frame. Calling fillna(..., inplace=True) on
# df[column_name] is a chained in-place update: recent pandas versions emit a
# FutureWarning for it, and under Copy-on-Write it no longer modifies df at all.
df[column_name] = df[column_name].fillna(mean_value)

#### Check Null Values in Columns

In [7]:
# Re-count missing entries; 'age' should now report zero
df.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    2
dtype: int64

#### Fill Null Values in the Embarked Column

In [8]:
# Column whose gaps are filled with its most frequent value
column_name = 'embarked'

# mode() returns a Series (there can be ties); [0] takes its first entry.
# NOTE(review): like the age mean, this is computed on the full dataset,
# before the train/test split further down.
mode_value = df[column_name].mode()[0]

# Assign the filled Series back to the frame. Calling fillna(..., inplace=True) on
# df[column_name] is a chained in-place update: recent pandas versions emit a
# FutureWarning for it, and under Copy-on-Write it no longer modifies df at all.
df[column_name] = df[column_name].fillna(mode_value)

#### Check Null Values in Columns

In [9]:
# Final missing-value count; every column should now report zero
df.isna().sum()

survived    0
pclass      0
sex         0
age         0
sibsp       0
parch       0
fare        0
embarked    0
dtype: int64

#### View Data

In [10]:
# Preview the frame once the missing values have been filled
df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


#### One-Hot Encode the 'Sex' Column

In [11]:
from sklearn.preprocessing import OneHotEncoder

# Use the encoder's default sparse output and densify it explicitly below.
# The former `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing it raises a TypeError on
# current releases; `.toarray()` works on every version.
# No `drop` option is set, so an indicator column is kept for every category.
encoder = OneHotEncoder()

# Fit on the 'sex' column (2-D selection) and convert the result to a dense array
encoded_sex = encoder.fit_transform(df[['sex']]).toarray()

# Wrap the array in a DataFrame that shares df's index, so the column-wise concat
# below lines rows up by label even if df's index is not the default 0..n-1 range
encoded_sex_df = pd.DataFrame(
    encoded_sex,
    columns=encoder.get_feature_names_out(['sex']),
    index=df.index,
)

# Replace the original 'sex' column with its indicator columns
df = pd.concat([df.drop(columns='sex'), encoded_sex_df], axis=1)



#### View Data

In [12]:
# Preview the frame with 'sex' replaced by its indicator columns
df.head(n=5)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,embarked,sex_female,sex_male
0,0,3,22.0,1,0,7.25,S,0.0,1.0
1,1,1,38.0,1,0,71.2833,C,1.0,0.0
2,1,3,26.0,0,0,7.925,S,1.0,0.0
3,1,1,35.0,1,0,53.1,S,1.0,0.0
4,0,3,35.0,0,0,8.05,S,0.0,1.0


#### One-Hot Encode the 'Embarked' Column

In [13]:
from sklearn.preprocessing import OneHotEncoder

# Use the encoder's default sparse output and densify it explicitly below.
# The former `sparse=False` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so passing it raises a TypeError on
# current releases; `.toarray()` works on every version.
# No `drop` option is set, so an indicator column is kept for every category.
encoder = OneHotEncoder()

# Fit on the 'embarked' column (2-D selection) and convert the result to a dense array
encoded_embarked = encoder.fit_transform(df[['embarked']]).toarray()

# Wrap the array in a DataFrame that shares df's index, so the column-wise concat
# below lines rows up by label even if df's index is not the default 0..n-1 range
encoded_embarked_df = pd.DataFrame(
    encoded_embarked,
    columns=encoder.get_feature_names_out(['embarked']),
    index=df.index,
)

# Replace the original 'embarked' column with its indicator columns
df = pd.concat([df.drop(columns='embarked'), encoded_embarked_df], axis=1)



#### View Data

In [14]:
# Show the fully encoded frame; pandas abbreviates long frames in the rendered output
df

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,0,3,22.000000,1,0,7.2500,0.0,1.0,0.0,0.0,1.0
1,1,1,38.000000,1,0,71.2833,1.0,0.0,1.0,0.0,0.0
2,1,3,26.000000,0,0,7.9250,1.0,0.0,0.0,0.0,1.0
3,1,1,35.000000,1,0,53.1000,1.0,0.0,0.0,0.0,1.0
4,0,3,35.000000,0,0,8.0500,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,27.000000,0,0,13.0000,0.0,1.0,0.0,0.0,1.0
887,1,1,19.000000,0,0,30.0000,1.0,0.0,0.0,0.0,1.0
888,0,3,29.699118,1,2,23.4500,1.0,0.0,0.0,0.0,1.0
889,1,1,26.000000,0,0,30.0000,0.0,1.0,1.0,0.0,0.0


#### Separate Input Columns & Target Column


In [15]:
from sklearn.model_selection import train_test_split

# 'survived' is the label to predict; every remaining column is a feature
target_column = 'survived'
y = df[target_column]                   # Target
X = df.drop(columns=[target_column])    # Features

#### Test & Train Split

In [16]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

#### Decision Tree Classifier

In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Build the tree with a fixed seed and fit it on the training split
# (fit() returns the estimator itself, so construction and training are chained)
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = clf.predict(X_test)

# Score the predictions against the true test labels
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

In [18]:
# Report the metrics computed in the previous cell, one labelled section each
print(f"Accuracy: {accuracy:.2f}")
for heading, result in (
    ("Classification Report:", report),
    ("Confusion Matrix:", conf_matrix),
):
    print(heading)
    print(result)

Accuracy: 0.79
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       105
           1       0.74      0.76      0.75        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179

Confusion Matrix:
[[85 20]
 [18 56]]
