In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score

# Titanic survival classification: load the CSV, split it, preprocess numeric
# and categorical columns separately, select informative features and fit a
# random forest, then report accuracy on the held-out test set.

# Load the dataset
# NOTE(review): absolute, machine-specific path - the script only runs where
# this exact file exists.
titanic_data = pd.read_csv(r"C:\Users\sumit bhot\Downloads\tatanic data set\Titanic-Dataset.csv")

# Task 1: Split the dataset into features (X) and target variable (y)
X = titanic_data.drop('Survived', axis=1)
y = titanic_data['Survived']
print("Features (X):")
print(X.head())
print("\nTarget variable (y):")
print(y.head())

# Task 2: Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nTraining data shape:", X_train.shape)
print("Test data shape:", X_test.shape)

# Task 3: Define the preprocessing steps for numerical and categorical features separately
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Task 4: Combine the preprocessing steps using ColumnTransformer
# Identifier-like columns are kept out of the model. PassengerId is a row
# counter, and Name/Ticket/Cabin are per-passenger text (presumably close to
# one distinct value per row - verify with X.nunique()). One-hot encoding them
# would add roughly one indicator column per training row, and those columns
# are all ignored at predict time because of handle_unknown='ignore'.
# X itself is left untouched; ColumnTransformer drops every column that is not
# listed below (its default remainder behaviour).
identifier_columns = {'PassengerId', 'Name', 'Ticket', 'Cabin'}

# Column lists come from the training frame only. Selecting by "number" /
# "not number" instead of exact dtype names also covers other integer/float
# widths and non-object string dtypes.
numeric_features = [
    column
    for column in X_train.select_dtypes(include='number').columns
    if column not in identifier_columns
]
categorical_features = [
    column
    for column in X_train.select_dtypes(exclude='number').columns
    if column not in identifier_columns
]

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ])
print("\nPreprocessor details:")
print(preprocessor)

# Task 5: Define the pipeline with feature selection, preprocessing, and the classifier (Random Forest)
# The selection step fits its own forest and keeps the features whose
# importance passes SelectFromModel's default threshold; the final classifier
# is then trained on that reduced feature set.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(
        RandomForestClassifier(n_estimators=100, random_state=42))),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
print("\nPipeline details:")
print(pipeline)

# Task 6: Fit the model on the training data
pipeline.fit(X_train, y_train)
print("\nModel fitted successfully.")

# Task 7: Make predictions on the test data
y_pred = pipeline.predict(X_test)
print("\nPredictions made successfully.")

# Task 8: Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)


Features (X):
   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S  
1  female  38.0      1      0          PC 17599  71.2833   C85        C  
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S  
3  female  35.0      1      0            113803  53.1000  C123        S  
4    male  35.0      0      0            373450   8.0500   NaN        S  

Target variable (y):
0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: