In [1]:
import pandas as pd

# Load the Titanic training set into a DataFrame
df = pd.read_csv('/kaggle/input/titanic/train.csv')

# Report the number of missing values per column
print(df.isnull().sum())

# Impute missing ages with the column mean. The result is assigned back to the
# column: calling fillna(..., inplace=True) on the df['Age'] selection is a
# chained in-place update, which is deprecated in pandas 2.x and does not
# modify df when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# One-hot encode the categorical columns that are kept as features.
# drop_first=True removes one level per column to avoid redundant dummies.
# 'Embarked' is not used as a feature, so it is dropped rather than encoded.
df = pd.get_dummies(df, columns=['Sex', 'Pclass'], drop_first=True)
columns_to_drop = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(columns=columns_to_drop)

# Combine siblings/spouses and parents/children into one family-size feature,
# preview the frame, then remove the two source columns.
df['Fcount'] = df['SibSp'] + df['Parch']
print(df.head())
df = df.drop(columns=['SibSp', 'Parch'])

from sklearn.preprocessing import StandardScaler

# Standardise the two continuous features.
# NOTE(review): the scaler (and the Age mean above) are fitted on the full
# dataset, before the train/test split performed later in the notebook, so
# test-row statistics leak into preprocessing. Consider fitting on the
# training rows only (e.g. via a Pipeline).
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
   Survived   Age  SibSp  Parch     Fare  Sex_male  Pclass_2  Pclass_3  Fcount
0         0  22.0      1      0   7.2500      True     False      True       1
1         1  38.0      1      0  71.2833     False     False     False       1
2         1  26.0      0      0   7.9250     False     False      True       0
3         1  35.0      1      0  53.1000     False     False     False       1
4         0  35.0      0      0   8.0500      True     False      True       0




In [2]:
# Preview the first five rows of the preprocessed frame
print(df.head(n=5))

   Survived       Age      Fare  Sex_male  Pclass_2  Pclass_3  Fcount
0         0 -0.592481 -0.502445      True     False      True       1
1         1  0.638789  0.786845     False     False     False       1
2         1 -0.284663 -0.488854     False     False      True       0
3         1  0.407926  0.420730     False     False     False       1
4         0  0.407926 -0.486337      True     False      True       0


In [3]:
# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
from sklearn.model_selection import train_test_split

# Target is the 'Survived' column; every remaining column is a feature.
y = df['Survived']
X = df.drop(columns='Survived')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

Logistic Regression Model: 

In [4]:
# --- Logistic regression: fit on the training split ---
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

model = LogisticRegression().fit(X_train, y_train)

# --- Evaluate on the held-out split ---
y_pred = model.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



Random Forest Model: 

In [5]:
# Training

# Import the random forest model
from sklearn.ensemble import RandomForestClassifier
# Create a random forest classifier with 100 trees. random_state is fixed
# (same seed as the train/test split) so the forest, and therefore the
# metrics printed below, are reproducible on a fresh kernel run.
model2 = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training split
model2.fit(X_train, y_train)

# Testing

from sklearn.metrics import accuracy_score, classification_report

# Make predictions on the test data
y_pred = model2.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))


Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.82      0.86      0.84       105
           1       0.78      0.73      0.76        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.80      0.80      0.80       179



Gaussian Naive Bayes Model:

In [6]:
# --- Gaussian naive Bayes: fit on the training split ---
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB

model3 = GaussianNB().fit(X_train, y_train)

# --- Evaluate on the held-out split ---
y_pred = model3.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.79
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       105
           1       0.76      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



K-Nearest Neighbors (KNN) Model:

In [7]:
# --- k-nearest neighbours: fit on the training split ---
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier

# Number of neighbours consulted for each prediction
k = 4
model4 = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

# --- Evaluate on the held-out split ---
y_pred = model4.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.82
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.72      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179



In [9]:
# --- Ensemble: majority vote over the four models built above ---
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report

# NOTE(review): with four voters and voting='hard' a 2-2 split is possible;
# confirm that the library's tie-breaking rule is acceptable here, or use an
# odd number of voters / soft voting.
ensemble_model = VotingClassifier(
    voting='hard',
    estimators=[
        ('Random Forest', model2),
        ('K-Nearest Neighbors', model4),
        ('Naive Bayes', model3),
        ('Logistic Regression', model),
    ],
).fit(X_train, y_train)

# --- Evaluate on the held-out split ---
y_pred = ensemble_model.predict(X_test)

# Overall accuracy, followed by per-class precision / recall / F1
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

Accuracy: 0.80
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       105
           1       0.83      0.66      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.80      0.80       179

