In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import joblib
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by
# later cells will not be shown either.
warnings.simplefilter('ignore')

print("âœ… Libraries loaded successfully!")

âœ… Libraries loaded successfully!


In [2]:
# Read the Titanic training data from the working directory.
df = pd.read_csv('train.csv')
n_passengers, n_features = df.shape

# Show a small preview followed by the overall dimensions.
print("Dataset loaded! Here are the first 5 rows:")
print(df.head())

print(f"\nDataset has {n_passengers} passengers and {n_features} features")

Dataset loaded! Here are the first 5 rows:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0    

In [3]:
# Restrict the frame to the permitted columns (seven predictors plus the
# 'Survived' target); everything else is dropped.
allowed_features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked',
    'Survived',
]

df = df[allowed_features]

# Per-column count of missing entries, reported below.
missing_per_column = df.isnull().sum()

print("âœ… Selected only the allowed features")
print(df.head())
print(f"\nMissing values:\n{missing_per_column}")

âœ… Selected only the allowed features
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked  Survived
0       3    male  22.0      1      0   7.2500        S         0
1       1  female  38.0      1      0  71.2833        C         1
2       3  female  26.0      0      0   7.9250        S         1
3       1  female  35.0      1      0  53.1000        S         1
4       3    male  35.0      0      0   8.0500        S         0

Missing values:
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64


In [4]:
# Replacement value per column: median for the numeric columns, most
# frequent value for the categorical 'Embarked' column.
# NOTE(review): these statistics are computed on the full frame, i.e. before
# the train/test split performed later in the notebook.
fill_values = {
    'Age': df['Age'].median(),
    'Fare': df['Fare'].median(),
    'Embarked': df['Embarked'].mode()[0],
}

# Apply all replacements at once, then drop any row that still has a gap.
df = df.fillna(value=fill_values).dropna()

remaining_nulls = df.isnull().sum()

print("âœ… Missing data handled!")
print(f"Clean dataset has {len(df)} passengers")
print(f"\nMissing values now:\n{remaining_nulls}")

âœ… Missing data handled!
Clean dataset has 891 passengers

Missing values now:
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
Survived    0
dtype: int64


In [5]:
# Choose the 5 predictors used by the model (any 5 of the 7 available
# columns may be substituted here).
selected_features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']

# y: survival label (0 = died, 1 = survived)
# X: independent copy of the predictor columns, so later edits to X do not
#    touch df
y = df['Survived']
X = df[selected_features].copy()

print("âœ… Selected these 5 features:")
for i, feature in enumerate(selected_features, start=1):
    print(f"  {i}. {feature}")

print(f"\nTarget variable: Survived (0 = No, 1 = Yes)")

âœ… Selected these 5 features:
  1. Pclass
  2. Sex
  3. Age
  4. Fare
  5. Embarked

Target variable: Survived (0 = No, 1 = Yes)


In [6]:
# Replace the two text columns with integer codes. LabelEncoder numbers the
# distinct values by their position in the fitted `classes_` array.
# NOTE(review): the encoders are fitted on the current contents of X, so
# re-running this cell after X is already encoded refits them on the integer
# codes and overwrites the saved .pkl files.
sex_encoder = LabelEncoder()
X['Sex'] = sex_encoder.fit_transform(X['Sex'])

embarked_encoder = LabelEncoder()
X['Embarked'] = embarked_encoder.fit_transform(X['Embarked'])

# Save encoders so the same mapping can be applied at prediction time
joblib.dump(sex_encoder, 'sex_encoder.pkl')
joblib.dump(embarked_encoder, 'embarked_encoder.pkl')

# Build the label -> code tables from the fitted encoders rather than
# hard-coding them: classes_[k] is encoded as k. Plain str/int values keep
# the printed mapping free of numpy scalar reprs such as np.int64(0).
sex_codes = {str(label): code for code, label in enumerate(sex_encoder.classes_)}
embarked_codes = {str(label): code for code, label in enumerate(embarked_encoder.classes_)}

print("âœ… Encoded categorical variables")
print("\nSex encoding: " + ", ".join(f"{label}={code}" for label, code in sex_codes.items()))
print(f"Embarked encoding: {embarked_codes}")
print("\nData preview:")
print(X.head())

âœ… Encoded categorical variables

Sex encoding: female=0, male=1
Embarked encoding: {'C': np.int64(0), 'Q': np.int64(1), 'S': np.int64(2)}

Data preview:
   Pclass  Sex   Age     Fare  Embarked
0       3    1  22.0   7.2500         2
1       1    0  38.0  71.2833         0
2       3    0  26.0   7.9250         2
3       1    0  35.0  53.1000         2
4       3    1  35.0   8.0500         2


In [7]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("âœ… Data split complete!")
print(f"Training set: {len(X_train)} passengers")
print(f"Testing set: {len(X_test)} passengers")

âœ… Data split complete!
Training set: 712 passengers
Testing set: 179 passengers


In [8]:
# Build the logistic-regression classifier and fit it on the training split
# in one expression (fit() returns the fitted estimator itself).
model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

print("âœ… Model training complete!")
print("The model has learned to predict survival!")

âœ… Model training complete!
The model has learned to predict survival!


In [9]:
# Predict survival for every row of the held-out test split
y_pred = model.predict(X_test)

print("âœ… Predictions made on test data")
print("\nExample predictions:")

# Show up to the first 5 test rows. Slicing and zipping (instead of indexing
# positions 0..4 directly) keeps this working when the test split holds
# fewer than 5 rows.
# The printed number is the position within the test split, not a PassengerId.
for position, (truth, guess) in enumerate(zip(y_test.iloc[:5], y_pred[:5]), start=1):
    actual = "Survived" if truth == 1 else "Did Not Survive"
    predicted = "Survived" if guess == 1 else "Did Not Survive"
    print(f"Passenger {position}: Actual = {actual}, Predicted = {predicted}")

âœ… Predictions made on test data

Example predictions:
Passenger 1: Actual = Survived, Predicted = Did Not Survive
Passenger 2: Actual = Did Not Survive, Predicted = Did Not Survive
Passenger 3: Actual = Did Not Survive, Predicted = Did Not Survive
Passenger 4: Actual = Survived, Predicted = Survived
Passenger 5: Actual = Survived, Predicted = Survived


In [10]:
# Overall fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
rule = "=" * 60

print("ðŸ“Š MODEL PERFORMANCE:")
print(rule)
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(rule)

# Per-class precision / recall / F1; the names are given in label order (0, 1)
print("\nðŸ“‹ CLASSIFICATION REPORT:")
class_names = ['Did Not Survive', 'Survived']
print(classification_report(y_test, y_pred, target_names=class_names))

# Confusion matrix: row index = actual label, column index = predicted label
print("\nðŸ“Š CONFUSION MATRIX:")
cm = confusion_matrix(y_test, y_pred)
print(cm)
print("\nInterpretation:")
print(f"  Correctly predicted deaths: {cm[0, 0]}")
print(f"  Correctly predicted survivals: {cm[1, 1]}")
# actual = survived, predicted = died
print(f"  Incorrectly predicted deaths: {cm[1, 0]}")
# actual = died, predicted = survived
print(f"  Incorrectly predicted survivals: {cm[0, 1]}")

ðŸ“Š MODEL PERFORMANCE:
Accuracy: 0.7989 (79.89%)

ðŸ“‹ CLASSIFICATION REPORT:
                 precision    recall  f1-score   support

Did Not Survive       0.82      0.84      0.83       105
       Survived       0.76      0.74      0.75        74

       accuracy                           0.80       179
      macro avg       0.79      0.79      0.79       179
   weighted avg       0.80      0.80      0.80       179


ðŸ“Š CONFUSION MATRIX:
[[88 17]
 [19 55]]

Interpretation:
  Correctly predicted deaths: 88
  Correctly predicted survivals: 55
  Incorrectly predicted deaths: 19
  Incorrectly predicted survivals: 17


In [11]:
# Persist the fitted classifier to disk. The two encoder files named in the
# messages below were written by the encoding cell, not here.
joblib.dump(model, 'titanic_survival_model.pkl')

for saved_message in (
    "âœ… Model saved as 'titanic_survival_model.pkl'",
    "âœ… Sex encoder saved as 'sex_encoder.pkl'",
    "âœ… Embarked encoder saved as 'embarked_encoder.pkl'",
    "\nðŸŽ‰ PART A COMPLETE!",
):
    print(saved_message)

âœ… Model saved as 'titanic_survival_model.pkl'
âœ… Sex encoder saved as 'sex_encoder.pkl'
âœ… Embarked encoder saved as 'embarked_encoder.pkl'

ðŸŽ‰ PART A COMPLETE!


In [12]:
# Load the three saved artifacts back from disk.
# NOTE: joblib.load unpickles its input, so only load files you trust.
loaded_model, loaded_sex_encoder, loaded_embarked_encoder = (
    joblib.load(artifact)
    for artifact in (
        'titanic_survival_model.pkl',
        'sex_encoder.pkl',
        'embarked_encoder.pkl',
    )
)

# Sample passenger: Pclass=3, Sex=male, Age=22, Fare=7.25, Embarked=S.
# The text fields go through the reloaded encoders first.
sex_code = loaded_sex_encoder.transform(['male'])
embarked_code = loaded_embarked_encoder.transform(['S'])

# Columns are listed in the same order as the training features.
test_passenger = pd.DataFrame({
    'Pclass': [3],
    'Sex': sex_code,
    'Age': [22],
    'Fare': [7.25],
    'Embarked': embarked_code,
})

prediction = loaded_model.predict(test_passenger)[0]
result = "Did Not Survive" if prediction != 1 else "Survived"

print("âœ… Model reloaded successfully!")
print(f"\nTest Passenger Prediction: {result}")

âœ… Model reloaded successfully!

Test Passenger Prediction: Did Not Survive
