In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('/home/roots/Desktop/my_git_repo/MLlearning/datasets/titanic.csv')
print(df.columns.tolist())


['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']


In [28]:
# Check for missing values
print("Missing values in each column:")
print(df.isnull().sum())

Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [29]:
# Drop Cabin column
df.drop(columns=['Cabin'], inplace=True)


In [10]:
# Define features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']


In [30]:
# Step 5: Prepare feature matrix X and target vector y
X = df[features]
y = df['Survived']

In [31]:
# Convert categorical variables ('Sex', 'Embarked') to dummy variables
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

In [32]:

# Split dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


In [33]:
# Initialize Random Forest classifier and train
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
model.fit(X_train, y_train)

# n_estimators=100:-
#     This means the forest will consist of 100 decision trees. More trees generally improve performance but increase training time.
#
# max_depth=5:-
#     This limits each decision tree to a maximum depth of 5 levels. It controls how complex each tree can grow, helping prevent overfitting
#     (too much memorization of training data).
#
# random_state=42:
#     Sets the seed for the random number generator to 42 (an arbitrary number). This makes the results reproducible — if you run the code again, you'll get 
#     the same model and predictions.
#
#     How does random_state help?
#         - random_state is a parameter that sets the seed for the random number generator.
#         - A seed is like the "starting point" for generating random numbers.
#         - When you specify random_state=42, you tell the algorithm:
#             - “Use this fixed starting point to generate all random choices.”
#             - So the sequence of random decisions (like which data points and features to pick) will always be the same.


In [34]:
# Make predictions on the test set
y_pred = model.predict(X_test)

In [35]:
# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
#print(classification_report(y_test, y_pred))


Accuracy: 0.8134328358208955


In [36]:
def predict_survival(model, input_data):
    """
    Predict Titanic survival using the trained Random Forest model.
    
    Parameters:
        model: Trained RandomForestClassifier
        input_data: dict 
    
    Returns:
        prediction: int (0 = did not survive, 1 = survived)
    """
    # Convert input_data dict to DataFrame if needed
    if isinstance(input_data, dict):
        input_df = pd.DataFrame([input_data])
    else:
        input_df = input_data.copy()
    
    # One-hot encode categorical variables same as training
    input_df = pd.get_dummies(input_df, columns=['Sex', 'Embarked'], drop_first=True)
    
    # Make sure all expected columns exist (add missing columns with 0)
    for col in model.feature_names_in_:
        if col not in input_df.columns:
            input_df[col] = 0
    
    # Reorder columns to match training features
    input_df = input_df[model.feature_names_in_]
    
    # Predict and return the result
    prediction = model.predict(input_df)
    return prediction[0]


In [43]:
new_passenger1 = {
    'Pclass': 2,
    'Sex': 'male',
    'Age': 25,
    'SibSp': 3,
    'Parch': 2,
    'Fare': 7.25,
    'Embarked': 'S'
}

new_passenger2 = {
    'Pclass': 3,        # Passenger Class.
    'Sex': 'male',
    'Age': 7,
    'SibSp': 3,         # Number of siblings or spouses the passenger
    'Parch': 2,         # Number of parents or children
    'Fare': 15.2458,
    'Embarked': 'Q'     #The port where the passenger boarded the Titanic.
}

result = predict_survival(model, new_passenger1)
print("Survived" if result == 1 else "Did not survive")


Survived
