In [11]:
import pandas as pd

# Load your dataset (replace 'your_data.csv' with your actual file path)
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')



In [12]:
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [13]:
print(train_data.columns)

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [14]:
print(test_data.head())

   PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0      1      1  3101298  12.2875   NaN        S  


In [15]:
print(test_data.columns)

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [16]:
# Mount Google Drive (if the model is saved there)
from google.colab import drive
drive.mount('/content/drive')

# Define the path to the saved model
model_path = '/content/drive/MyDrive/models/best_model.pkl'  # Update this path as necessary


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Load the trained model
try:
    model = joblib.load(model_path)
    print("Model loaded successfully.")
except FileNotFoundError:
    print(f"Model file not found at '{model_path}'. Please check the path and try again.")
    # Exit if the model cannot be loaded
    import sys
    sys.exit()

# Define a function to preprocess new input data
def preprocess_input(input_data):
    """
    Preprocess the input data to match the training data format.

    Parameters:
    - input_data (dict): Dictionary containing input features.

    Returns:
    - pd.DataFrame: Preprocessed input features.
    """
    # Convert input dictionary to DataFrame
    input_df = pd.DataFrame([input_data])

    # Add 'PassengerId' if not present
    if 'PassengerId' not in input_df.columns:
        input_df['PassengerId'] = 0  # Dummy value

    # Add 'Age' if not present
    if 'Age' not in input_df.columns:
        input_df['Age'] = 30  # Replace with mean Age from training data if available

    # One-hot encode categorical variables ('Sex' and 'Embarked')
    input_df = pd.get_dummies(input_df, columns=['Sex', 'Embarked'], drop_first=True)

    # Ensure all necessary columns are present
    expected_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q', 'Embarked_S', 'Age', 'PassengerId']
    for col in expected_cols:
        if col not in input_df.columns:
            input_df[col] = 0  # Add missing columns with default value 0

    # Reorder columns to match training data
    input_df = input_df[expected_cols]

    return input_df

# Example input including 'Age' and 'PassengerId'
example_input = {
    'PassengerId': 0,  # Dummy value
    'Pclass': 3,
    'SibSp': 1,
    'Parch': 0,
    'Fare': 7.25,
    'Sex': 'male',
    'Embarked': 'S',
    'Age': 30  # Example age
}

# Preprocess the input
processed_input = preprocess_input(example_input)

# Make prediction
try:
    prediction = model.predict(processed_input)
    prediction_proba = model.predict_proba(processed_input)[:, 1]

    print(f"Predicted Survival: {'Survived' if prediction[0] == 1 else 'Did Not Survive'}")
    print(f"Survival Probability: {prediction_proba[0]:.2f}")
except ValueError as ve:
    print(f"ValueError during prediction: {ve}")
except Exception as e:
    print(f"An unexpected error occurred during prediction: {e}")

Model loaded successfully.
ValueError during prediction: The feature names should match those that were passed during fit.
Feature names must be in the same order as they were in fit.



In [20]:
print(model.feature_names_in_)


['PassengerId' 'Pclass' 'Age' 'SibSp' 'Parch' 'Fare' 'Sex_male'
 'Embarked_Q' 'Embarked_S']


In [21]:
print(processed_input.columns)


Index(['Pclass', 'SibSp', 'Parch', 'Fare', 'Sex_male', 'Embarked_Q',
       'Embarked_S', 'Age', 'PassengerId'],
      dtype='object')
