In [1]:
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import pandas as pd

# Read the Titanic passenger table directly from its public CSV location.
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
titanic_data = pd.read_csv(filepath_or_buffer=url)

# Preview the first five rows as the cell's output.
titanic_data.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Retain PassengerId for tracking purposes (it is excluded from the features later).
passenger_ids = titanic_data['PassengerId']

# Impute missing ages with the column median.
# NOTE(review): the median is taken over the full dataset, i.e. before the
# train/test split, so test rows influence the imputed value — consider
# computing it on the training split only.
titanic_data['Age'] = titanic_data['Age'].fillna(titanic_data['Age'].median())

# Fill missing embarkation ports with 'S'.
# NOTE(review): 'S' is hard-coded; presumably the most frequent port — confirm.
titanic_data['Embarked'] = titanic_data['Embarked'].fillna('S')

# Convert 'Sex' to numeric: male=0, female=1.
# Guarded so the cell can be re-run safely: mapping an already-encoded 0/1
# column through this dict would turn every value into NaN, because neither
# 0 nor 1 is a key of the mapping.
if not pd.api.types.is_numeric_dtype(titanic_data['Sex']):
    titanic_data['Sex'] = titanic_data['Sex'].map({'male': 0, 'female': 1})


In [4]:
# Drop columns that are not used as model features.
# Rebinding the name (instead of inplace=True) together with errors='ignore'
# keeps the cell idempotent: re-running it after the columns are already gone
# no longer raises KeyError.
titanic_data = titanic_data.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [5]:
# Replace 'Embarked' with indicator columns; drop_first=True omits the first
# category's indicator so the remaining ones are not redundant with it.
titanic_data = pd.get_dummies(
    data=titanic_data,
    columns=['Embarked'],
    drop_first=True,
)

In [6]:
from sklearn.model_selection import train_test_split

# Model inputs: every column except the target and the PassengerId tracking key.
X = titanic_data.drop(columns=['Survived', 'PassengerId'])
# Target: the 'Survived' column.
y = titanic_data['Survived']

# Hold out 20% of the rows for evaluation with a fixed seed. passenger_ids is
# split alongside X and y so each test-set row stays paired with its PassengerId.
(
    X_train, X_test,
    y_train, y_test,
    passenger_ids_train, passenger_ids_test,
) = train_test_split(X, y, passenger_ids, random_state=42, test_size=0.2)

In [7]:
# Import LogisticRegression
from sklearn.linear_model import LogisticRegression

# Create the classifier with the iteration cap raised to 5000
# (presumably so the solver converges; raise it further if needed).
model = LogisticRegression(max_iter=5000)

# Fit the classifier on the training split.
model.fit(X=X_train, y=y_train)

In [8]:
from sklearn.metrics import accuracy_score

# Predict the target for every row of the held-out test split.
y_pred = model.predict(X_test)

# Pair each prediction with the PassengerId of the row it was made for.
predictions_with_id = pd.DataFrame(
    {'PassengerId': passenger_ids_test, 'Prediction': y_pred}
)

# Show the first few tracked predictions.
print(predictions_with_id.head())

# Share of test labels predicted correctly, reported as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Model Accuracy: {accuracy * 100:.2f}%')


     PassengerId  Prediction
709          710           0
439          440           0
840          841           0
720          721           1
39            40           1
Model Accuracy: 81.01%


In [9]:
import joblib

# Persist the fitted classifier to disk with joblib.
joblib.dump(value=model, filename='titanic_logistic_model.pkl')

['titanic_logistic_model.pkl']

In [10]:
import joblib
import numpy as np
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Restore the fitted classifier from the pickle file.
model = joblib.load('titanic_logistic_model.pkl')

# Declare the ONNX graph input: a single float tensor named 'float_input'
# whose first dimension is left open (None) and whose second dimension is
# the number of feature columns in X_train.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Translate the scikit-learn estimator into an ONNX model.
onnx_model = convert_sklearn(model, initial_types=initial_type)

# Write the serialized ONNX model to disk in binary mode.
with open("titanic_logistic_model.onnx", "wb") as f:
    serialized = onnx_model.SerializeToString()
    f.write(serialized)


In [11]:
import onnxruntime as rt
import numpy as np
import joblib

# Reload the pickled scikit-learn model to serve as the reference implementation.
sklearn_model = joblib.load('titanic_logistic_model.pkl')

# Open an inference session on the exported ONNX file.
onnx_model_path = "titanic_logistic_model.onnx"
sess = rt.InferenceSession(onnx_model_path)

# One hand-written sample as a (1, 8) float32 array. The values appear to be
# the first dataset row in feature order: Pclass, Sex, Age, SibSp, Parch, Fare,
# Embarked_Q, Embarked_S — verify if the preprocessing changes.
input_data = np.array(
    [[3, 0, 22, 1, 0, 7.25, 0, 1]],
    dtype=np.float32,
)

# Look up the names of the session's first input and first output.
first_input, first_output = sess.get_inputs()[0], sess.get_outputs()[0]
input_name = first_input.name
output_name = first_output.name

# Run the ONNX model, requesting only the first output.
onnx_prediction = sess.run(
    output_names=[output_name],
    input_feed={input_name: input_data},
)

print(f"ONNX Model Prediction: {onnx_prediction[0][0]}")

# Run the same sample through the scikit-learn model for comparison.
# NOTE(review): the model was fitted on a DataFrame, so predicting from a bare
# array may emit scikit-learn's feature-name warning — confirm.
sklearn_prediction = sklearn_model.predict(input_data)

print(f"Scikit-learn Model Prediction: {sklearn_prediction[0]}")


ONNX Model Prediction: 0
Scikit-learn Model Prediction: 0


