<a href="https://www.kaggle.com/code/wsessoms/hw1-skl-version?scriptVersionId=262716353" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# My Logistic Regression Model (scikit-learn Pipeline Version)
## Final Submission Accuracy: 0.76794
## Final Submission Ranking: 8998

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
import seaborn as sns

# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
# The raw frames are kept unmodified; test_df_ori['PassengerId'] is still
# needed when the submission file is assembled.
train_df_ori = pd.read_csv('/kaggle/input/titanic/train.csv')
test_df_ori = pd.read_csv('/kaggle/input/titanic/test.csv')

# ---------------------------------------------------------------------------
# Feature selection
# ---------------------------------------------------------------------------
# Columns treated as irrelevant for prediction
irrelevant = ['PassengerId', 'Name']
# Columns excluded by the original analysis as having too many missing values
# NOTE(review): confirm that rationale also holds for 'Ticket'.
missing = ['Cabin', 'Ticket']

# drop() returns a new frame, so the *_ori frames are never mutated and no
# explicit copy()/inplace=True is required.
train_df = train_df_ori.drop(columns=irrelevant + missing)
test_df = test_df_ori.drop(columns=irrelevant + missing)

# Separate the target from the predictors
y = train_df['Survived']
X = train_df.drop(columns=['Survived'])

# ---------------------------------------------------------------------------
# Train / validation split
# ---------------------------------------------------------------------------
rng1 = np.random.RandomState(seed=865)

# Split data into training/validation 85/15, stratified on the target so both
# parts keep the same survival ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=rng1, stratify=y
)

# ---------------------------------------------------------------------------
# Preprocessing (imputation + OHE + scaling) and model
# ---------------------------------------------------------------------------
# Imputation lives INSIDE the pipeline instead of being applied to the whole
# training frame up front. Fitting the imputers before the split let the
# validation rows influence the imputed mean / most-frequent values (data
# leakage), which makes the validation score optimistic. As pipeline steps,
# they are fitted on X_train only and re-applied unchanged to X_val and test_df
# whenever pipe.predict() is called.
# Metrics may therefore differ slightly from figures recorded before this change.
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mf_imputer = SimpleImputer(strategy="most_frequent")

norm_scaler = ColumnTransformer(
    transformers=[
        # Categorical: fill gaps with the most frequent category, then one-hot
        # encode (first level dropped to avoid a redundant column).
        # Imputation is a no-op for a column without missing values.
        ('category',
         make_pipeline(mf_imputer, OneHotEncoder(drop='first')),
         ['Sex', 'Embarked']),
        # Small discrete counts/levels: rescale to [0, 1]
        ('minmax', MinMaxScaler(), ['Pclass', 'Parch']),
        # Numeric: fill gaps with the training mean, then standardise.
        ('standard',
         make_pipeline(mean_imputer, StandardScaler()),
         ['Age', 'SibSp', 'Fare']),
    ]
)

# Create logistic regression pipeline and train on the training split only
pipe = make_pipeline(norm_scaler, LogisticRegression(max_iter=1000))
pipe.fit(X_train, y_train)


# --- Evaluation --------------------------------------------------------------
# Accuracy on the rows the pipeline was fitted on
y_hat = pipe.predict(X_train)
print(f"Model training accuracy is {accuracy_score(y_train, y_hat)*100:0.2f}%")
# Accuracy on the held-out validation rows
y_hat = pipe.predict(X_val)
print(f"Model validation accuracy is {accuracy_score(y_val, y_hat)*100:0.2f}%")

# Figures recorded from an earlier run with seed=865:
#   training   80.05%
#   validation 79.10%

# --- Kaggle submission -------------------------------------------------------
# Predict the test passengers, then pair each prediction with the PassengerId
# taken from test_df_ori (the id column is not part of the model features).
y_test_pred = pipe.predict(test_df)
submission = test_df_ori[["PassengerId"]].assign(Survived=y_test_pred)

# Write the two-column CSV without the index column
submission.to_csv("submission.csv", index=False)
print("submission.csv created")

Model training accuracy is 80.05%
Model validation accuracy is 79.10%
submission.csv created
