# Logistic Regression

In [20]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


In [21]:
# Fold id used below to split the data: rows whose kfold equals this value
# become the validation set, all other rows become the training set
fold = 0

In [22]:
# Read the fold-annotated training data and discard the PassengerId column
# in the same expression, so the frame is never modified in place
df = pd.read_csv("../input/train_folds.csv").drop(columns=['PassengerId'])

In [23]:
# Cabin is a "/"-delimited string; split it into three new columns
df[['Cabin_Deck', 'Cabin_Number', 'Cabin_Side']] = df['Cabin'].str.split("/", expand=True)

# The raw Cabin string is redundant once it has been split
df = df.drop(columns=['Cabin'])

# The split yields text; cast Cabin_Number to a numeric dtype, turning any
# value that cannot be parsed into NaN
df['Cabin_Number'] = pd.to_numeric(df['Cabin_Number'], errors='coerce')

In [24]:
# Partition rows by fold membership: rows outside the selected fold form the
# training set, rows inside it form the validation set. The kfold column is
# removed from both frames and their indexes are renumbered from zero.
df_train = df.loc[df.kfold != fold].drop(columns=['kfold']).reset_index(drop=True)
df_valid = df.loc[df.kfold == fold].drop(columns=['kfold']).reset_index(drop=True)

In [25]:
# Build the feature column lists by dtype.
# df still contains the target (Transported) and the fold id (kfold); neither
# is a feature, so both are excluded explicitly. Without this, a target that
# happens to be loaded with a numeric or object dtype would end up in one of
# the lists and break the later X_train[...] / X_valid[...] column selections.
non_feature_cols = {'Transported', 'kfold'}

# Numerical feature columns
num_cols = [
    col for col in df.select_dtypes(include=np.number).columns
    if col not in non_feature_cols
]

# Categorical (object-dtype) feature columns
cat_cols = [
    col for col in df.select_dtypes(include='object').columns
    if col not in non_feature_cols
]

In [26]:
# Feature matrices: every column except the Transported target
X_train = df_train.drop(columns=['Transported'])
X_valid = df_valid.drop(columns=['Transported'])

# Target vectors: the Transported column on its own
y_train = df_train['Transported']
y_valid = df_valid['Transported']

In [27]:
# Fill missing values. Each imputer learns its fill values from the training
# fold only and is then applied to both folds.

# Numerical columns: fill with the training-fold column mean
num_imputer = SimpleImputer(strategy='mean').fit(X_train[num_cols])
X_train[num_cols] = num_imputer.transform(X_train[num_cols])
X_valid[num_cols] = num_imputer.transform(X_valid[num_cols])

# Categorical columns: fill with the most frequent training-fold value
cat_imputer = SimpleImputer(strategy='most_frequent').fit(X_train[cat_cols])
X_train[cat_cols] = cat_imputer.transform(X_train[cat_cols])
X_valid[cat_cols] = cat_imputer.transform(X_valid[cat_cols])

In [28]:
# Standardize the numerical columns: the scaler is fitted on the training
# fold, then the same fitted scaler is applied to both folds
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_valid[num_cols] = scaler.transform(X_valid[num_cols])

In [29]:
# One-hot encode the categorical columns.
# The encoder is fitted on the training fold only, so nothing about the
# validation fold influences preprocessing. handle_unknown='ignore' makes a
# category that appears only in the validation fold encode as all zeros for
# that feature instead of raising an error at transform time.
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[cat_cols])

# Transform training and validation features (sparse result made dense)
X_train_ohe = ohe.transform(X_train[cat_cols]).toarray()
X_valid_ohe = ohe.transform(X_valid[cat_cols]).toarray()

# Create the final dataset: numerical columns followed by the one-hot columns.
# NOTE: X_train / X_valid are rebound from DataFrames to NumPy arrays here, so
# this cell cannot be re-run on its own; re-run the cells that build
# X_train / X_valid first.
X_train = np.hstack((X_train[num_cols].values, X_train_ohe))
X_valid = np.hstack((X_valid[num_cols].values, X_valid_ohe))


In [30]:
# Create a logistic-regression classifier with default settings and train it
# on the training fold (fit returns the fitted estimator itself)
model = LogisticRegression().fit(X_train, y_train)

# Class predictions for the validation fold
preds = model.predict(X_valid)

In [31]:
# Score the validation predictions against the true labels and report the
# result together with the fold that was held out
accuracy = metrics.accuracy_score(y_true=y_valid, y_pred=preds)
print(f"Fold={fold}, Accuracy={accuracy}")

Fold=0, Accuracy=0.7717078780908568
