In [1]:
import pandas as pd

In [2]:
df_train = pd.read_csv("data/train.csv", delimiter=",")
df_test = pd.read_csv("data/test.csv", delimiter=",")

In [3]:
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df_test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [5]:
df_train['Embarked'] = df_train['Embarked'].fillna(df_train['Embarked'].mode()[0],)
df_train['Age'] = df_train['Age'].fillna(df_train['Age'].median())
df_test['Embarked'] = df_test['Embarked'].fillna(df_train['Embarked'].mode()[0],)
df_test['Age'] = df_test['Age'].fillna(df_train['Age'].median())
df_test['Fare'] = df_test['Fare'].fillna(df_train['Fare'].median())

In [6]:
x_train_wName = df_train[['Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked']]
X = df_train[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

y = df_train['Survived']

In [7]:
X

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.2500,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.9250,S
3,1,female,35.0,1,0,53.1000,S
4,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.0,0,0,13.0000,S
887,1,female,19.0,0,0,30.0000,S
888,3,female,28.0,1,2,23.4500,S
889,1,male,26.0,0,0,30.0000,C


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [9]:
x_train_encoded = pd.get_dummies(X, columns=['Pclass','Sex', 'Embarked'], drop_first=True).astype(int)
# X = X.drop(['Pclass','Sex', 'Embarked'], axis=1)
# X = pd.concat([X, x_train_encoded], axis=1)

In [10]:
print("X_train_encoded:\n", x_train_encoded)

X_train_encoded:
      Age  SibSp  Parch  Fare  Pclass_2  Pclass_3  Sex_male  Embarked_Q  \
0     22      1      0     7         0         1         1           0   
1     38      1      0    71         0         0         0           0   
2     26      0      0     7         0         1         0           0   
3     35      1      0    53         0         0         0           0   
4     35      0      0     8         0         1         1           0   
..   ...    ...    ...   ...       ...       ...       ...         ...   
886   27      0      0    13         1         0         1           0   
887   19      0      0    30         0         0         0           0   
888   28      1      2    23         0         1         0           0   
889   26      0      0    30         0         0         1           0   
890   32      0      0     7         0         1         1           1   

     Embarked_S  
0             1  
1             0  
2             1  
3             1  
4  

In [None]:
scaler = StandardScaler() # Choose an appropriate scaler
numerical_cols = x_train_encoded.select_dtypes(include=['number']).columns # Select numerical features for scaling
x_train_encoded[numerical_cols] = scaler.fit_transform(x_train_encoded[numerical_cols])

# Logistic Regression with L1 regularization
model = LogisticRegression(penalty='elasticnet', solver='saga', C=1.0, l1_ratio=0.5, random_state = 42)  # Adjust C (inverse of regularization strength)
                                                     # liblinear is a good solver for L1 penalty

# Train the model
model.fit(x_train_encoded, y)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores_stratified = cross_val_score(model, x_train_encoded, y, cv=skf, scoring='f1') # 'f1' is often a good choice for imbalanced data
print("Stratified Cross-validation scores:", scores_stratified)

Stratified Cross-validation scores: [0.73542601 0.70192308 0.75221239]


In [13]:
x_score = df_test[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]


x_score_encoded = pd.get_dummies(x_score, columns=['Pclass','Sex', 'Embarked'], drop_first=True).astype(int)

# Align columns: add missing columns in x_score with NaN
x_score_encoded = x_score_encoded.reindex(columns=x_train_encoded.columns, fill_value=0)

x_score_encoded[numerical_cols] = scaler.transform(x_score_encoded[numerical_cols]) # Scale test data using the same scaler

# x_score = x_score.drop(['Pclass','Sex', 'Embarked'], axis=1)
# x_score = pd.concat([x_score, x_score_encoded], axis=1)

In [None]:
y_pred = model.predict(x_score_encoded)

In [15]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [16]:
submission = pd.Series(y_pred, index=df_test['PassengerId'], name='Survived').reset_index()

In [17]:
submission.to_csv('pred_result/submission.csv', index=False)

In [None]:
submission