In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:

# Load the train data from the file
train_data = pd.read_csv('/Users/lok/titanic/data/raw/train.csv')

# Display the first few rows of the train dataset
train_data.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Load the training data
train_data = pd.read_csv('/Users/lok/titanic/data/raw/train.csv')

# Drop columns that are less likely to be useful
train_data_cleaned = train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Impute missing values for 'Age' with mean
imputer = SimpleImputer(strategy='mean')
train_data_cleaned['Age'] = imputer.fit_transform(train_data_cleaned[['Age']])

# Impute missing values for 'Embarked' with the most frequent value
imputer_mode = SimpleImputer(strategy='most_frequent')
train_data_cleaned['Embarked'] = imputer_mode.fit_transform(train_data_cleaned[['Embarked']]).ravel()

# Encode categorical variables 'Sex' and 'Embarked'
encoder_sex = LabelEncoder()
encoder_embarked = LabelEncoder()
train_data_cleaned['Sex'] = encoder_sex.fit_transform(train_data_cleaned['Sex'])
train_data_cleaned['Embarked'] = encoder_embarked.fit_transform(train_data_cleaned['Embarked'])

# Split the data into features and target variable
X = train_data_cleaned.drop('Survived', axis=1)
y = train_data_cleaned['Survived']

# Split the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the first few entries to confirm the transformations
print(train_data_cleaned.head())


   PassengerId  Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0            1         0       3    1  22.0      1      0   7.2500         2
1            2         1       1    0  38.0      1      0  71.2833         0
2            3         1       3    0  26.0      0      0   7.9250         2
3            4         1       1    0  35.0      1      0  53.1000         2
4            5         0       3    1  35.0      0      0   8.0500         2


In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize the models
logreg = LogisticRegression(max_iter=200)
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier(n_estimators=100)
gradient_boosting = GradientBoostingClassifier()
svm = SVC()

# List of models
models = [
    ("Logistic Regression", logreg),
    ("Decision Tree", decision_tree),
    ("Random Forest", random_forest),
    ("Gradient Boosting", gradient_boosting),
    ("SVM", svm)
]

# Train and evaluate each model
results = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_pred)
    results.append((name, accuracy))

results.sort(key=lambda x: x[1], reverse=True)  # Sort by accuracy
results


[('Random Forest', 0.8268156424581006),
 ('Gradient Boosting', 0.8268156424581006),
 ('Logistic Regression', 0.8100558659217877),
 ('Decision Tree', 0.7430167597765364),
 ('SVM', 0.5977653631284916)]

In [10]:
test_data = pd.read_csv('/Users/lok/titanic/data/raw/test.csv')

# Drop columns that are less likely to be useful
test_data_cleaned = test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

# Impute missing values for 'Age' with the mean (use the same imputer as for the training data)
test_data_cleaned['Age'] = imputer.transform(test_data_cleaned[['Age']])

# If 'Fare' has missing values, impute with the mean (assuming the imputer is fitted to the training data)
test_data_cleaned['Fare'] = test_data_cleaned['Fare'].fillna(test_data_cleaned['Fare'].mean())

# Impute missing values for 'Embarked' with the most frequent value (use the same mode imputer)
test_data_cleaned['Embarked'] = imputer_mode.transform(test_data_cleaned[['Embarked']]).ravel()

# Encode categorical variables 'Sex' and 'Embarked' using the same encoders
test_data_cleaned['Sex'] = encoder_sex.transform(test_data_cleaned['Sex'])
test_data_cleaned['Embarked'] = encoder_embarked.transform(test_data_cleaned['Embarked'])


test_data_cleaned.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,1
1,893,3,0,47.0,1,0,7.0,2
2,894,2,1,62.0,0,0,9.6875,1
3,895,3,1,27.0,0,0,8.6625,2
4,896,3,0,22.0,1,1,12.2875,2


In [11]:
# # Feature Engineering: Extracting Titles from names
# # Creating Family Size feature
# train_data_cleaned['FamilySize'] = train_data['SibSp'] + train_data['Parch'] + 1

# # Prepare the feature matrix and target vector
# X = train_data_cleaned.drop('Survived', axis=1)
# y = train_data_cleaned['Survived']

# # Split the data into training and validation sets
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# # Re-train the Gradient Boosting model with new features
# gradient_boosting.fit(X_train, y_train)
# y_pred = gradient_boosting.predict(X_valid)
# new_accuracy = accuracy_score(y_valid, y_pred)

# new_accuracy


In [12]:
# Predict using the Gradient Boosting model
test_predictions = gradient_boosting.predict(test_data_cleaned)

# Create a DataFrame with the results
submission_df = pd.DataFrame({
    "PassengerId": test_data_cleaned['PassengerId'],
    "Survived": test_predictions
})

# Save the DataFrame to a CSV file
submission_path = '/Users/lok/titanic/data/results.csv'
submission_df.to_csv(submission_path, index=False)

submission_df.head(), submission_path


(   PassengerId  Survived
 0          892         0
 1          893         0
 2          894         0
 3          895         0
 4          896         0,
 '/Users/lok/titanic/data/results.csv')

In [8]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=param_grid, 
                           cv=3, n_jobs=-1, scoring='accuracy', verbose=1)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 3 folds for each of 72 candidates, totalling 216 fits


({'learning_rate': 0.05,
  'max_depth': 3,
  'min_samples_leaf': 1,
  'min_samples_split': 4,
  'n_estimators': 100},
 0.8189919242550822)

In [9]:
# Define a smaller parameter grid for Gradient Boosting to manage execution time
small_param_grid = {
    'n_estimators': [100, 150],
    'learning_rate': [0.1, 0.15],
    'max_depth': [3, 4],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

# Initialize the GridSearchCV object with a smaller grid
grid_search = GridSearchCV(estimator=GradientBoostingClassifier(), param_grid=small_param_grid, 
                           cv=3, n_jobs=-1, scoring='accuracy', verbose=1)

# Fit GridSearchCV again with the smaller grid
grid_search.fit(X_train, y_train)

# Best parameters and best score from the smaller grid search
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score


Fitting 3 folds for each of 8 candidates, totalling 24 fits


({'learning_rate': 0.15,
  'max_depth': 3,
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 100},
 0.8102014294996751)