<a href="https://colab.research.google.com/github/yuko-ogura/titanic-kaggle/blob/main/20250906_Titanic_Kaggle_v3_27.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from google.colab import drive
from google.colab import files

# --- Mount Google Drive ---
# The CSV files are read from Google Drive, so it has to be mounted first.
drive.mount('/content/drive')

# --- Load data ---
# Single place to change when the project folder moves.
DATA_DIR = "/content/drive/My Drive/202509 Kaggle Titanic Project"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# --- Fill missing values simply ---
# The fill values are taken from the training split only and computed once,
# before any filling, so both splits receive exactly the same numbers.
age_mean = train['Age'].mean()
fare_mean = train['Fare'].mean()

train['Age'] = train['Age'].fillna(age_mean)
train['Embarked'] = train['Embarked'].fillna('S')
train['Fare'] = train['Fare'].fillna(fare_mean)

test['Age'] = test['Age'].fillna(age_mean)
test['Fare'] = test['Fare'].fillna(fare_mean)
test['Embarked'] = test['Embarked'].fillna('S')

# --- Combine train and test to create consistent features ---
# ignore_index gives the stacked frame one continuous RangeIndex.
dataset = pd.concat([train, test], ignore_index=True)

# --- Create Deck feature ---
# Deck is the first character of the cabin code; a missing cabin becomes 'U'.
# Missing values are filled *before* taking the first character, so the result
# does not depend on how NaN happens to be rendered as a string.
dataset['Deck'] = dataset['Cabin'].fillna('U').astype(str).str[0]

# --- Create TicketGroup feature ---
# Number of rows (train + test) that share each ticket value.
Ticket_Count = dataset['Ticket'].value_counts().to_dict()
dataset['TicketGroup'] = dataset['Ticket'].map(Ticket_Count)

def Ticket_Label(s):
    """Map a ticket-sharing count to a coarse group label.

    Returns 2 for counts from 2 to 4 inclusive, 1 for a count of exactly 1
    or for counts above 4 up to 8 inclusive, and 0 for counts above 8.
    Any other value (for example 0 or 1.5) matches no rule and yields None.
    """
    if s == 1 or 4 < s <= 8:
        return 1
    if 2 <= s <= 4:
        return 2
    if s > 8:
        return 0
    # No rule matched: keep the implicit None result explicit.
    return None

# --- Bucket the raw ticket counts into the coarse TicketGroup label ---
dataset['TicketGroup'] = dataset['TicketGroup'].map(Ticket_Label)

# --- Split back into train and test ---
# Rows that carry a Survived value form the training split; the rest are
# the rows to predict.
is_labelled = dataset['Survived'].notna()
train = dataset[is_labelled]
test = dataset[~is_labelled]

# --- Select features ---
features = [
    'Pclass',
    'Sex',
    'Age',
    'Fare',
    'Embarked',
    'Deck',
    'TicketGroup',
]

# One-hot encode both splits, then force the test matrix onto the training
# columns: categories unseen in test become all-zero columns and categories
# unseen in train are dropped.
X = pd.get_dummies(train[features])
y = train['Survived']
X_test = pd.get_dummies(test[features]).reindex(columns=X.columns, fill_value=0)

# --- Model setup ---
clf = RandomForestClassifier(max_features='sqrt', random_state=10)
pipe = Pipeline(steps=[('classify', clf)])

# Grid: 20..29 trees crossed with depths 3..9, addressed through the
# pipeline step name.
param_test = {
    'classify__n_estimators': list(range(20, 30)),
    'classify__max_depth': list(range(3, 10)),
}

grid = GridSearchCV(pipe, param_test, scoring='accuracy', cv=10)
grid.fit(X, y)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# --- Predict with best model ---
predictions = grid.predict(X_test)

# Two-column frame: passenger ids next to integer survival labels.
submission = test[['PassengerId']].assign(Survived=predictions.astype(int))
submission.to_csv("submission.csv", index=False)

# Download
files.download("submission.csv")


Mounted at /content/drive
Best params: {'classify__max_depth': 6, 'classify__n_estimators': 21}
Best CV score: 0.840661672908864


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [2]:
# Install git only when the runtime does not already provide it; the
# unconditional install was a no-op here ("git is already the newest version").
# -qq keeps apt's progress chatter out of the saved notebook output.
!command -v git > /dev/null 2>&1 || apt-get install -y -qq git

# Configure git (do this once per Colab runtime)
# NOTE(review): the e-mail address below is saved in plain text inside the
# notebook; if the notebook is shared publicly, consider switching to a
# GitHub-provided noreply address - confirm with the author.
!git config --global user.name "yuko-ogura"
!git config --global user.email "bananamuffinz31@gmail.com"


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git is already the newest version (1:2.34.1-1ubuntu1.15).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [4]:
# Navigate to content folder so the checkout lands in a predictable place
%cd /content

# Clone the repo, but only when no checkout exists yet: `git clone` refuses to
# write into an existing non-empty directory, so an unguarded clone fails when
# this cell is re-run in the same runtime.
# NOTE(review): the recorded run failed with "could not read Username", i.e.
# git asked for credentials. Presumably the repository is not anonymously
# readable - confirm. If it needs a token, read it at run time (for example
# with getpass) instead of writing it into the notebook.
![ -d titanic-kaggle/.git ] || git clone https://github.com/yuko-ogura/titanic-kaggle.git

# Move into it (this fails with "No such file or directory" if the clone above
# did not succeed)
%cd titanic-kaggle


/content
Cloning into 'titanic-kaggle'...
fatal: could not read Username for 'https://github.com': No such device or address
[Errno 2] No such file or directory: 'titanic-kaggle'
/content
