In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the CSV files under
# /content/drive can be read by the cells below.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Read the training CSV from the mounted Drive folder into a DataFrame.
train_csv_path = '/content/drive/MyDrive/titanic/train.csv'
train_data = pd.read_csv(train_csv_path)

In [None]:
# Preprocessing: select the modelling columns and encode Sex numerically
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

# Take an explicit copy of the selected columns so the column assignment below
# writes to an independent frame instead of a slice of the loaded data; the
# assignment on the un-copied subset is what raised SettingWithCopyWarning.
train_data = train_data[features + [target]].copy()

# Encode Sex as female -> 0, male -> 1. The mapping is skipped when the column
# is already numeric, so re-running this cell does not map the 0/1 codes
# (which are not keys of the dict) to NaN.
if not pd.api.types.is_numeric_dtype(train_data['Sex']):
    train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})

# Age: fill missing entries using the 'mean' strategy.
age_imputer = SimpleImputer(strategy='mean')

# Embarked: one-hot encode, with handle_unknown='ignore' so categories that
# were not present when fitting do not raise at transform time.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Route Age and Embarked to their transformers; every other input column is
# passed through untouched (remainder='passthrough').
column_transformers = [
    ('age_imputer', age_imputer, ['Age']),
    ('one_hot_encoder', one_hot_encoder, ['Embarked']),
]
preprocessor = ColumnTransformer(
    transformers=column_transformers,
    remainder='passthrough',
)

# Separate inputs from the label, then hold out 20% of the rows for validation.
# random_state is fixed so the split is the same on every run.
X = train_data.loc[:, features]
y = train_data.loc[:, target]
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Random forest classifier with a fixed seed for repeatable fits.
model = RandomForestClassifier(random_state=42)

# Chain preprocessing and the classifier so both are fitted together and the
# same transformations are applied at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
pipeline = Pipeline(steps=pipeline_steps)

# Hyperparameter tuning using grid search.
# Keys use the '<step>__<param>' form to address the 'model' step of the pipeline.
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4]
}

# 5-fold stratified cross-validation on the training split.
stratified_kfold = StratifiedKFold(n_splits=5)

# The grid has 3 * 3 * 3 * 3 = 81 candidates, each fitted on 5 folds (405 fits).
# n_jobs=-1 runs those fits in parallel on all available processors; the model
# and the split are seeded, so the selected parameters are unaffected.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=stratified_kfold,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Pull the winning estimator and its hyperparameters out of the finished search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Score the selected model on the held-out validation rows.
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'Validation Accuracy: {accuracy}')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Sex'] = train_data['Sex'].map({'female': 0, 'male': 1})


Validation Accuracy: 0.8268156424581006


In [None]:
# Read the test CSV from the mounted Drive folder into a DataFrame.
test_csv_path = '/content/drive/MyDrive/titanic/test.csv'
test_data = pd.read_csv(test_csv_path)

In [None]:
# Preprocessing: encode Sex and fill missing numeric values in the test set.
# The mapping is skipped when Sex is already numeric, so re-running this cell
# does not map the 0/1 codes (which are not keys of the dict) to NaN.
if not pd.api.types.is_numeric_dtype(test_data['Sex']):
    test_data['Sex'] = test_data['Sex'].map({'female': 0, 'male': 1})

# Keep only the model's input columns, as an independent copy.
test_data = test_data[features].copy()

# Fill gaps in the numeric columns with that column's mean. numeric_only=True
# keeps the string column 'Embarked' out of the mean computation; without it
# pandas warns on older versions and raises TypeError on 2.0 and later.
# NOTE(review): Age gaps are filled here with the test-set mean, so the
# pipeline's own Age imputer has nothing left to fill at prediction time --
# confirm that this is intended.
test_data = test_data.fillna(test_data.mean(numeric_only=True))

# Make predictions on the test set using the best model
test_predictions = best_model.predict(test_data)

  test_data = test_data[features].fillna(test_data[features].mean())


In [None]:
# Build the submission by loading the sample submission file and overwriting
# its 'Survived' column with the model's predictions, then write it to disk.
# Assumes the sample file's rows are in the same order as test.csv -- TODO confirm.
template_path = '/content/drive/MyDrive/titanic/gender_submission.csv'
submission = pd.read_csv(template_path).assign(Survived=test_predictions)
submission.to_csv('submission5.csv', index=False)