In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [2]:
# Import the libraries used for visualization, modelling and evaluation.
# All imports for this cell are grouped here so its dependencies are visible at a glance.
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# One seed shared by the train/validation split and the forest, so a re-run reproduces the results.
SEED = 1

# Load the Titanic training data.
titanic_data = pd.read_csv('/kaggle/input/titanic/train.csv')

# Select the features and the target variable.
# get_dummies one-hot encodes the non-numeric feature columns (presumably only "Sex")
# and passes numeric columns through unchanged.
features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(titanic_data[features])
y = titanic_data["Survived"]

# Hold out 20% of the rows as a validation set; the grid search below never sees them.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Base Random Forest; the grid search overrides the hyperparameters listed in param_grid.
model = RandomForestClassifier(random_state=SEED)

# Grid of hyperparameters to search (3 * 4 * 3 * 3 = 108 combinations).
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validated grid search on the training split only.
# n_jobs=-1 runs the 540 fits on all available cores; because the forest is seeded,
# the selected hyperparameters are the same as with a serial search.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the best hyperparameters found by the grid search.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# With the default refit=True, GridSearchCV has already refit a forest with the best
# hyperparameters (and the same seed) on all of X_train, so reuse it instead of
# training an identical model a second time.
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out validation set.
val_predictions = best_model.predict(X_val)
accuracy = accuracy_score(y_val, val_predictions)
print("Validation Set Accuracy:", accuracy)

# Predict on the competition test data.
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")
# Encoding the test frame on its own can yield a different set or order of dummy columns
# than the training frame; align to the training columns so the model receives exactly
# the features it was fitted on (columns absent from the test data are filled with 0).
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)
test_predictions = best_model.predict(X_test)

# Save the predictions in the submission format (PassengerId, Survived).
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': test_predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")


Best Hyperparameters: {'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Validation Set Accuracy: 0.7821229050279329
Your submission was successfully saved!


In [3]:
# Read the saved submission back from disk to confirm the file was written as expected.
df = pd.read_csv("submission.csv")

# Preview the first few rows; leaving the expression as the cell's last line lets the
# notebook render it with its rich table display instead of plain printed text.
df.head()

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1


In [4]:
# Show the current working directory.
working_dir = os.getcwd()
print(working_dir)

/kaggle/working
