In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Load the labelled training set and the unlabelled test set from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Columns used as model inputs. Non-numeric columns among these
# (presumably 'Sex' and 'Embarked' -- confirm against the CSV dtypes)
# are one-hot encoded by pd.get_dummies below.
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Embarked']

X = pd.get_dummies(train_data[features])
y = train_data['Survived']
# Encode the test set, then force it onto exactly the training columns (same
# names, same order). A category level missing from the test file becomes an
# all-zero column; a level seen only in the test file is dropped, because the
# model has no feature for it. Without this alignment the fitted imputer,
# scaler and model could receive mismatched or reordered columns.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)
# NOTE(review): taken from the *training* frame and not used in this cell; the
# submission below reads test_data.PassengerId directly. Kept in case a later
# cell relies on this name.
PassengerId = train_data['PassengerId']

# Fill missing values with the per-column mean learned from the training data.
imputer = SimpleImputer(strategy='mean')

# NOTE(review): the imputer is fit on all training rows before the split below,
# so validation rows contribute to the imputation means. Move the fit after
# train_test_split if a strictly held-out estimate is required.
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
# Apply the statistics learned from the training data; re-fitting on the test
# set would impute test rows with different means than the model was trained on.
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# Hold out 20% of the training rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

# Standardise features using statistics from the training split only, then
# reuse the same fitted scaler for the validation and test data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X_train_scaled, y_train)

# Validation accuracy on the held-out split.
y_pred = model.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_pred)
print('Accuracy:', accuracy)

X_test_scaled = scaler.transform(X_test)
# predict() already returns class labels (not probabilities), so no
# thresholding is needed; for 0/1 labels the former `> 0.5` step was a no-op.
predictions = model.predict(X_test_scaled)

# Submission file: one row per test passenger with the predicted label.
output = pd.DataFrame({
    'PassengerId': test_data.PassengerId,
    'Survived': predictions
})

output.to_csv('prediction.csv', index=False)


Accuracy: 0.7653631284916201
