# K-Fold Cross Validation

### Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

### Load the Titanic dataset from your local PC

In [3]:
titanic_data = pd.read_csv('titanic-2.csv')

### Handling Missing Values

In [4]:
# For simplicity, we'll fill missing age values with the mean age and 
# missing embarked values with the most frequent port.

In [5]:
# Fill missing ages with the mean age

age_imputer = SimpleImputer(strategy='mean')
titanic_data['Age'] = age_imputer.fit_transform(titanic_data[['Age']])

In [6]:
# Convert 'Sex' and 'Embarked' to numerical values

label_encoder = LabelEncoder()
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = label_encoder.fit_transform(titanic_data['Embarked'])


In [7]:
# Fill missing embarked values with the most frequent port

embarked_imputer = SimpleImputer(strategy='most_frequent')
titanic_data['Embarked'] = embarked_imputer.fit_transform(titanic_data[['Embarked']])

In [8]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns

titanic_data = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

In [9]:
# Define features (X) and target (y)

X = titanic_data.drop('Survived', axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data['Survived']


In [10]:
# Create a Decision Tree classifier

clf = DecisionTreeClassifier(random_state=42)


In [11]:
# Perform K-Fold Cross-Validation (e.g., K=5)

k = 5
cv_scores = cross_val_score(clf, X, y, cv=k)

### Interpretation of Results

In [12]:
print(f"Cross-Validation Scores (K={k}): {cv_scores}")
print(f"Mean Accuracy: {cv_scores.mean():.2f}")
print(f"Standard Deviation: {cv_scores.std():.2f}")

Cross-Validation Scores (K=5): [0.61452514 0.7752809  0.82022472 0.75842697 0.83707865]
Mean Accuracy: 0.76
Standard Deviation: 0.08
