# Step 1: Create a Sample Dataset
Create a synthetic dataset with some categorical and numerical columns and save it as a CSV file.

In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so every run generates the same synthetic data
np.random.seed(42)

# Number of rows in the synthetic dataset
n_rows = 100

# Build the columns one by one. The order of the np.random calls matters:
# each call advances the shared RNG state, so reordering them would change
# the generated values.
data = {}
data['Category'] = np.random.choice(['A', 'B', 'C', 'D'], size=n_rows)
data['Subcategory'] = np.random.choice(['X', 'Y', 'Z'], size=n_rows)
data['NumericalFeature1'] = np.random.randn(n_rows)       # standard normal draws
data['NumericalFeature2'] = np.random.rand(n_rows) * 100  # uniform draws scaled to [0, 100)
data['Target'] = np.random.choice([0, 1], size=n_rows)    # binary label drawn independently of the features

# Assemble the columns into a DataFrame
df = pd.DataFrame(data)

# Write the dataset to disk without the row index
df.to_csv('sample_dataset.csv', index=False)

# Preview the first rows
print(df.head())


# Step 2: Load and Explore the Dataset
Now, assuming you have uploaded sample_dataset.csv to your Kaggle environment, let's load and explore the dataset.


In [None]:
# Import necessary libraries
from pathlib import Path

import pandas as pd

# Candidate locations for the dataset, in priority order: the Kaggle input
# mount first, then the copy that Step 1 writes to the working directory.
dataset_candidates = [
    Path('/kaggle/input/sample-dataset-csv/sample_dataset.csv'),
    Path('sample_dataset.csv'),
]

# Use the first candidate that exists. If none exists, keep the Kaggle path so
# that read_csv raises FileNotFoundError naming the location the notebook expects.
dataset_path = next(
    (candidate for candidate in dataset_candidates if candidate.is_file()),
    dataset_candidates[0],
)

# Load dataset
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print(data.head())

# Identify categorical columns
categorical_columns = ['Category', 'Subcategory']

# Display unique values and their counts for each categorical column
for col in categorical_columns:
    print(f"Column: {col}")
    print(data[col].value_counts())
    print("\n")


# Step 3: Preprocessing - One-Hot Encoding and Label Encoding
After loading and exploring the dataset, the next step involves preprocessing the categorical variables using one-hot encoding and label encoding.

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Categorical columns that are encoded with both strategies
columns_to_encode = ['Category', 'Subcategory']

# One-hot encoding: sparse_output=False makes fit_transform return a dense
# array, which can be wrapped directly in a DataFrame below.
onehot_encoder = OneHotEncoder(sparse_output=False)  # Adjusted argument name
onehot_encoded = onehot_encoder.fit_transform(data[columns_to_encode])

# Create a DataFrame with one-hot encoded columns. Reusing data's index keeps
# the rows aligned with `data` even if its index is not the default 0..n-1.
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(columns_to_encode),
    index=data.index,
)
print("One-Hot Encoded DataFrame:")
print(onehot_encoded_df.head())

# Label encoding: fit a separate encoder per column and keep each one, so the
# integer codes of every column can still be mapped back to their original
# labels. Refitting a single shared encoder would overwrite the mapping
# learned for the first column.
label_encoders = {}
for column in columns_to_encode:
    label_encoder = LabelEncoder()
    data[f'{column}_LabelEncoded'] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

print("\nLabel Encoded Columns:")
print(data[['Category', 'Category_LabelEncoded', 'Subcategory', 'Subcategory_LabelEncoded']].head())


# Explanation:
**OneHotEncoder**: 
The `sparse` argument was renamed to `sparse_output` in scikit-learn 1.2, so `sparse_output=False` is used here to obtain a dense array on current versions of sklearn.
**get_feature_names_out**: 
Ensures that the column names are correctly extracted after one-hot encoding.

# Step 4: Building and Evaluating Decision Tree Models
In this step, we will:

* Split the data into training and testing sets.
* Train decision tree models using one-hot encoded data and label encoded data.
* Evaluate and compare the models' performance.

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Target shared by both models, and one feature matrix per encoding strategy.
# NOTE(review): X_onehot holds only the one-hot encoded categorical columns,
# while X_label also contains the two numerical features, so the two models
# compared below are not trained on the same information.
y = data['Target']
X_onehot = onehot_encoded_df
X_label = data[['Category_LabelEncoded', 'Subcategory_LabelEncoded', 'NumericalFeature1', 'NumericalFeature2']]

# One call splits both feature matrices and the target using the same row
# partition (30% of the rows held out for testing, fixed seed).
(
    X_train_onehot, X_test_onehot,
    X_train_label, X_test_label,
    y_train, y_test,
) = train_test_split(X_onehot, X_label, y, test_size=0.3, random_state=42)

# Fit one seeded decision tree per encoding (fit returns the fitted estimator)
clf_onehot = DecisionTreeClassifier(random_state=42).fit(X_train_onehot, y_train)
clf_label = DecisionTreeClassifier(random_state=42).fit(X_train_label, y_train)

# Predict on the held-out rows and score each model against the true labels
predictions_onehot = clf_onehot.predict(X_test_onehot)
accuracy_onehot = accuracy_score(y_test, predictions_onehot)

predictions_label = clf_label.predict(X_test_label)
accuracy_label = accuracy_score(y_test, predictions_label)

print(f'Accuracy with One-Hot Encoding: {accuracy_onehot:.4f}')
print(f'Accuracy with Label Encoding: {accuracy_label:.4f}')


# Explanation:
**Splitting the Data**: The data is split into training and testing sets using train_test_split.
**Training the Models**: 
Two decision tree models are trained, one on the one-hot encoded data and the other on the label encoded data.
**Making Predictions**: 
The models make predictions on the testing set.
**Evaluating the Models**: The accuracy of each model is calculated and displayed.

# Step 5: Model Tuning and Further Evaluation

To potentially improve model performance, we can:

* Tune the hyperparameters of the decision tree models.
* Evaluate the models using cross-validation.
* Analyze feature importance to understand the impact of categorical features.

# 5.1 Hyperparameter Tuning
We can use GridSearchCV to find the best hyperparameters for our decision tree models.

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to try; the grid search evaluates every combination
param_grid = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


def _new_tree_search():
    """Return an unfitted 5-fold grid search over ``param_grid`` for a seeded decision tree."""
    return GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, n_jobs=-1)


# Run the search once per encoding, on the training split only
# (fit returns the fitted search object)
grid_search_onehot = _new_tree_search().fit(X_train_onehot, y_train)
grid_search_label = _new_tree_search().fit(X_train_label, y_train)

# Best estimator found by each search
best_onehot = grid_search_onehot.best_estimator_
best_label = grid_search_label.best_estimator_

# Score the tuned models on the held-out test split
predictions_best_onehot = best_onehot.predict(X_test_onehot)
accuracy_best_onehot = accuracy_score(y_test, predictions_best_onehot)

predictions_best_label = best_label.predict(X_test_label)
accuracy_best_label = accuracy_score(y_test, predictions_best_label)

print(f'Best Accuracy with One-Hot Encoding: {accuracy_best_onehot:.4f}')
print(f'Best Accuracy with Label Encoding: {accuracy_best_label:.4f}')


# 5.2 Cross-Validation
To get a more robust estimate of model performance, use cross-validation.

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of each tuned tree on the full feature matrix.
# NOTE(review): the hyperparameters of best_onehot / best_label were selected
# on the training split, which is part of the data used here, so these scores
# may be slightly optimistic.
cv_scores_onehot = cross_val_score(best_onehot, X_onehot, y, cv=5)
cv_scores_label = cross_val_score(best_label, X_label, y, cv=5)

# Average the per-fold accuracies
cv_mean_onehot, cv_mean_label = cv_scores_onehot.mean(), cv_scores_label.mean()

print(
    f'Cross-Validation Accuracy with One-Hot Encoding: {cv_mean_onehot:.4f}',
    f'Cross-Validation Accuracy with Label Encoding: {cv_mean_label:.4f}',
    sep='\n',
)


# 5.3 Feature Importance Analysis
Understanding which features are most important can provide insights into the data and model behavior.

In [None]:
import matplotlib.pyplot as plt

def _show_importances(names, importances, title):
    """Draw one horizontal bar chart of feature importances and display it."""
    plt.figure(figsize=(10, 5))
    plt.barh(names, importances)
    plt.title(title)
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.show()


# One-hot model: the feature names come from the fitted encoder, because that
# model was trained on the encoded categorical columns only.
feature_importances_onehot = best_onehot.feature_importances_
feature_names_onehot = onehot_encoder.get_feature_names_out(['Category', 'Subcategory'])
_show_importances(
    feature_names_onehot,
    feature_importances_onehot,
    'Feature Importance (One-Hot Encoding)',
)

# Label-encoded model: names listed in the column order used to build X_label
feature_importances_label = best_label.feature_importances_
feature_names_label = ['Category_LabelEncoded', 'Subcategory_LabelEncoded', 'NumericalFeature1', 'NumericalFeature2']
_show_importances(
    feature_names_label,
    feature_importances_label,
    'Feature Importance (Label Encoding)',
)


In [None]:
from pathlib import Path

import matplotlib.pyplot as plt

# Save into the Kaggle working directory when it exists; otherwise fall back to
# the current directory so the cell also runs outside Kaggle.
output_dir = Path('/kaggle/working')
if not output_dir.is_dir():
    output_dir = Path.cwd()


def _plot_and_save_importances(names, importances, title, filename):
    """Draw a horizontal bar chart of feature importances, save it as a PNG, and display it.

    Args:
        names: Feature names, one per bar.
        importances: Importance values, in the same order as ``names``.
        title: Chart title.
        filename: File name of the PNG, created inside ``output_dir``.
    """
    plt.figure(figsize=(10, 5))
    plt.barh(names, importances)
    plt.title(title)
    plt.xlabel('Importance')
    plt.ylabel('Features')
    # Save before show(): showing the figure may release it, which would
    # leave nothing to write to the file.
    plt.savefig(output_dir / filename)
    plt.show()


# Feature importance for one-hot encoded data
feature_importances_onehot = best_onehot.feature_importances_
feature_names_onehot = onehot_encoder.get_feature_names_out(['Category', 'Subcategory'])  # Only pass the categorical features used for one-hot encoding
_plot_and_save_importances(
    feature_names_onehot,
    feature_importances_onehot,
    'Feature Importance (One-Hot Encoding)',
    'feature_importance_onehot.png',
)

# Feature importance for label encoded data
feature_importances_label = best_label.feature_importances_
feature_names_label = ['Category_LabelEncoded', 'Subcategory_LabelEncoded', 'NumericalFeature1', 'NumericalFeature2']
_plot_and_save_importances(
    feature_names_label,
    feature_importances_label,
    'Feature Importance (Label Encoding)',
    'feature_importance_label.png',
)


# Summary of Findings:
# Model Performance:

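* Initial accuracy with one-hot encoding: 0.3667
* Initial accuracy with label encoding: 0.4667
* Label encoding outperformed one-hot encoding on this dataset. Note that the comparison is not like-for-like: the one-hot model was trained on the encoded categorical columns only, while the label-encoded model also used NumericalFeature1 and NumericalFeature2. In addition, Target is generated independently of the features in Step 1, so the gap between the two accuracies is likely noise rather than a real effect of the encoding.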

**Next Steps:**

* Tuned hyperparameters using GridSearchCV.
* Best model cross-validated accuracy:
  * One-Hot Encoding: [insert the value printed in Section 5.2]
  * Label Encoding: [insert the value printed in Section 5.2]

**Feature Importance:**

* Visualized feature importance for both encoding strategies.
* Provided insights into the impact of features on the models.

# Conclusion:

**Encoding Strategy:**

* The choice of encoding strategy influences model performance.
* Label encoding performed better initially, but further tuning may improve one-hot encoding.

**Future Work:**

* Experiment with other models and advanced encoding techniques.
* Explore feature engineering and model interpretability.
