Binisha Shakya (2407792)

1. Implement Classification Models:

• Train a Decision Tree Classifier and a Random Forest Classifier using scikit-learn.

• Compare the models based on their F1 scores.

In [None]:
# Import necessary libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report

# Fetch the Wine dataset and separate the feature matrix from the class labels
data = load_wine()
X = data.data
y = data.target

# Hold out 30% of the samples for evaluation; stratify on the labels so the
# class proportions are preserved in both partitions
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Build both classifiers up front with a fixed seed so the comparison is reproducible
dt_model = DecisionTreeClassifier(random_state=42)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit each classifier on the same training partition
for classifier in (dt_model, rf_model):
    classifier.fit(X_train, y_train)

# Label the held-out samples with each fitted model
dt_predictions = dt_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Weighted F1: per-class F1 averaged with each class weighted by its support
dt_f1 = f1_score(y_test, dt_predictions, average='weighted')
rf_f1 = f1_score(y_test, rf_predictions, average='weighted')

# Headline comparison of the two models
print("Decision Tree Classifier F1 Score:", dt_f1)
print("Random Forest Classifier F1 Score:", rf_f1)

# Per-class precision / recall / F1 breakdown for each model
dt_report = classification_report(y_test, dt_predictions)
rf_report = classification_report(y_test, rf_predictions)
print("\nDecision Tree Classification Report:\n", dt_report)
print("Random Forest Classification Report:\n", rf_report)


Decision Tree Classifier F1 Score: 0.9632208787381201
Random Forest Classifier F1 Score: 1.0

Decision Tree Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        18
           1       0.91      1.00      0.95        21
           2       1.00      0.93      0.97        15

    accuracy                           0.96        54
   macro avg       0.97      0.96      0.96        54
weighted avg       0.97      0.96      0.96        54

Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        15

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



2. Hyperparameter Tuning:

• Identify three hyperparameters of the Random Forest Classifier.

• Perform hyperparameter tuning using GridSearchCV to optimize these parameters.

• Take hints from the scikit-learn documentation to guide the implementation.

In [None]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.datasets import load_wine
from sklearn.metrics import f1_score, classification_report

# Reload the Wine dataset so this cell is self-contained
data = load_wine()
X = data.data
y = data.target

# Stratified 70/30 train/test partition with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Candidate values for the three Random Forest hyperparameters being tuned:
#   n_estimators      - number of trees in the forest
#   max_depth         - maximum depth of each tree (None = no limit)
#   min_samples_split - minimum samples required to split an internal node
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
}

# Base estimator that the search clones for every parameter combination
rf_model = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid, scored by weighted F1 with 5-fold
# cross-validation, using all available cores
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Winning parameter combination and the model refit with it on the training data
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Score the tuned model on the held-out test partition
y_pred = best_rf_model.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')

# Report the outcome of the search
print("Best Parameters:", best_params)
print("F1 Score of Optimized Model:", f1)
test_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", test_report)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}
F1 Score of Optimized Model: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        18
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        15

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54



3. Implement Regression Model:

• Train a Decision Tree Regressor and a Random Forest Regressor using scikit-learn.

• Identify three parameters for the Random Forest Regressor and perform hyperparameter tuning using RandomizedSearchCV to optimize these parameters.

In [None]:
# Import necessary libraries
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Fetch the Diabetes dataset (swap in another dataset here if desired)
data = load_diabetes()
X = data.data
y = data.target

# Hold out 30% of the samples for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Baseline regressors with default hyperparameters and a fixed seed
dt_model = DecisionTreeRegressor(random_state=42)
rf_model = RandomForestRegressor(random_state=42)

# Fit both baselines on the same training partition
for regressor in (dt_model, rf_model):
    regressor.fit(X_train, y_train)

# Predict the held-out targets with each baseline
dt_predictions = dt_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Mean squared error of each baseline on the test partition
dt_mse = mean_squared_error(y_test, dt_predictions)
rf_mse = mean_squared_error(y_test, rf_predictions)

print("Decision Tree Regressor MSE:", dt_mse)
print("Random Forest Regressor MSE:", rf_mse)

# Candidate values for the three Random Forest hyperparameters being tuned:
#   n_estimators      - number of trees in the forest
#   max_depth         - maximum depth of each tree (None = no limit)
#   min_samples_split - minimum samples required to split an internal node
param_distributions = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [None, 10, 20, 30, 40],
    'min_samples_split': [2, 5, 10, 15, 20],
}

# Randomized search: try 50 sampled combinations, each scored by negative MSE
# with 5-fold cross-validation, using all available cores
random_search = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_distributions=param_distributions,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Winning parameter combination and the model refit with it on the training data
best_params = random_search.best_params_
best_rf_model = random_search.best_estimator_

# Evaluate the tuned model on the held-out test partition
optimized_predictions = best_rf_model.predict(X_test)
optimized_mse = mean_squared_error(y_test, optimized_predictions)
optimized_r2 = r2_score(y_test, optimized_predictions)

print("\nBest Parameters from RandomizedSearchCV:", best_params)
print("Optimized Random Forest Regressor MSE:", optimized_mse)
print("Optimized Random Forest Regressor R^2 Score:", optimized_r2)


Decision Tree Regressor MSE: 5697.789473684211
Random Forest Regressor MSE: 2859.641982706767

Best Parameters from RandomizedSearchCV: {'n_estimators': 100, 'min_samples_split': 20, 'max_depth': 30}
Optimized Random Forest Regressor MSE: 2744.6365788651296
Optimized Random Forest Regressor R^2 Score: 0.49157463784423416
