<a href="https://colab.research.google.com/github/thatswhatmeetcoded/Sentiment-Classification/blob/main/decision_tree/7_decision_tree_hyperparameter_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from scipy import sparse
import itertools

# Load data: mount Google Drive, which holds the pre-computed feature files.
from google.colab import drive
drive.mount('/content/drive')

# Locations and split settings gathered in one place so they are easy to tune.
FEATURES_DIR = '/content/drive/MyDrive/features'
TEST_SIZE = 0.2
SPLIT_SEED = 42

# Load labels
y = np.load(f'{FEATURES_DIR}/y.npy')

# Load features (sparse matrices). Both are split against the same labels
# below, so they are expected to be row-aligned with `y`.
X_tfidf = sparse.load_npz(f'{FEATURES_DIR}/X_tfidf.npz')
X_bow = sparse.load_npz(f'{FEATURES_DIR}/X_bow.npz')

# Train-test split: a single call over all three arrays, so both feature
# representations and the labels are partitioned with one shared set of row
# indices. Alignment then no longer depends on two separate calls repeating
# identical `test_size` / `random_state` arguments.
(
    X_train_tfidf, X_test_tfidf,
    X_train_bow, X_test_bow,
    y_train, y_test,
) = train_test_split(X_tfidf, X_bow, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

# Candidate values for each decision-tree hyperparameter; the loop below
# evaluates their full Cartesian product.
criterions = ['gini', 'entropy']
max_depths = [5, 10, 15, 20, None]
min_samples_splits = [2, 5, 10]
min_samples_leafs = [1, 2, 4]

# Keyword names, in the same order as the value lists passed to product().
param_names = ('criterion', 'max_depth', 'min_samples_split', 'min_samples_leaf')

# One record per hyperparameter combination.
results = []

# NOTE(review): both accuracies are measured on the held-out test split, so
# choosing the "best" row from this table tunes hyperparameters on test data.
# Consider cross-validating on the training split if a final model is selected
# from these results.
for combo in itertools.product(criterions, max_depths, min_samples_splits, min_samples_leafs):
    params = dict(zip(param_names, combo))
    clf = DecisionTreeClassifier(random_state=42, **params)

    # Bag-of-words features: fit, predict on the test rows, score.
    y_pred_bow = clf.fit(X_train_bow, y_train).predict(X_test_bow)
    acc_bow = accuracy_score(y_test, y_pred_bow)

    # TF-IDF features: the same estimator object is refit on the other matrix.
    y_pred_tfidf = clf.fit(X_train_tfidf, y_train).predict(X_test_tfidf)
    acc_tfidf = accuracy_score(y_test, y_pred_tfidf)

    # Record layout: the four hyperparameters first, then the two accuracies.
    results.append({**params, 'accuracy_bow': acc_bow, 'accuracy_tfidf': acc_tfidf})

import os

# Collect the per-combination records into a table.
results_df = pd.DataFrame(results)

# `max_depth` mixes integers with None (unlimited depth), which pandas would
# otherwise store as floats with NaN (5.0, NaN). The nullable integer dtype
# keeps depths as whole numbers and shows the unlimited case as <NA>.
if 'max_depth' in results_df.columns:
    results_df = results_df.astype({'max_depth': 'Int64'})

print(results_df)

# Create directory if it doesn't exist
results_dir = '/content/drive/MyDrive/hyperparam_results'
os.makedirs(results_dir, exist_ok=True)

# Save results to CSV (without the row index)
results_path = os.path.join(results_dir, 'decision_tree_comparison.csv')
results_df.to_csv(results_path, index=False)
print("Hyperparameter tuning complete! Results saved to:", results_dir)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
   criterion  max_depth  min_samples_split  min_samples_leaf  accuracy_bow  \
0       gini        5.0                  2                 1      0.519231   
1       gini        5.0                  2                 2      0.509615   
2       gini        5.0                  2                 4      0.519231   
3       gini        5.0                  5                 1      0.519231   
4       gini        5.0                  5                 2      0.509615   
..       ...        ...                ...               ...           ...   
85   entropy        NaN                  5                 2      0.663462   
86   entropy        NaN                  5                 4      0.615385   
87   entropy        NaN                 10                 1      0.653846   
88   entropy        NaN                 10                 2      0.653846   
89   entropy 