In [6]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import pandas as pd

In [2]:
# Read the pre-processed train / eval / test splits from disk.
# The paths are consumed in order, so each target name lines up with its CSV.
processed_df, processed_eval, processed_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/processed_train.csv',
        '../data/processed_eval.csv',
        '../data/processed_test.csv',
    )
)

In [3]:
# For each split, pull out the text column used as model input (X_*)
# and the target column (y_*), keeping each feature/label pair on one line.
X_train, y_train = processed_df['combined_text'], processed_df['label']
X_eval, y_eval = processed_eval['combined_text'], processed_eval['label']
X_test, y_test = processed_test['combined_text'], processed_test['label']

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report

# Tunable settings for this experiment, gathered in one place.
SEED = 42          # random_state passed to the Random Forest
SELECT_K = 10000   # number of top chi2-scored features kept by SelectKBest
CV_FOLDS = 5       # cross-validation folds used by the grid search

# Define the pipeline: TF-IDF (unigrams + bigrams) -> chi2 feature selection -> Random Forest
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2), sublinear_tf=True)),
    # NOTE(review): SELECT_K equals the smallest tfidf__max_features value in the
    # grid below, so for those candidates this step cannot drop any feature —
    # confirm that is intended (or tune select__k alongside max_features).
    ('select', SelectKBest(chi2, k=SELECT_K)),  # Feature selection
    ('clf', RandomForestClassifier(random_state=SEED, class_weight='balanced'))
])

# Parameter grid for Random Forest (2 * 2 * 3 * 2 = 24 candidates)
param_grid = {
    'tfidf__max_features': [10000, 20000],
    'clf__n_estimators': [100, 200],  # Number of trees
    'clf__max_depth': [None, 10, 20],  # Depth of each tree
    'clf__min_samples_split': [2, 5],  # Minimum samples to split a node
}

# Perform GridSearchCV.
# verbose=1 instead of 2: level 2 prints a "[CV] END ..." line for every single fit
# (candidates x folds), which buries the results below under a wall of log output.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=CV_FOLDS,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Evaluate the best model (the pipeline refit with the best parameter combination)
# on the held-out evaluation split.
best_model = grid_search.best_estimator_
y_eval_pred = best_model.predict(X_eval)

# Output evaluation results
print("Best Parameters:", grid_search.best_params_)
print("Evaluation Accuracy:", accuracy_score(y_eval, y_eval_pred))
print("\nClassification Report:\n", classification_report(y_eval, y_eval_pred))


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=20000; total time=   9.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=20000; total time=   9.6s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=20000; total time=   9.7s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=20000; total time=   9.7s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=20000; total time=   9.6s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=10000; total time=  12.6s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100, tfidf__max_features=10000; total time=  12.9s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_