In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
import tqdm as notebook_tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from datasets import load_dataset
import nltk

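# One-time setup: uncomment on a fresh environment to fetch the NLTK corpora
# needed by `stopwords` and `WordNetLemmatizer`.
#nltk.download('stopwords')
#nltk.download('wordnet')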

# Load the code-comment classification dataset by name. The training loop
# reads its per-language splits as ds['<lang>_train'] and ds['<lang>_test'].
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')

# Category names per language. The evaluation code pairs the i-th name with
# the i-th column of the label matrix, so the order inside each list matters.
labels = {
    'java': [
        'summary', 'Ownership', 'Expand', 'usage',
        'Pointer', 'deprecation', 'rational',
    ],
    'python': [
        'Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints', 'Example', 'Responsibilities',
        'Classreferences', 'Intent', 'Keymessages', 'Collaborators',
    ],
}

# Languages are processed in the order they are declared in `labels`.
langs = list(labels)

# Shared text-normalisation resources used by preprocess_data().
# A set is used so the per-token stop-word membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_data(data):
    """Build cleaned text features and the label matrix for one dataset split.

    For every record, the comment sentence and its class name are joined into
    a single string, split on whitespace, lemmatized token by token with the
    module-level ``lemmatizer``, and filtered against the module-level
    ``stop_words`` set (compared on the lowercased token).

    Args:
        data: Tabular data accepted by ``pd.DataFrame`` that provides the
            columns ``comment_sentence``, ``class`` and ``labels``.

    Returns:
        A ``(cleaned_features, target)`` tuple: a list holding one
        space-joined token string per record, and a NumPy array built from
        the ``labels`` column.
    """
    df = pd.DataFrame(data)

    def clean(sentence, class_name):
        # NOTE(review): tokens are lemmatized in their original case and only
        # lowercased for the stop-word test. If the lemmatizer is
        # case-sensitive, capitalized tokens may pass through unchanged --
        # confirm whether lowercasing first is intended.
        lemmas = (
            lemmatizer.lemmatize(token)
            for token in f"{sentence} {class_name}".split()
        )
        return ' '.join(
            lemma for lemma in lemmas if lemma.lower() not in stop_words
        )

    # Walk the two needed columns directly instead of DataFrame.iterrows(),
    # which materialises a Series for every row.
    cleaned_features = [
        clean(sentence, class_name)
        for sentence, class_name in zip(df['comment_sentence'], df['class'])
    ]

    target = np.array(df['labels'].tolist())
    return cleaned_features, target

# Accumulates one metrics record (a dict) per (language, category) pair.
results = []

# Hyperparameter grid handed to GridSearchCV for the random forest:
# 3 * 3 * 3 * 2 = 54 candidate combinations per language.
param_grid = {
    'n_estimators': [100, 200, 300],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Fixed seed so the forests built during the search -- and therefore the
# selected hyperparameters and the reported metrics -- are repeatable.
RANDOM_STATE = 42

# For each language: preprocess both splits, vectorize with TF-IDF, tune a
# random forest with a grid search, then score every category on the test set.
for lang in langs:
    print(f"Training for language: {lang}")

    train_features, train_labels = preprocess_data(ds[f'{lang}_train'])
    test_features, test_labels = preprocess_data(ds[f'{lang}_test'])

    # The metrics below pair labels[lang][i] with column i of the label
    # matrix; fail before the expensive grid search if they do not line up.
    if test_labels.ndim != 2 or test_labels.shape[1] != len(labels[lang]):
        raise ValueError(
            f"Label matrix for {lang} has shape {test_labels.shape}, "
            f"expected {len(labels[lang])} columns"
        )

    # The vocabulary is fitted on the training split only and reused
    # unchanged for the test split.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_features)
    X_test = vectorizer.transform(test_features)

    rf = RandomForestClassifier(random_state=RANDOM_STATE)

    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='accuracy')
    grid_search.fit(X_train, train_labels)
    print(f"Best hyperparameters for {lang}: {grid_search.best_params_}")

    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)

    # One binary precision/recall/F1 triple per category column.
    # zero_division=0 scores a metric as 0 when its denominator is zero.
    for i, label in enumerate(labels[lang]):
        precision = precision_score(test_labels[:, i], y_pred[:, i], zero_division=0)
        recall = recall_score(test_labels[:, i], y_pred[:, i], zero_division=0)
        f1 = f1_score(test_labels[:, i], y_pred[:, i], zero_division=0)
        results.append({
            'language': lang,
            'category': label,
            'precision': round(precision, 4),
            'recall': round(recall, 4),
            'f1': round(f1, 4)
        })

results_df = pd.DataFrame(results)

print("Performance by category and language:")
print(results_df)


Training for language: java
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best hyperparameters for java: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Training for language: python
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best hyperparameters for python: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Training for language: pharo
Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best hyperparameters for pharo: {'bootstrap': True, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Performance by category and language:
   language                 category  precision  recall      f1
0      java                  summary     0.8263  0.8105  0.8183
1      java                Ownership     0.9565  0.9778  0.9670
2      java                   Expand     0.8824  0.1471  0.2521
3      java                    usage     0.8734  0.7680  0.8173
4      ja