In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, ElasticNet
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/kaggle/input/abcdefghi/Seed dataset.csv'
data = pd.read_csv(file_path)

# Splitting the input (features) and output (label)
X = data['Comments'] + ' ' + data['Surrounding Code Context']  # Combining both input columns
y = data['Class']

In [2]:


# Convert the categorical target labels (useful/not useful) to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Converts "Useful" to 1 and "Not Useful" to 0

# Convert text data into numerical feature vectors using TF-IDF with n-grams
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Using unigrams and bigrams
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance with SMOTE (oversampling the minority class)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vec, y_encoded)

# Split the dataset into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Function to calculate and print Precision, Recall, and F1-Score
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    print(f"\n{model_name} Model:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

In [3]:
# 1. CatBoost Classifier (Fast gradient boosting)
cat_model = cb.CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, verbose=0)
cat_model.fit(X_train, y_train)
evaluate_model(cat_model, X_test, y_test, "CatBoost")

# 2. ExtraTrees Classifier (Randomized trees, similar to RandomForest but with more variance reduction)
extra_trees = ExtraTreesClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)
extra_trees.fit(X_train, y_train)
evaluate_model(extra_trees, X_test, y_test, "ExtraTrees")

# 3. ElasticNet (Combination of Lasso and Ridge regularization)
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.7)
elastic_net.fit(X_train, y_train)
y_pred_elastic = elastic_net.predict(X_test)
# ElasticNet does not give classification outputs directly, so let's use a threshold to convert to classes
y_pred_elastic_class = [1 if pred > 0.5 else 0 for pred in y_pred_elastic]
precision = precision_score(y_test, y_pred_elastic_class, average='binary')
recall = recall_score(y_test, y_pred_elastic_class, average='binary')
f1 = f1_score(y_test, y_pred_elastic_class, average='binary')
print(f"\nElasticNet Model:\nPrecision: {precision:.4f}\nRecall: {recall:.4f}\nF1-Score: {f1:.4f}")

# 4. Stacking Classifier (Combining multiple models in a meta-learner)
estimators = [('rf', RandomForestClassifier(n_estimators=100)),
              ('gb', GradientBoostingClassifier(n_estimators=100))]
stacking = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stacking.fit(X_train, y_train)
evaluate_model(stacking, X_test, y_test, "Stacking Classifier")

# 5. Neural Network (Multi-Layer Perceptron - MLP)
mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, solver='adam', learning_rate='adaptive', random_state=42)
mlp.fit(X_train, y_train)
evaluate_model(mlp, X_test, y_test, "Neural Network (MLP)")



CatBoost Model:
Precision: 0.8310
Recall: 0.8082
F1-Score: 0.8194

ExtraTrees Model:
Precision: 0.8121
Recall: 0.6723
F1-Score: 0.7356

ElasticNet Model:
Precision: 0.7457
Recall: 0.5009
F1-Score: 0.5993

Stacking Classifier Model:
Precision: 0.8117
Recall: 0.8366
F1-Score: 0.8239

Neural Network (MLP) Model:
Precision: 0.8365
Recall: 0.7910
F1-Score: 0.8131
