In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier, VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

# Load the dataset
file_path = '/kaggle/input/seed-generated/seedgenerated.csv'
data = pd.read_csv(file_path)

# Splitting the input (features) and output (label)
X = data['Comments'] + ' ' + data['Surrounding Code Context']  # Combining both input columns
y = data['Class']

# Convert the categorical target labels (useful/not useful) to numerical values
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # Converts "Useful" to 1 and "Not Useful" to 0

# Convert text data into numerical feature vectors using TF-IDF with n-grams
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))  # Using unigrams and bigrams
X_vec = vectorizer.fit_transform(X)

# Handle class imbalance with SMOTE (oversampling the minority class)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_vec, y_encoded)

# Split the dataset into training and testing sets (80:20)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

# Function to calculate and print Precision, Recall, and F1-Score
def evaluate_model(model, X_test, y_test, model_name):
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    
    print(f"\n{model_name} Model:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")

# 1. Decision Tree Classifier (Adding regularization and hyperparameter tuning)
dt = DecisionTreeClassifier(max_depth=10, min_samples_split=5, min_samples_leaf=2)
dt.fit(X_train, y_train)
evaluate_model(dt, X_test, y_test, "Decision Tree")

# 2. k-Nearest Neighbors (k-NN) (Adding hyperparameter tuning)
knn = KNeighborsClassifier(n_neighbors=7, weights='distance')  # Using distance-weighted neighbors
knn.fit(X_train, y_train)
evaluate_model(knn, X_test, y_test, "k-Nearest Neighbors")

# 3. Naive Bayes Classifier (Applying Laplace smoothing)
nb = MultinomialNB(alpha=0.1)  # Laplace smoothing with alpha
nb.fit(X_train, y_train)
evaluate_model(nb, X_test, y_test, "Naive Bayes")

# 4. Gradient Boosting Classifier (Adding more estimators and limiting depth)
gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=4)
gb.fit(X_train, y_train)
evaluate_model(gb, X_test, y_test, "Gradient Boosting")

# 5. XGBoost Classifier (Adding regularization, depth control, and boosting rounds)
xgb_model = xgb.XGBClassifier(max_depth=6, n_estimators=300, learning_rate=0.05, reg_lambda=1.0)
xgb_model.fit(X_train, y_train)
evaluate_model(xgb_model, X_test, y_test, "XGBoost")

# 6. Random Forest Classifier (Adding more trees and regularization)
rf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5)
rf.fit(X_train, y_train)
evaluate_model(rf, X_test, y_test, "Random Forest")

# 7. AdaBoost Classifier (Adding more estimators and adjusting learning rate)
ada = AdaBoostClassifier(n_estimators=200, learning_rate=0.05)
ada.fit(X_train, y_train)
evaluate_model(ada, X_test, y_test, "AdaBoost")

# 8. Support Vector Machine (SVM)
svm = SVC()
svm.fit(X_train, y_train)
evaluate_model(svm, X_test, y_test, "Support Vector Machine")

# # 8. Support Vector Machine (Adding regularization with grid search)
# param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
# svm = SVC()
# grid_search_svm = GridSearchCV(svm, param_grid, cv=5, scoring='f1')
# grid_search_svm.fit(X_train, y_train)
# best_svm = grid_search_svm.best_estimator_
# evaluate_model(best_svm, X_test, y_test, "Support Vector Machine")

# 9. Logistic Regression (Adding regularization with L2 penalty)
lr = LogisticRegression(C=1.0, penalty='l2', solver='lbfgs')
lr.fit(X_train, y_train)
evaluate_model(lr, X_test, y_test, "Logistic Regression")

# 10. LightGBM Classifier (Boosting with more trees and higher learning rate)
lgb_model = lgb.LGBMClassifier(n_estimators=300, learning_rate=0.05, max_depth=6)
lgb_model.fit(X_train, y_train)
evaluate_model(lgb_model, X_test, y_test, "LightGBM")

# 11. Voting Classifier (Combining multiple tuned models)
voting_clf = VotingClassifier(estimators=[('dt', dt), ('knn', knn), ('xgb', xgb_model)], voting='soft')  # Soft voting
voting_clf.fit(X_train, y_train)
evaluate_model(voting_clf, X_test, y_test, "Voting Classifier")



Decision Tree Model:
Precision: 0.7490
Recall: 0.7344
F1-Score: 0.7416

k-Nearest Neighbors Model:
Precision: 0.8385
Recall: 0.4011
F1-Score: 0.5426

Naive Bayes Model:
Precision: 0.7550
Recall: 0.8433
F1-Score: 0.7967

Gradient Boosting Model:
Precision: 0.7816
Recall: 0.7660
F1-Score: 0.7737

XGBoost Model:
Precision: 0.8033
Recall: 0.8026
F1-Score: 0.8030

Random Forest Model:
Precision: 0.8089
Recall: 0.6748
F1-Score: 0.7358

AdaBoost Model:
Precision: 0.7391
Recall: 0.6373
F1-Score: 0.6845

Support Vector Machine Model:
Precision: 0.8270
Recall: 0.8898
F1-Score: 0.8573

Logistic Regression Model:
Precision: 0.8167
Recall: 0.8211
F1-Score: 0.8189
[LightGBM] [Info] Number of positive: 5103, number of negative: 5140
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.230562 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 59420
[LightGBM]

In [None]:
!pip install xgboost