In [3]:
import pandas as pd
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import optuna

dataset_path = '../resources/dataset.csv'

df = pd.read_csv(dataset_path)

df.head()


Unnamed: 0,text,category
0,نظم عهد شرق لفن عرض فنا تحت عنو بقة الف وذل سع...,Culture
1,تقم فنن ليت كابيلو عرض طلع عام دبي يضم عرض لوح...,Culture
2,وصل يلة سير تحد تعة ءثر نفس يرق لقب شعر ملي نس...,Culture
3,عقد ظهر ءمس ءول قصر ثقف شرق جلس ءخر جلس لقى ءو...,Culture
4,خار صحف يمز جورج ءورويل يحل رتب قءم تضم ءعظم خ...,Culture


In [2]:
# Extract text and category columns
texts = df['text'].tolist()
categories = df['category'].tolist()

# Initialize Vectorizer and generate embeddings
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)

# Encode categories
y = pd.factorize(categories)[0]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


  y = pd.factorize(categories)[0]


In [None]:
# Define objective function for Optuna
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 5),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 0.2),
        'eta': trial.suggest_float('eta', 0.1, 0.3),
        'objective': 'binary:logistic',  # Define objective for binary classification
        'eval_metric': 'logloss'
    }

    # Split train and validation data
    X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Train the model
    model = xgb.XGBClassifier(**param, use_label_encoder=False)
    model.fit(X_train_sub, y_train_sub)

    # Make predictions
    y_pred = model.predict(X_val)
    f1 = f1_score(y_val, y_pred, average='weighted')

    return f1


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50) 

# Best parameters and model
best_params = study.best_params
best_model = xgb.XGBClassifier(**best_params)
best_model.fit(X_train, y_train)

print("Best Parameters: ", best_params)


[I 2024-11-06 15:00:09,515] A new study created in memory with name: no-name-ea09d345-1306-4562-a78c-a5b650c183fb
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-06 15:03:28,819] Trial 0 finished with value: 0.9675722680816211 and parameters: {'max_depth': 3, 'min_child_weight': 1, 'subsample': 0.919086438707354, 'colsample_bytree': 0.8881990996093398, 'gamma': 0.0009328590460168763, 'eta': 0.20904590361606862}. Best is trial 0 with value: 0.9675722680816211.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-06 15:06:28,501] Trial 1 finished with value: 0.9674384775056308 and parameters: {'max_depth': 3, 'min_child_weight': 2, 'subsample': 0.8265301454456171, 'colsample_bytree': 0.7508522017066668, 'gamma': 0.043615127544943594, 'eta': 0.2180734627599386}. Best is trial 0 with value: 0.9675722680816211.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-06 15:10:00,976] Trial 2 finished with value: 0.9704598587613389 and parameters: {'max_depth': 4,

In [None]:
# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display evaluation metrics
print("Model Evaluation Metrics:")
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)


In [None]:
# Define the directory path
model_dir = 'Backend/Models/XGBoost'

# Create directory if it doesn't exist
os.makedirs(model_dir, exist_ok=True)

# Save the best model and vectorizer
joblib.dump(best_model, os.path.join(model_dir, 'XGBoost_model.pkl'))
joblib.dump(vectorizer, os.path.join(model_dir, 'XGBoost_tfidf_vectorizer.pkl'))

print("Model and vectorizer saved successfully.")