In [4]:
import re

import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# Download the NLTK resources this script needs.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases (3.9 and later); 'punkt' is kept so older installations keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Stemmer and English stop-word set shared by clean_text.
stemmer = nltk.stem.PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Return *text* normalized for vectorization.

    Every character that is not an ASCII letter, digit or whitespace is
    removed, the result is lowercased and tokenized, tokens found in the
    module-level ``stop_words`` set are dropped, and the remaining tokens
    are stemmed with the module-level ``stemmer``.

    Args:
        text: The raw string to clean.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    # Strip unwanted characters first, then fold to lowercase.
    normalized = re.sub(r'[^a-zA-Z0-9\s]', '', text).lower()

    stems = []
    for token in word_tokenize(normalized):
        # Skip stop words; they are not stemmed or kept.
        if token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Load the dataset from the CSV file.
file_path = r'C:\Users\hp\Desktop\bbc-text.csv'  # adjust this path to your machine
data = pd.read_csv(file_path)

# Store the cleaned version of every text in a new column.
raw_articles = data['text']
data['final_result'] = raw_articles.apply(clean_text)

# Model inputs: cleaned text as the feature, the 'category' column as the label.
X, y = data['final_result'], data['category']

# Hold out 20% of the rows for testing; stratify=y keeps the class
# proportions the same in both splits, and the seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Two text representations, each limited to a 1000-term vocabulary.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
bow_vectorizer = CountVectorizer(max_features=1000)

# TF-IDF features: fitted on the training split only, then applied unchanged
# to the test split so no test information reaches the vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Bag-of-Words features, built the same way.
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Oversample the training data with SMOTE (the test split is left untouched).
# The sampler is seeded like the split and the random forest; without a seed
# the synthetic samples, and therefore the reported scores, vary per run.
smote = SMOTE(random_state=42)
X_train_resampled_tfidf, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)
X_train_resampled_bow, y_train_resampled_bow = smote.fit_resample(X_train_bow, y_train)

# Candidate classifiers. These instances are used as unfitted templates:
# every evaluation run trains its own copy (see evaluate_models).
models = {
    'SVM': SVC(),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
}


def evaluate_models(estimators, X_fit, y_fit, X_eval, y_eval, feature_label):
    """Train a fresh copy of each estimator and print its test metrics.

    Each estimator is cloned before fitting, so the objects in *estimators*
    stay unfitted and models trained on one feature representation are not
    overwritten when another representation is evaluated.

    Args:
        estimators: Mapping of display name to an unfitted estimator.
        X_fit: Training feature matrix.
        y_fit: Training labels matching ``X_fit``.
        X_eval: Test feature matrix.
        y_eval: True labels matching ``X_eval``.
        feature_label: Name of the feature representation, shown in the
            printed headings (e.g. ``'TF-IDF'``).

    Returns:
        A dict mapping each display name to its fitted estimator.
    """
    fitted = {}
    for model_name, template in estimators.items():
        classifier = clone(template)
        classifier.fit(X_fit, y_fit)

        predictions = classifier.predict(X_eval)
        accuracy = accuracy_score(y_eval, predictions)
        print(f"\n{model_name} ({feature_label}) Accuracy: {accuracy:.4f}")
        print(f"\n{model_name} ({feature_label}) Classification Report:\n", classification_report(y_eval, predictions))

        fitted[model_name] = classifier
    return fitted


# Train and evaluate every model on the TF-IDF features.
fitted_tfidf_models = evaluate_models(
    models,
    X_train_resampled_tfidf,
    y_train_resampled,
    X_test_tfidf,
    y_test,
    'TF-IDF',
)

# Train and evaluate every model on the Bag-of-Words features.
fitted_bow_models = evaluate_models(
    models,
    X_train_resampled_bow,
    y_train_resampled_bow,
    X_test_bow,
    y_test,
    'Bag of Words',
)

# Report how many samples ended up in the training and test splits.
train_size = len(X_train)
test_size = len(X_test)
print("حجم مجموعة التدريب:", train_size)
print("حجم مجموعة الاختبار:", test_size)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!



SVM (TF-IDF) Accuracy: 0.9798

SVM (TF-IDF) Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.97      0.98       102
entertainment       0.96      1.00      0.98        77
     politics       0.98      0.96      0.97        84
        sport       1.00      1.00      1.00       102
         tech       0.97      0.96      0.97        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445


Naive Bayes (TF-IDF) Accuracy: 0.9820

Naive Bayes (TF-IDF) Classification Report:
                precision    recall  f1-score   support

     business       0.99      0.97      0.98       102
entertainment       0.97      1.00      0.99        77
     politics       0.98      0.95      0.96        84
        sport       0.99      1.00      1.00       102
         tech       0.98      0.99      0.98        80

     accuracy   