In [2]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report



In [3]:
# Step 1: Loading the news article dataset
file_path = 'news-article-categories clean.csv'
data = pd.read_csv(file_path)

# Preview the dataset
print("Dataset Preview:")
print(data.head())

# Combine 'title' and 'body' columns into a single 'text' column
data['text'] = data['title'] + " " + data['body']
data = data.dropna(subset=['text', 'category'])  # Drop rows with missing values

Dataset Preview:
         category                                              title  \
0  ARTS & CULTURE  Modeling Agencies Enabled Sexual Predators For...   
1  ARTS & CULTURE  Actor Jeff Hiller Talks “Bright Colors And Bol...   
2  ARTS & CULTURE  New Yorker Cover Puts Trump 'In The Hole' Afte...   
3  ARTS & CULTURE  Man Surprises Girlfriend By Drawing Them In Di...   
4  ARTS & CULTURE  This Artist Gives Renaissance-Style Sculptures...   

                                                body Unnamed: 3 Unnamed: 4  \
0  In October 2017, Carolyn Kramer received a dis...        NaN        NaN   
1  This week I talked with actor Jeff Hiller abou...        NaN        NaN   
2  The New Yorker is taking on President Donald T...        NaN        NaN   
3  Kellen Hickey, a 26-year-old who lives in Huds...        NaN        NaN   
4  There’s something about combining the traditio...        NaN        NaN   

  Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9  ... Unnamed: 315  \
0  

  data = pd.read_csv(file_path)


In [4]:
# Step 2: Divide the data into training and testing sets
X = data['text']  
y= data['category']  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Convert the text data using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Step 3: making the first SVM model with default settings
initial_model = SVC(kernel='linear', random_state=42)  # Default parameters

# Perform 5-fold cross-validation
cv_scores_initial = cross_val_score(initial_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')

# Print cross-validation results for the initial model
print(f"Cross-Validation Accuracy Scores (Initial Model): {cv_scores_initial}")
print(f"Mean Accuracy (Initial Model): {cv_scores_initial.mean():.4f}")
print(f"Standard Deviation (Initial Model): {cv_scores_initial.std():.4f}")

# Train the initial model on the full training data
initial_model.fit(X_train_tfidf, y_train)

# Test the initial model using the test data
y_pred_initial = initial_model.predict(X_test_tfidf)

# Calculate the performance measures for the initial model
accuracy_initial = accuracy_score(y_test, y_pred_initial)
precision_initial = precision_score(y_test, y_pred_initial, average='weighted')
recall_initial = recall_score(y_test, y_pred_initial, average='weighted')
f1_initial = f1_score(y_test, y_pred_initial, average='weighted')

# Display the performance measures for the initial model
print("\nInitial Model Evaluation Metrics:")
print(f"Accuracy: {accuracy_initial * 100:.2f}%")
print(f"Precision: {precision_initial * 100:.2f}%")
print(f"Recall: {recall_initial * 100:.2f}%")
print(f"F1 Score: {f1_initial * 100:.2f}%")

# Print classification report for the initial model
print("\nClassification Report (Initial Model):\n", classification_report(y_test, y_pred_initial))


Cross-Validation Accuracy Scores (Initial Model): [0.80454545 0.80363636 0.77707006 0.78798908 0.80345769]
Mean Accuracy (Initial Model): 0.7953
Standard Deviation (Initial Model): 0.0110

Initial Model Evaluation Metrics:
Accuracy: 81.31%
Precision: 81.34%
Recall: 81.31%
F1 Score: 81.21%

Classification Report (Initial Model):
                 precision    recall  f1-score   support

ARTS & CULTURE       0.84      0.89      0.87       205
      BUSINESS       0.74      0.76      0.75       114
        COMEDY       0.75      0.88      0.81        74
         CRIME       0.84      0.81      0.82        57
     EDUCATION       0.86      0.88      0.87       108
 ENTERTAINMENT       0.82      0.75      0.78       100
   ENVIRONMENT       0.85      0.85      0.85        97
         MEDIA       0.84      0.72      0.77        67
      POLITICS       0.75      0.77      0.76       103
      RELIGION       0.86      0.83      0.84       101
       SCIENCE       0.77      0.81      0.79       

In [7]:

# Step 4: applying Hyperparameter Tuning with GridSearchCV
# Definining the parameter grid for SVM
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']}

# Perform GridSearch for best parameters
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_train_tfidf, y_train)

# Display the best parameters
print("\nBest Parameters from GridSearch:", grid_search.best_params_)

# Save the best model from GridSearchCV 
best_model= grid_search.best_estimator_

# Perform cross-validation with the model after the hyperparameter tuning(optimised model)
cv_scores_optimized = cross_val_score(best_model, X_train_tfidf, y_train, cv=5, scoring='accuracy')

# Print cross-validation results for the optimized model
print(f"\nCross-Validation Accuracy Scores (Optimized Model): {cv_scores_optimized}")
print(f"Mean Accuracy (Optimized Model): {cv_scores_optimized.mean():.4f}")
print(f"Standard Deviation (Optimized Model): {cv_scores_optimized.std():.4f}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits

Best Parameters from GridSearch: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

Cross-Validation Accuracy Scores (Optimized Model): [0.80545455 0.80272727 0.78434941 0.78252957 0.80800728]
Mean Accuracy (Optimized Model): 0.7966
Standard Deviation (Optimized Model): 0.0109


In [8]:

# Step 5: Evaluate the  SVM Model after the hyperparameter tuning

# Evaluate the  model on the test set
y_pred_optimized = best_model.predict(X_test_tfidf)

# Calculate the performance measures for the optimized model
optimized_accuracy = accuracy_score(y_test, y_pred_optimized)
optimized_precision = precision_score(y_test, y_pred_optimized, average='weighted')
optimized_recall = recall_score(y_test, y_pred_optimized, average='weighted')
optimized_f1 = f1_score(y_test, y_pred_optimized, average='weighted')

# Display the performance for the optimized model
print("\nOptimized Model Evaluation Metrics:")
print(f"Accuracy: {optimized_accuracy * 100:.2f}%")
print(f"Precision:{optimized_precision * 100:.2f}%")
print(f"Recall: {optimized_recall * 100:.2f}%")
print(f"F1 Score: {optimized_f1 * 100:.2f}%")

# Print the classification report for the optimized model
print("\nClassification Report (Optimized Model):\n", classification_report(y_test, y_pred_optimized))




Optimized Model Evaluation Metrics:
Accuracy: 81.82%
Precision: 81.94%
Recall: 81.82%
F1 Score: 81.75%

Classification Report (Optimized Model):
                 precision    recall  f1-score   support

ARTS & CULTURE       0.83      0.90      0.86       205
      BUSINESS       0.80      0.73      0.76       114
        COMEDY       0.75      0.88      0.81        74
         CRIME       0.80      0.82      0.81        57
     EDUCATION       0.87      0.90      0.88       108
 ENTERTAINMENT       0.81      0.78      0.80       100
   ENVIRONMENT       0.85      0.82      0.84        97
         MEDIA       0.84      0.72      0.77        67
      POLITICS       0.73      0.80      0.76       103
      RELIGION       0.88      0.81      0.85       101
       SCIENCE       0.75      0.78      0.76        54
        SPORTS       0.92      0.92      0.92       101
          TECH       0.79      0.75      0.77        85
         WOMEN       0.79      0.72      0.76       109

      accur

In [10]:

# Step 6: Saving the results to a CSV file
output_file = r"C:\Users\mahamat ali\AppData\Roaming\nltk_data\corpora\names\svm_gridsearch_results.csv"

# Create a DataFrame for results
results = pd.DataFrame({
    'Actual': y_test,
    'Initial_Predicted': y_pred_initial,
    'Optimized_Predicted': y_pred_optimized,
    'Initial_CV_Mean_Accuracy':[cv_scores_initial.mean()] * len(y_test),
'Optimized_CV_Mean_Accuracy': [cv_scores_optimized.mean()] * len(y_test)

})

# Save the results to a CSV file
results.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")




Results saved to C:\Users\mahamat ali\AppData\Roaming\nltk_data\corpora\names\svm_gridsearch_results.csv
