Collecting 7,000 Guardian articles (1,000 per category) for the dataset

In [None]:
import os
import time

import pandas as pd
import requests

# Guardian API key, read from the environment so the secret never lives in the
# notebook. Set it before starting Jupyter, e.g.
#     export GUARDIAN_API_KEY="<your key>"
# Any key that was previously committed in this notebook should be treated as
# leaked and revoked.
API_KEY = os.environ.get('GUARDIAN_API_KEY', '').strip()
if not API_KEY:
    raise RuntimeError(
        "GUARDIAN_API_KEY is not set. Export your Guardian API key as an "
        "environment variable before running this notebook."
    )
BASE_URL = 'https://content.guardianapis.com/search'

# Target categories and how many articles to collect for each of them
target_categories = ["Life and style", "Law", "Film", "Books", "Art and design", "Games", "Travel"]
ARTICLES_PER_CATEGORY = 1000
# Number of articles requested per API page
BATCH_SIZE = 12

# Function to fetch articles from The Guardian API
def fetch_articles(api_key, query, page_size=12, page=1, timeout=30):
    """Fetch one page of search results from the Guardian content API.

    Parameters
    ----------
    api_key : str
        Guardian API key, sent as the ``api-key`` query parameter.
    query : str
        Text sent as the ``q`` query parameter.
        NOTE(review): ``q`` is presumably a free-text search rather than a
        section filter, so using a category name here may return articles
        from other sections — confirm against the Guardian API docs.
    page_size : int, optional
        Number of results requested per page (``page-size``).
    page : int, optional
        1-based page number to request.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200; ``None`` (after printing a
        message) on a network error, a non-200 status or an undecodable body.
    """
    params = {
        'q': query,
        'api-key': api_key,
        'show-fields': 'bodyText',  # Fetches article content
        'page-size': page_size,
        'page': page
    }

    try:
        response = requests.get(BASE_URL, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as bad
        # status codes so that callers only ever have to handle ``None``.
        print(f"Error fetching articles: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching articles: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        # requests raises a ValueError subclass when the body is not JSON.
        print("Error fetching articles: response body was not valid JSON")
        return None

# Data storage
all_articles = []

# Give up on a category after this many consecutive failed requests, so a
# persistent API error (bad key, rate limit, outage) cannot loop forever.
MAX_CONSECUTIVE_FAILURES = 5

# Collect up to ARTICLES_PER_CATEGORY articles for each specified category
for category in target_categories:
    print(f"Gathering articles for category: {category}")
    category_articles = []
    page = 1
    consecutive_failures = 0

    while len(category_articles) < ARTICLES_PER_CATEGORY:
        # Fetch a batch of articles; the category name is used as the search query
        data = fetch_articles(API_KEY, category, page_size=BATCH_SIZE, page=page)

        # Pause after every request, successful or not, to stay polite to the API
        time.sleep(1)

        if not data or 'response' not in data:
            consecutive_failures += 1
            if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
                print(f"Giving up on category '{category}' after "
                      f"{consecutive_failures} consecutive failed requests")
                break
            continue  # retry the same page instead of silently skipping it
        consecutive_failures = 0

        payload = data['response']
        results = payload.get('results', [])
        if not results:
            # An empty page means the search has no further articles to offer
            print(f"No more results for category '{category}' (page {page})")
            break

        for result in results:
            if len(category_articles) >= ARTICLES_PER_CATEGORY:
                break  # Stop once the quota for this category is reached

            # Store the article text with its category; tolerate results that
            # come back without a 'fields' entry
            category_articles.append({
                'category': category,
                'content': result.get('fields', {}).get('bodyText', '')
            })

        print(len(category_articles))

        # Stop at the last page when the API reports a page count
        total_pages = payload.get('pages')
        if total_pages is not None and page >= total_pages:
            break

        # Move to the next page for more results
        page += 1

    # Add articles from this category to the main list
    all_articles.extend(category_articles)
    print(f"Collected {len(category_articles)} articles for category: {category}")

# Convert list of articles to a DataFrame
df = pd.DataFrame(all_articles)

# Save DataFrame to CSV
df.to_csv('guardian_articles_7000.csv', index=False)

# Display first few rows of the DataFrame
df.head()


Counting the articles in each category

In [None]:
import pandas as pd

# Re-read the scraped articles from disk so this cell does not depend on the
# in-memory frame built by the download cell
df = pd.read_csv('guardian_articles_7000.csv')

# Number of rows per label in the 'category' column
category_counts = df.loc[:, 'category'].value_counts()

# Show the per-category totals
print(category_counts)


category
Life and style    1000
Law               1000
Film              1000
Books             1000
Art and design    1000
Games             1000
Travel            1000
Name: count, dtype: int64


Encode the article content

In [None]:
import pandas as pd
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Download the NLTK resources requested by this notebook.
# NOTE(review): no WordNet-based call is visible in this cell, so 'wordnet'
# may be unnecessary — confirm before removing.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Characters deleted before tokenisation: all ASCII punctuation plus the curly
# quotes and the en dash. (This is the same character set as before; the
# duplicated ASCII symbols and the invalid "\|" escape have been dropped.)
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation + "‘’“”–")

# NLTK helpers shared by every process_text call; filled in on first use so the
# stop-word list is not re-read for each article.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the (stop-word set, stemmer) pair, creating it on the first call."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = set(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def process_text(text):
    """Normalise one article body for vectorisation.

    Steps, in order: delete punctuation, tokenise with ``word_tokenize``,
    drop English stop words (compared case-insensitively) and stem each
    remaining token with the Porter stemmer.

    Parameters
    ----------
    text : object
        The raw article text. Anything that is not a ``str`` (for example the
        NaN that pandas uses for an empty cell) is treated as missing.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, or ``""`` for non-string
        input.

    NOTE(review): punctuation is removed before stop-word filtering, so
    contractions such as "you've" become "youve" and no longer match the
    stop-word list (the vectoriser output contains a "youv" feature) —
    confirm whether this is intended.
    """
    # Missing values arrive as floats (NaN); nothing to process
    if not isinstance(text, str):
        return ""

    stop_words, stemmer = _get_text_tools()

    # Remove punctuation, then split into word tokens
    tokens = word_tokenize(text.translate(_PUNCTUATION_TABLE))

    # Drop stop words and stem what is left
    stemmed_tokens = [
        stemmer.stem(word) for word in tokens if word.lower() not in stop_words
    ]

    return " ".join(stemmed_tokens)

# Load the raw articles, replace each body with its pre-processed form and
# write the result to a new CSV for the vectorisation steps
df = pd.read_csv('guardian_articles_7000.csv')
df = df.assign(content=df['content'].apply(process_text))
df.to_csv('guardian_articles_encoded.csv', index=False)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Apply count vectorization to the encoded file

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import gc  # For manual garbage collection
import time

# Read the pre-processed articles and discard rows whose 'content' is missing
# (articles that were empty after preprocessing are read back as NaN)
file_path = 'guardian_articles_encoded.csv'  # Replace with your actual file path
df = pd.read_csv(file_path).dropna(subset=['content'])

# Count vectorizer limited to 5000 terms; terms must occur in at least 10
# documents and in no more than 80% of them
count_vectorizer = CountVectorizer(max_features=5000, max_df=0.8, min_df=10)

# Learn the vocabulary once, on the whole 'content' column, so every batch is
# transformed into the same columns.
# NOTE(review): the vocabulary is fitted before the train/test split made
# later in the notebook — confirm this is acceptable for the evaluation.
count_vectorizer.fit(df['content'])

# Rows transformed per iteration; lower this if memory is tight
batch_size = 1000

# One joined DataFrame per batch is collected here
all_batches = []

# The vocabulary is fixed after fit(), so the prefixed column names can be
# built once instead of on every batch
count_columns = [f'count_{term}' for term in count_vectorizer.get_feature_names_out()]

for i in range(0, len(df), batch_size):
    print(f"Processing batch {i // batch_size + 1}")

    # Positional slice of at most batch_size rows
    batch_df = df.iloc[i:i + batch_size]

    # Sparse term counts for this batch, expanded to a dense frame
    count_matrix = count_vectorizer.transform(batch_df['content'])
    count_df = pd.DataFrame(count_matrix.toarray(), columns=count_columns)

    # Put the original columns and the count columns side by side; the index
    # is reset so both frames line up row by row
    all_batches.append(batch_df.reset_index(drop=True).join(count_df))

    # Release the per-batch intermediates before the next iteration
    del count_matrix, count_df
    gc.collect()

    # Short pause between batches (tune for your system)
    time.sleep(1)

# Stack all batches into a single frame
final_df = pd.concat(all_batches, ignore_index=True)

# Persist the articles together with their count features
output_file = 'guardian_articles_count.csv'
final_df.to_csv(output_file, index=False)

# Preview the result
final_df.head()


Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7


Unnamed: 0,category,content,count_00,count_10,count_100,count_1000,count_10000,count_100000,count_100m,count_10m,...,count_youngest,count_your,count_youth,count_youtub,count_youv,count_zealand,count_zelenskiy,count_zero,count_zone,count_zoom
0,Life and style,orchard offer bounti appl autumn last year wen...,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Life and style,born bologna either 1639 1643 sourc conflict a...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Life and style,victoria embank central london bench overlook ...,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,1
3,Life and style,letizia battaglia took pictur distanc punch ca...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Life and style,mexico love pork well document countri influen...,0,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Apply tf-idf vectorization to the encoded file

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import gc  # For manual garbage collection
import time

# Read the pre-processed articles and discard rows whose 'content' is missing
file_path = 'guardian_articles_encoded.csv'  # Replace with your actual file path
df = pd.read_csv(file_path).dropna(subset=['content'])

# TF-IDF vectorizer limited to 5000 terms; terms must occur in at least 10
# documents and in no more than 80% of them
tfidf_vectorizer = TfidfVectorizer(max_features=5000, max_df=0.8, min_df=10)

# Fit on the whole 'content' column so every batch is transformed with the
# same fitted vectorizer.
# NOTE(review): the vectorizer is fitted before the train/test split made
# later in the notebook — confirm this is acceptable for the evaluation.
tfidf_vectorizer.fit(df['content'])

# Rows transformed per iteration; lower this if memory is tight
batch_size = 1000

# One joined DataFrame per batch is collected here
all_batches = []

# Column names are fixed after fit(), so build the prefixed names once
tfidf_columns = [f'tfidf_{term}' for term in tfidf_vectorizer.get_feature_names_out()]

for i in range(0, len(df), batch_size):
    print(f"Processing batch {i // batch_size + 1}")

    # Positional slice of at most batch_size rows
    batch_df = df.iloc[i:i + batch_size]

    # Sparse TF-IDF weights for this batch, expanded to a dense frame
    tfidf_matrix = tfidf_vectorizer.transform(batch_df['content'])
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_columns)

    # Put the original columns and the TF-IDF columns side by side; the index
    # is reset so both frames line up row by row
    all_batches.append(batch_df.reset_index(drop=True).join(tfidf_df))

    # Release the per-batch intermediates before the next iteration
    del tfidf_matrix, tfidf_df
    gc.collect()

    # Short pause between batches (tune for your system)
    time.sleep(1)

# Stack all batches into a single frame
final_df = pd.concat(all_batches, ignore_index=True)

# Persist the articles together with their TF-IDF features
output_file = 'guardian_articles_tfidf.csv'
final_df.to_csv(output_file, index=False)

# Preview the result
final_df.head()


Processing batch 1
Processing batch 2
Processing batch 3
Processing batch 4
Processing batch 5
Processing batch 6
Processing batch 7


Unnamed: 0,category,content,tfidf_00,tfidf_10,tfidf_100,tfidf_1000,tfidf_10000,tfidf_100000,tfidf_100m,tfidf_10m,...,tfidf_youngest,tfidf_your,tfidf_youth,tfidf_youtub,tfidf_youv,tfidf_zealand,tfidf_zelenskiy,tfidf_zero,tfidf_zone,tfidf_zoom
0,Life and style,orchard offer bounti appl autumn last year wen...,0.0,0.029125,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Life and style,born bologna either 1639 1643 sourc conflict a...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Life and style,victoria embank central london bench overlook ...,0.0,0.0,0.0,0.0,0.0,0.033525,0.0,0.0,...,0.0,0.01807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.032068
3,Life and style,letizia battaglia took pictur distanc punch ca...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Life and style,mexico love pork well document countri influen...,0.0,0.055213,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finally, train and evaluate the classifiers on both the count-vectorized and the TF-IDF-vectorized datasets

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm

# File paths for Count Vectorized and TF-IDF Vectorized datasets
file_paths = {
    'Count Vectorized': 'guardian_articles_count.csv',
    'TF-IDF Vectorized': 'guardian_articles_tfidf.csv'
}

# Initialize the LabelEncoder
le = LabelEncoder()

# Step 1: Drop the 'content' column and label encode the 'category' column in
# each dataset file. The cleaned frame is written back over the input file, so
# the step must cope with being run a second time on its own output.
for vectorization_type, file_path in file_paths.items():
    print(f"Processing {vectorization_type} dataset for initial cleanup...")

    # Load the dataset
    data = pd.read_csv(file_path)

    # A file without a 'content' column has already been through this step;
    # its categories are already integers, so leave it untouched instead of
    # failing on the missing column. In that case `le` is not (re)fitted by
    # this iteration.
    if 'content' not in data.columns:
        print(f"{vectorization_type} dataset is already cleaned; leaving it unchanged.\n")
        continue

    # Drop rows with missing values
    data = data.dropna()

    # Drop the raw text; only the numeric features and the label are kept
    data = data.drop(columns=['content'])

    # Apply label encoding to the 'category' column
    data['category'] = le.fit_transform(data['category'])

    # Save the updated DataFrame back to the same file
    data.to_csv(file_path, index=False)

    # Report the category -> integer mapping for future reference
    label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    print(f"Initial processing completed for {vectorization_type} dataset.")
    print(f"Category Label Mapping for {vectorization_type}: {label_mapping}\n")

# Step 2: Define the models
# Seed every estimator that accepts one so repeated runs give the same
# metrics; 42 matches the seed used for the train/test split.
RANDOM_STATE = 42

models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Naive Bayes': MultinomialNB(),
    # probability=True enables predict_proba, which the ROC-AUC metric needs
    'SVM': SVC(probability=True, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'k-NN': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(max_iter=1000, random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'SGD Classifier': SGDClassifier(random_state=RANDOM_STATE),
    'AdaBoost': AdaBoostClassifier(random_state=RANDOM_STATE)
}

# Evaluation results for every model, keyed first by vectorization type
all_results = {}

# Step 3: Evaluate every model on each vectorized dataset in turn
for vectorization_type, file_path in file_paths.items():
    print(f"\nProcessing dataset: {vectorization_type}")

    # Cleaned dataset: encoded 'category' label plus the feature columns
    data = pd.read_csv(file_path)
    y = data['category']  # Encoded category labels
    X = data.drop(columns=['category'])  # Feature columns

    # Hold out 30% of the rows for testing (fixed seed for a repeatable split)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Results of each model for the current vectorization type
    evaluation_results = {}

    # Step 4: Train, predict, and evaluate each model
    for model_name, model in tqdm(models.items(), desc=f"Training models on {vectorization_type}"):
        print(f"Training and evaluating model: {model_name}")

        # Fit on the training rows, then predict the held-out rows
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Accuracy first, then the three class-weighted scores
        metrics = {'Accuracy': accuracy_score(y_test, y_pred)}
        for metric_name, scorer in (
            ('Precision', precision_score),
            ('Recall', recall_score),
            ('F1 Score', f1_score),
        ):
            metrics[metric_name] = scorer(y_test, y_pred, average='weighted')
        metrics['Confusion Matrix'] = confusion_matrix(y_test, y_pred)

        # One-vs-rest ROC-AUC needs class probabilities, which not every
        # model exposes
        if not hasattr(model, "predict_proba"):
            metrics['ROC-AUC'] = None
        else:
            y_proba = model.predict_proba(X_test)
            metrics['ROC-AUC'] = roc_auc_score(y_test, y_proba, multi_class='ovr')

        evaluation_results[model_name] = metrics

    # File the per-model results under this vectorization type
    all_results[vectorization_type] = evaluation_results

# Step 5: Report the stored metrics, one section per dataset and one block
# per model
for vectorization_type, results in all_results.items():
    print(f"\n\n=== Evaluation Results for {vectorization_type} Dataset ===\n")

    for model_name, metrics in results.items():
        print(f"\nModel: {model_name}")

        # Scalar scores, each printed with four decimals
        for score_name in ('Accuracy', 'Precision', 'Recall', 'F1 Score'):
            print(f"{score_name}: {metrics[score_name]:.4f}")

        # ROC-AUC is None for models without predict_proba
        roc_auc = metrics['ROC-AUC']
        if roc_auc is None:
            print("ROC-AUC: Not available for this model")
        else:
            print(f"ROC-AUC: {roc_auc:.4f}")

        print(f"Confusion Matrix:\n{metrics['Confusion Matrix']}\n")


Processing Count Vectorized dataset for initial cleanup...
Initial processing completed for Count Vectorized dataset.
Category Label Mapping for Count Vectorized: {'Art and design': 0, 'Books': 1, 'Film': 2, 'Games': 3, 'Law': 4, 'Life and style': 5, 'Travel': 6}

Processing TF-IDF Vectorized dataset for initial cleanup...
Initial processing completed for TF-IDF Vectorized dataset.
Category Label Mapping for TF-IDF Vectorized: {'Art and design': 0, 'Books': 1, 'Film': 2, 'Games': 3, 'Law': 4, 'Life and style': 5, 'Travel': 6}


Processing dataset: Count Vectorized


Training models on Count Vectorized:   0%|          | 0/10 [00:00<?, ?it/s]

Training and evaluating model: Logistic Regression


Training models on Count Vectorized:  10%|█         | 1/10 [00:47<07:05, 47.28s/it]

Training and evaluating model: Random Forest


Training models on Count Vectorized:  20%|██        | 2/10 [00:59<03:31, 26.50s/it]

Training and evaluating model: Naive Bayes


Training models on Count Vectorized:  30%|███       | 3/10 [00:59<01:42, 14.65s/it]

Training and evaluating model: SVM


Training models on Count Vectorized:  40%|████      | 4/10 [09:46<21:41, 216.86s/it]

Training and evaluating model: Gradient Boosting


Training models on Count Vectorized:  50%|█████     | 5/10 [14:42<20:27, 245.40s/it]

Training and evaluating model: k-NN


Training models on Count Vectorized:  60%|██████    | 6/10 [14:52<11:00, 165.20s/it]

Training and evaluating model: Neural Network


Training models on Count Vectorized:  70%|███████   | 7/10 [15:42<06:23, 127.70s/it]

Training and evaluating model: Decision Tree


Training models on Count Vectorized:  80%|████████  | 8/10 [15:48<02:57, 88.93s/it] 

Training and evaluating model: SGD Classifier




Training and evaluating model: AdaBoost


Training models on Count Vectorized: 100%|██████████| 10/10 [16:16<00:00, 97.63s/it]



Processing dataset: TF-IDF Vectorized


Training models on TF-IDF Vectorized:   0%|          | 0/10 [00:00<?, ?it/s]

Training and evaluating model: Logistic Regression


Training models on TF-IDF Vectorized:  10%|█         | 1/10 [00:14<02:14, 14.93s/it]

Training and evaluating model: Random Forest


Training models on TF-IDF Vectorized:  20%|██        | 2/10 [00:28<01:52, 14.10s/it]

Training and evaluating model: Naive Bayes


Training models on TF-IDF Vectorized:  30%|███       | 3/10 [00:28<00:54,  7.81s/it]

Training and evaluating model: SVM


Training models on TF-IDF Vectorized:  40%|████      | 4/10 [11:35<26:47, 267.97s/it]

Training and evaluating model: Gradient Boosting


Training models on TF-IDF Vectorized:  50%|█████     | 5/10 [22:03<33:08, 397.63s/it]

Training and evaluating model: k-NN


Training models on TF-IDF Vectorized:  60%|██████    | 6/10 [22:12<17:43, 265.76s/it]

Training and evaluating model: Neural Network


Training models on TF-IDF Vectorized:  70%|███████   | 7/10 [24:30<11:11, 223.90s/it]

Training and evaluating model: Decision Tree


Training models on TF-IDF Vectorized:  80%|████████  | 8/10 [24:39<05:10, 155.31s/it]

Training and evaluating model: SGD Classifier




Training and evaluating model: AdaBoost


Training models on TF-IDF Vectorized: 100%|██████████| 10/10 [25:11<00:00, 151.19s/it]



=== Evaluation Results for Count Vectorized Dataset ===


Model: Logistic Regression
Accuracy: 0.7281
Precision: 0.7280
Recall: 0.7281
F1 Score: 0.7280
ROC-AUC: 0.9262
Confusion Matrix:
[[184  14  11  11   9  45  11]
 [ 15 242   8  12   6  17  11]
 [ 13   6 231   6   3  22   8]
 [  8   1   5 235   8  20  17]
 [  9   6   4   7 263  18  12]
 [ 47  25  29  16  16 154  22]
 [  6  10   5  15   6  29 215]]


Model: Random Forest
Accuracy: 0.7430
Precision: 0.7359
Recall: 0.7430
F1 Score: 0.7328
ROC-AUC: 0.9351
Confusion Matrix:
[[183  10  15  21  19  30   7]
 [ 12 249   8  15  10  11   6]
 [  8   6 248   2   6  13   6]
 [  1   4   8 260   7   7   7]
 [  1   4   6   6 289   8   5]
 [ 36  28  39  31  26 121  28]
 [  5  13   6  22  25  10 205]]


Model: Naive Bayes
Accuracy: 0.6579
Precision: 0.6513
Recall: 0.6579
F1 Score: 0.6481
ROC-AUC: 0.8864
Confusion Matrix:
[[162  14  16  25  22  36  10]
 [ 10 221  15  18  23  14  10]
 [ 12   8 239   1  10  16   3]
 [  4   5  13 225  18  14  15]
 [  4 


