## 1.Loading Dataset And Data manipulation

In [40]:
import pandas as pd

In [41]:
csv_file_path='dataset.csv'

In [42]:
df=pd.read_csv(csv_file_path)

In [None]:
print('\n 10 rows of dataset')
print(df.head(5))#returns first five rows of datset


 10 rows of dataset
    id                                          title  \
0  370              Himalayan Brown Bear Conservation   
1  371    Clouded Leopard Research in Nepal's Forests   
2  372        Wild Water Buffalo Conservation Efforts   
3  373      Fishing Cat Conservation in Wetland Areas   
4  374  Gangetic Dolphin Conservation in Nepal Rivers   

                                             content               category  
0  Himalayan brown bear conservation in Nepal's h...  wildlife/nationalpark  
1  Clouded leopard research in Nepal's forest pro...  wildlife/nationalpark  
2  Wild water buffalo conservation in Nepal's Ter...  wildlife/nationalpark  
3  Fishing cat conservation in Nepal's wetland pr...  wildlife/nationalpark  
4  Gangetic dolphin conservation in Nepal's river...  wildlife/nationalpark  


In [None]:
df.info()#returns the info of dataset its total no index and columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5849 entries, 0 to 5848
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5849 non-null   object
 1   title     5735 non-null   object
 2   content   5636 non-null   object
 3   category  5849 non-null   object
dtypes: object(4)
memory usage: 182.9+ KB


In [45]:
print(df.columns.to_list())#name of columns

['id', 'title', 'content', 'category']


In [46]:
content_label='content'
category_label='category'

print(df[category_label].value_counts())

blog_contents = df[content_label].tolist()
blog_category = df[category_label].tolist()

print(f"\nLoaded {len(blog_contents)} blog contents and {len(blog_category)} labels.")



category
food/cuisine             1538
wildlife/nationalpark    1491
hike/trek                1419
culture                  1401
Name: count, dtype: int64

Loaded 5849 blog contents and 5849 labels.


In [47]:
missing_values_count = df.isnull().sum()#checks the empty column
print("\nNumber of missing values per column:")
print(missing_values_count)


Number of missing values per column:
id            0
title       114
content     213
category      0
dtype: int64


In [None]:
print('Shape of the dataframe before dropping missing values', df.shape)
#removes the missing values NaN or null
df.dropna(subset=['content'], inplace=True)
df.dropna(subset=['title'], inplace=True)
values_after = df.isnull().sum()
print(values_after)
print('Shape of the dataframe after dropping missing values', df.shape)

Shape of the dataframe before dropping missing values (5849, 4)
id          0
title       0
content     0
category    0
dtype: int64
Shape of the dataframe after dropping missing values (5522, 4)


In [49]:
print(df.info)

<bound method DataFrame.info of                 id                                              title  \
0              370                  Himalayan Brown Bear Conservation   
1              371        Clouded Leopard Research in Nepal's Forests   
2              372            Wild Water Buffalo Conservation Efforts   
3              373          Fishing Cat Conservation in Wetland Areas   
4              374      Gangetic Dolphin Conservation in Nepal Rivers   
...            ...                                                ...   
5844  02c0e88c-64d                                               Blog   
5845  8530b65d-761  Best Trekking Routes In Nepal | Adventure Nepa...   
5846  a051c0b9-a1e                   Popular trekking routes in Nepal   
5847  fb83f3cf-79d                                     No title found   
5848  acd01562-939                 6 Best Trekking Boots for EBC Trek   

                                                content               category  
0     Hima

In [50]:
print(df['content'].head(10))

0    Himalayan brown bear conservation in Nepal's h...
1    Clouded leopard research in Nepal's forest pro...
2    Wild water buffalo conservation in Nepal's Ter...
3    Fishing cat conservation in Nepal's wetland pr...
4    Gangetic dolphin conservation in Nepal's river...
5    The Everest Base Camp Trek is a dream for many...
6    The Annapurna Circuit is renowned for its dive...
7    The Langtang Valley Trek is perfect for those ...
8    The Manaslu Circuit Trek is a hidden gem for t...
9    The Ghorepani Poon Hill Trek is famous for its...
Name: content, dtype: object


### 2. Train_Test Split

In [65]:
random_state=42
test_size=0.2

df_shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

split_idx = int(len(df_shuffled) * (1 - test_size))

train_df = df_shuffled.iloc[:split_idx]
test_df = df_shuffled.iloc[split_idx:]

X_train_content = train_df['content'].tolist()
y_train_label = train_df['category'].tolist()
X_test_content = test_df['content'].tolist()
y_test_label = test_df['category'].tolist()

print(f"Training set size: {len(X_train_content)} documents")
print(f"Testing set size: {len(y_test_label)} documents")

Training set size: 4417 documents
Testing set size: 1105 documents


# 2.Preprocessing the text in datset

###### *lowercase() 
###### *remove punctuation
###### *remove extra whitespace
###### *remove digit
###### *tokenize into words
###### *remove stopword


In [None]:
import re
import string

# import nltk
from nltk.tokenize import word_tokenize
# nltk.download('punkt')

from nltk.corpus import stopwords
# nltk.download('stopwords')


def preprocess_text(text,use_ngrams=False,ngrams_range=(1,1)):

    if not isinstance(text,str):
       return "Given input is not a string"
    
    #convert text into lowercase
    text=text.lower()

    #remove punctuation :string module contains string.punctuation=!'#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)

    #remove digit from text
    text=re.sub(r"[0-9]+"," ",text)

    # remove multiple white spaces :strip()removes leading and trailing whitespace from text
    text = re.sub(r"\s+", " ", text).strip()

    #tokenize the text into words
    tokenize_word=word_tokenize(text)

    #removing stopwords 
    # 1st setting a stopwords in english
    stop_words=set(stopwords.words('english'))
    
    # word for word in tokenize_word:iterates each word in tokenize_word  1st condition:if not stopwords and 2nd:if len>1 i.e a x y
    filtered_tokens = [word for word in tokenize_word if word not in stop_words and len(word) >1] 

    # return filtered_tokens

    if use_ngrams:
        all_ngrams=[]
        for n in range(ngrams_range[0],ngrams_range[1]+1):
            if n==1:
                all_ngrams.extend(filtered_tokens)
            elif n>1:
                for i in range(len(filtered_tokens) -n+1):#kati choti loop chalaune for n>1
                    all_ngrams.append("_".join(filtered_tokens[i:i+n]))
        return all_ngrams
    return filtered_tokens

preprocessed_train_tokens = [preprocess_text(text, use_ngrams=True, ngrams_range=(1,2)) for text in X_train_content]
print(preprocessed_train_tokens[0])

   




['summer', 'trek', 'ama', 'yangri', 'nakote', 'via', 'ghyangul', 'green', 'vibrant', 'lively', 'village', 'life', 'summer_trek', 'trek_ama', 'ama_yangri', 'yangri_nakote', 'nakote_via', 'via_ghyangul', 'ghyangul_green', 'green_vibrant', 'vibrant_lively', 'lively_village', 'village_life']


## 3.Vocabulary Building

In [None]:

def build_vocabulary(all_processed_docs_tokens, max_features=7000, min_doc_freq=2, max_doc_freq_ratio=0.85):
    word_counts = {}#term frequency yetai vairacha 
    doc_word_presence = {}
    for i, doc_tokens in enumerate(all_processed_docs_tokens):
        for word in doc_tokens:
            word_counts[word] = word_counts.get(word, 0) + 1

        for word in set(doc_tokens):#set removes duplicate words
            doc_word_presence[word] = doc_word_presence.get(word, 0) + 1

    if (i + 1) % 1000 == 0:
            print(f"Processed {i + 1} documents for vocabulary building...")

    # Sort words by their frequency in descending order, then alphabetically for ties
    # (item[1] is the count, item[0] is the word itself)
    sorted_words = sorted(word_counts.items(), key=lambda item: (-item[1], item[0]))
    
    total_docs = len(all_processed_docs_tokens)
    print('The total docs is',total_docs)

    # Filter and build the final vocabulary
    filtered_vocab = {} 
    
    print("Filtering vocabulary based on frequency and document presence...")
    for word, count in sorted_words:
        # Check if the word meets the minimum document frequency requirement
        # and doesn't appear in too many documents (too common)
        word_df = doc_word_presence.get(word, 0) 
        
        if word_df >= min_doc_freq and \
           word_df <= total_docs * max_doc_freq_ratio:
            
            # Assign an index to the word (starting from 0)
            filtered_vocab[word] = len(filtered_vocab)
            
            # Stop if the maximum desired features (vocabulary size) is reached
            if len(filtered_vocab) >= max_features:
                break
    
    print(f"Final vocabulary size: {len(filtered_vocab)}")
    return filtered_vocab

vocabulary=build_vocabulary(preprocessed_train_tokens)
print(vocabulary)
#vocab ma word ra index save huncha

            


The total docs is 4417
Filtering vocabulary based on frequency and document presence...
Final vocabulary size: 7000
{'nepal': 0, 'trek': 1, 'trekking': 2, 'cultural': 3, 'nepali': 4, 'food': 5, 'also': 6, 'people': 7, 'culture': 8, 'one': 9, 'national': 10, 'wildlife': 11, 'country': 12, 'kathmandu': 13, 'best': 14, 'nepalese': 15, 'like': 16, 'park': 17, 'world': 18, 'experience': 19, 'rich': 20, 'unique': 21, 'traditional': 22, 'everest': 23, 'local': 24, 'heritage': 25, 'time': 26, 'many': 27, 'region': 28, 'traditions': 29, 'day': 30, 'festivals': 31, 'national_park': 32, 'adventure': 33, 'himalayas': 34, 'base': 35, 'cuisine': 36, 'known': 37, 'diverse': 38, 'camp': 39, 'himalayan': 40, 'journey': 41, 'home': 42, 'species': 43, 'base_camp': 44, 'different': 45, 'explore': 46, 'popular': 47, 'conservation': 48, 'festival': 49, 'life': 50, 'annapurna': 51, 'chitwan': 52, 'travel': 53, 'every': 54, 'nature': 55, 'make': 56, 'offers': 57, 'rice': 58, 'days': 59, 'mountain': 60, 'valle

## 4.IDF Calculation

In [None]:
import math # imported for math.log10


def calculate_idf(all_processed_docs_tokens, vocabulary):
   
    df = {} 
    total_docs = len(all_processed_docs_tokens)

    for doc_tokens in all_processed_docs_tokens:
        for word in set(doc_tokens):
           
            if word in vocabulary:
                df[word] = df.get(word, 0) + 1

    idf = {}
    for word in vocabulary.keys():
        word_df = df.get(word, 0)
        
        idf[word] = math.log10(total_docs / (word_df + 1)) + 1
        
    return idf

idf_score=calculate_idf(preprocessed_train_tokens,vocabulary)
print(idf_score)

{'nepal': 1.2080113062103126, 'trek': 1.6194120153570504, 'trekking': 1.737178877646119, 'cultural': 1.530516415026218, 'nepali': 1.639089444261074, 'food': 1.704610914325824, 'also': 1.4337398463215323, 'people': 1.527524707568307, 'culture': 1.5170513865896758, 'one': 1.4696155858949436, 'national': 1.6771116852647494, 'wildlife': 1.775895679527415, 'country': 1.5648617719185465, 'kathmandu': 1.6412362330214805, 'best': 1.5321874231743111, 'nepalese': 1.7016328833522887, 'like': 1.5496093569352403, 'park': 1.8772297832403004, 'world': 1.5916647943329358, 'experience': 1.5951345423382484, 'rich': 1.594361088025349, 'unique': 1.5867013748013856, 'traditional': 1.688958968783028, 'everest': 1.80127197663523, 'local': 1.6235246832301489, 'heritage': 1.6976936773713405, 'time': 1.650370454670763, 'many': 1.633556955661113, 'region': 1.7071083017821809, 'traditions': 1.737178877646119, 'day': 1.798790287128586, 'festivals': 1.8764892980107768, 'national_park': 1.8992722040846624, 'adventur

### 4.Naive Bayes Algorithm :Training Model


In [None]:

def train_naive_bayes_model(X_train_tokens, y_train_labels, vocabulary, smoothing_alpha=1.0):
   
   
    all_classes = sorted(list(set(y_train_labels))) 
    total_documents = len(y_train_labels)            

    # 2. Initialize data structures to store counts and priors using standard dictionaries
    class_priors = {c: 0 for c in all_classes}         # Stores raw document counts for each class
    # Stores word counts within each class {class_label: {word: count}}
    class_word_counts = {c: {} for c in all_classes} 
    # Stores total words in each class (will be populated later with smoothing)
    total_words_in_class = {c: 0 for c in all_classes} 

    print("Step 1: Calculating raw class priors and word counts per class...")
    # 3. Calculate raw class priors (document counts per class) and word counts per class
    for i, doc_tokens in enumerate(X_train_tokens):
        label = y_train_labels[i] # Get the class label for the current document
        class_priors[label] += 1  # Increment the document count for this class
        
        # Manually update word counts for this class
        for word in doc_tokens:
            # For the current 'label', increment the count of 'word' in its respective dictionary
            class_word_counts[label][word] = class_word_counts[label].get(word, 0) + 1
        
        if (i + 1) % 1000 == 0:
            print(f"  Processed {i + 1}/{total_documents} documents for class counts.")

    print("Step 2: Converting class priors to probabilities...")
    # 4. Convert raw class priors (document counts) to probabilities P(Class)
    for c in all_classes:
        # P(Class) = (Number of documents in Class) / (Total number of documents)
        class_priors[c] = class_priors[c] / total_documents
    print(f"  Class Priors: {class_priors}")

    # 5. Prepare for P(Word | Class) calculation with Laplace smoothing
    vocab_size = len(vocabulary) # Get the size of the global vocabulary
    word_probabilities = {c: {} for c in all_classes} 

    print("Step 3: Calculating word probabilities P(Word | Class) with Laplace smoothing...")
    # 6. Calculate P(word | class) for each word in the vocabulary for each class
    for c in all_classes:
        # Calculate the total number of words in this class for the denominator of P(Word | Class)
        # This includes smoothing: sum of all actual word counts in the class + (smoothing_alpha * vocabulary size)
        current_class_total_words = 0
        for word_count in class_word_counts[c].values():
            current_class_total_words += word_count
        
        total_words_in_class[c] = current_class_total_words + vocab_size * smoothing_alpha
        
        # Ensure total_words_in_class is not zero, which can happen if a class has no words AND vocab_size is 0, though unlikely.
        if total_words_in_class[c] == 0:
            total_words_in_class[c] = 1 # Prevent division by zero, treat as 1 if no words (very rare edge case)

        # Iterate over the full global vocabulary to calculate probabilities for all potential words
        for word_token in vocabulary: 
            # Get the count of 'word_token' in the current class 'c'. Default to 0 if not seen.
            count = class_word_counts[c].get(word_token, 0)
            
            # Apply Laplace Smoothing for P(word | class)
            # P(word | class) = (Count(word in class) + smoothing_alpha) / (Total words in class + smoothing_alpha * Vocab Size)
            # The denominator 'total_words_in_class[c]' already includes 'smoothing_alpha * Vocab Size'
            word_probabilities[c][word_token] = (count + smoothing_alpha) / total_words_in_class[c]
    
    print("Naive Bayes model training complete.")
    return class_priors, word_probabilities, total_words_in_class

class_priors, word_probabilities, total_words_in_class = train_naive_bayes_model(
        preprocessed_train_tokens, y_train_label, vocabulary)


Step 1: Calculating raw class priors and word counts per class...
  Processed 1000/4417 documents for class counts.
  Processed 2000/4417 documents for class counts.
  Processed 3000/4417 documents for class counts.
  Processed 4000/4417 documents for class counts.
Step 2: Converting class priors to probabilities...
  Class Priors: {'culture': 0.2322843558976681, 'food/cuisine': 0.2490378084672855, 'hike/trek': 0.25922571881367446, 'wildlife/nationalpark': 0.259452116821372}
Step 3: Calculating word probabilities P(Word | Class) with Laplace smoothing...
Naive Bayes model training complete.


##### Initialization:
###### all_classes = ['News', 'Sports'] (after sorting set(['Sports', 'News']))

###### total_documents = 4

###### class_priors = {'News': 0, 'Sports': 0}

###### class_word_counts = {'News': {}, 'Sports': {}}

###### total_words_in_class = {'News': 0, 'Sports': 0}

###### vocab_size = 7 (from len(vocabulary))

###### 2. Calculate Raw Class Priors and Word Counts Per Class:
###### We iterate through X_train_tokens and y_train_labels:

###### Doc 1 (['ball', 'game', 'team'], label 'Sports'):

###### class_priors['Sports'] becomes 1

###### class_word_counts['Sports'] becomes {'ball': 1, 'game': 1, 'team': 1}

###### Doc 2 (['election', 'vote'], label 'News'):

###### class_priors['News'] becomes 1

###### class_word_counts['News'] becomes {'election': 1, 'vote': 1}

###### Doc 3 (['player', 'ball'], label 'Sports'):

###### class_priors['Sports'] becomes 2

###### class_word_counts['Sports'] becomes {'ball': 2, 'game': 1, 'team': 1, 'player': 1}

###### Doc 4 (['president', 'election'], label 'News'):

###### class_priors['News'] becomes 2

###### class_word_counts['News'] becomes {'election': 2, 'vote': 1, 'president': 1}

###### After this step, we have:

###### class_priors = {'News': 2, 'Sports': 2}

###### class_word_counts = { 'News': {'election': 2, 'vote': 1, 'president': 1}, 'Sports': {'ball': 2, 'game': 1, ###### 'team': 1, 'player': 1} }

###### 3. Convert Class Priors to Probabilities P(Class):
###### For 'News': class_priors['News'] = 2 / 4 = 0.5

###### For 'Sports': class_priors['Sports'] = 2 / 4 = 0.5

###### Resulting class_priors:
###### {'News': 0.5, 'Sports': 0.5}

###### 4. Calculate P(Word | Class) with Laplace Smoothing:
###### Now we calculate word_probabilities using smoothing_alpha = 1.0 and vocab_size = 7.

###### For Class 'News':

###### Sum of raw words in 'News' class: 2 ('election') + 1 ('vote') + 1 ('president') = 4

###### Smoothed Denominator for 'News': total_words_in_class['News'] = 4 + (7 * 1.0) = 11

###### Calculating P(Word | News) for each word in vocabulary:

###### P('ball' | News): (count('ball' in News) + 1) / 11 = (0 + 1) / 11 = 0.0909 (approx)

###### P('game' | News): (0 + 1) / 11 = 0.0909

###### P('team' | News): (0 + 1) / 11 = 0.0909

###### P('election' | News): (2 + 1) / 11 = 3 / 11 = 0.2727 (approx)

###### P('vote' | News): (1 + 1) / 11 = 2 / 11 = 0.1818 (approx)

###### P('player' | News): (0 + 1) / 11 = 0.0909

###### P('president' | News): (1 + 1) / 11 = 2 / 11 = 0.1818 (approx)

###### For Class 'Sports':

###### Sum of raw words in 'Sports' class: 2 ('ball') + 1 ('game') + 1 ('team') + 1 ('player') = 5

###### Smoothed Denominator for 'Sports': total_words_in_class['Sports'] = 5 + (7 * 1.0) = 12

###### Calculating P(Word | Sports) for each word in vocabulary:

###### P('ball' | Sports): (2 + 1) / 12 = 3 / 12 = 0.25

###### P('game' | Sports): (1 + 1) / 12 = 2 / 12 = 0.1667 (approx)

###### P('team' | Sports): (1 + 1) / 12 = 2 / 12 = 0.1667 (approx)

###### P('election' | Sports): (0 + 1) / 12 = 1 / 12 = 0.0833 (approx)

###### P('vote' | Sports): (0 + 1) / 12 = 0.0833

###### P('player' | Sports): (1 + 1) / 12 = 2 / 12 = 0.1667 (approx)

###### P('president' | Sports): (0 + 1) / 12 = 0.0833

## 5.Testing on test data

#### Preprocess testing data

In [67]:
processed_test_tokens = [preprocess_text(text, use_ngrams=True, ngrams_range=(1,2)) for text in X_test_content]
print(processed_test_tokens)



#### Predicting 


In [68]:
def predict_single_blog(doc_tokens, vocabulary, class_priors, word_probabilities, total_words_in_class, smoothing_alpha=1.0):
    """
    Predicts the class label for a single document.
    doc_tokens: list of preprocessed tokens for the document.
    vocabulary: The global vocabulary (OrderedDict).
    class_priors: Dictionary of class prior probabilities.
    word_probabilities: Dictionary of word probabilities P(word | class).
    total_words_in_class: Dictionary of total word counts per class (for smoothing).
    smoothing_alpha: Laplace smoothing parameter.
    Returns: The predicted class label.
    """
    best_class = None
    max_log_posterior = -float('inf')

    for c, prior_prob in class_priors.items():
        # Use log probabilities to avoid underflow
        log_posterior = math.log(prior_prob)

        for word_token in doc_tokens:
            # Only consider words present in the training vocabulary
            if word_token in vocabulary:
                # Get P(word | class) for this word in this class
                word_prob = word_probabilities[c].get(word_token, smoothing_alpha / total_words_in_class[c])
                log_posterior += math.log(word_prob)

        if log_posterior > max_log_posterior:
            max_log_posterior = log_posterior
            best_class = c
    return best_class

predictions = []
for doc_tokens in processed_test_tokens:
    predicted_label = predict_single_blog(doc_tokens, vocabulary, class_priors, word_probabilities, total_words_in_class)
    predictions.append(predicted_label)
print("Predictions made.")

Predictions made.


## 6.Evaluation 


In [72]:
from collections import Counter

def calculate_accuracy(y_true, y_pred):
    """Calculates accuracy score."""
    correct_predictions = sum(1 for true, pred in zip(y_true, y_pred) if true == pred)
    return correct_predictions / len(y_true) if len(y_true) > 0 else 0.0

def calculate_precision_recall_f1(y_true, y_pred, labels):
    """Calculates precision, recall, and F1-score for each label and macro average."""
    metrics = {}
    true_positives = Counter()
    false_positives = Counter()
    false_negatives = Counter()

    for true_label, pred_label in zip(y_true, y_pred):
        if true_label == pred_label:
            true_positives[true_label] += 1
        else:
            false_positives[pred_label] += 1
            false_negatives[true_label] += 1

    for label in labels:
        tp = true_positives[label]
        fp = false_positives[label]
        fn = false_negatives[label]

        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        
        metrics[label] = {
            'precision': precision,
            'recall': recall,
            'f1-score': f1,
            'support': y_true.count(label) # Number of occurrences of the label in y_true
        }

    # Macro average (simple average of per-class metrics)
    total_precision = sum(metrics[label]['precision'] for label in labels)
    total_recall = sum(metrics[label]['recall'] for label in labels)
    total_f1 = sum(metrics[label]['f1-score'] for label in labels)
    
    num_labels = len(labels)
    if num_labels > 0:
        metrics['macro avg'] = {
            'precision': total_precision / num_labels,
            'recall': total_recall / num_labels,
            'f1-score': total_f1 / num_labels,
            'support': len(y_true) # Total samples in the dataset for macro avg support
        }
    else:
        metrics['macro avg'] = {'precision': 0, 'recall': 0, 'f1-score': 0, 'support': 0}

    return metrics

accuracy = calculate_accuracy(y_test_label, predictions)
print(f"\nOverall Accuracy: {accuracy:.4f}")

unique_labels = sorted(list(set(y_test_label + predictions)))
metrics = calculate_precision_recall_f1(y_test_label, predictions, unique_labels)
print("\nClassification Report:")
print(f"{'Category':<25}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
print("-" * 65)
    
for label in unique_labels:
    m = metrics.get(label, {'precision':0, 'recall':0, 'f1-score':0, 'support':0})
    print(f"{label:<25}{m['precision']:>10.2f}{m['recall']:>10.2f}{m['f1-score']:>10.2f}{m['support']:>10}")
    
if 'macro avg' in metrics:
    macro_m = metrics['macro avg']
    print("-" * 65)
    print(f"{'Macro Average':<25}{macro_m['precision']:>10.2f}{macro_m['recall']:>10.2f}{macro_m['f1-score']:>10.2f}{macro_m['support']:>10}")


Overall Accuracy: 0.8118

Classification Report:
Category                  Precision    Recall  F1-Score   Support
-----------------------------------------------------------------
culture                        0.78      0.76      0.77       266
food/cuisine                   0.85      0.70      0.77       300
hike/trek                      0.79      0.96      0.86       257
wildlife/nationalpark          0.83      0.85      0.84       282
-----------------------------------------------------------------
Macro Average                  0.81      0.82      0.81      1105


## 7. Saving a trained model


In [74]:
import pickle
def save_model_params(vocabulary, idf_scores, class_priors, word_probabilities, total_words_in_class, filename="blog_categorization.pkl"):
    """Save the trained model parameters to a file."""
    with open(filename, "wb") as f:
        pickle.dump({
            "vocabulary": vocabulary,
            "idf_scores": idf_scores,
            "class_priors": class_priors,
            "word_probabilities": word_probabilities,
            "total_words_in_class": total_words_in_class
        }, f)
    print(f"Model parameters saved to {filename}")

save_model_params(vocabulary, idf_score, class_priors, word_probabilities, total_words_in_class)

Model parameters saved to blog_categorization.pkl
