In [15]:
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import defaultdict, Counter

In [2]:
data = {
    "Sentence": [
        "Check out https://example.com for more info!",
        "Order 3 items, get 1 free! Limited offer!!!",
        "Your package #12345 will arrive tomorrow.",
        "Win $1000 now, visit http://winbig.com!!!",
        "Meeting at 3pm, don't forget to bring the files.",
        "Exclusive deal for you: buy 2, get 1 free!!!",
        "Download the report from https://reports.com.",
        "The meeting is starting in 10 minutes.",
        "Reminder: submit your timesheet by 5pm today."
    ],
    "Label": [
        "Inform", "Promo", "Inform", "Promo",
        "Reminder", "Promo", "Inform", "Reminder", "Reminder"
    ]
}

df = pd.DataFrame(data)

In [3]:
def preprocess(text):
    text = text.lower()

    # Replace URLs with special token
    text = re.sub(r'http\S+|www\S+|https\S+', ' URL ', text)

    # Replace numbers with special token
    text = re.sub(r'\b\d+(\.\d+)?\b', ' NUMBER ', text)

    # Replace punctuation with special token
    punct_pattern = f"[{re.escape(string.punctuation)}]"
    text = re.sub(punct_pattern, ' PUNCT ', text)

    # Tokenize (split by spaces)
    tokens = text.split()

    return " ".join(tokens)

In [4]:
df["Preprocessed"] = df["Sentence"].apply(preprocess)
print(df[["Sentence", "Preprocessed", "Label"]])

                                           Sentence  \
0      Check out https://example.com for more info!   
1       Order 3 items, get 1 free! Limited offer!!!   
2         Your package #12345 will arrive tomorrow.   
3         Win $1000 now, visit http://winbig.com!!!   
4  Meeting at 3pm, don't forget to bring the files.   
5      Exclusive deal for you: buy 2, get 1 free!!!   
6     Download the report from https://reports.com.   
7            The meeting is starting in 10 minutes.   
8     Reminder: submit your timesheet by 5pm today.   

                                        Preprocessed     Label  
0                  check out URL for more info PUNCT    Inform  
1  order NUMBER items PUNCT get NUMBER free PUNCT...     Promo  
2  your package PUNCT NUMBER will arrive tomorrow...    Inform  
3               win PUNCT NUMBER now PUNCT visit URL     Promo  
4  meeting at 3pm PUNCT don PUNCT t forget to bri...  Reminder  
5  exclusive deal for you PUNCT buy NUMBER PUNCT ...     Pr

In [5]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df["Preprocessed"])

tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out()
)

print("\nTF-IDF Feature Matrix:\n")
print(tfidf_df.round(3))


TF-IDF Feature Matrix:

     3pm    5pm  arrive     at  bring    buy     by  check   deal    don  ...  \
0  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.429  0.000  0.000  ...   
1  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  ...   
2  0.000  0.000   0.416  0.000  0.000  0.000  0.000  0.000  0.000  0.000  ...   
3  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  ...   
4  0.318  0.000   0.000  0.318  0.318  0.000  0.000  0.000  0.000  0.318  ...   
5  0.000  0.000   0.000  0.000  0.000  0.289  0.000  0.000  0.289  0.000  ...   
6  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  ...   
7  0.000  0.000   0.000  0.000  0.000  0.000  0.000  0.000  0.000  0.000  ...   
8  0.000  0.367   0.000  0.000  0.000  0.000  0.367  0.000  0.000  0.000  ...   

   timesheet     to  today  tomorrow    url  visit   will    win    you   your  
0      0.000  0.000  0.000     0.000  0.315  0.000  0.000  0.000  0.000  0.000  
1 

In [14]:
def extract_features(text):
    return {
        "has_url": int("URL" in text),
        "has_number": int("NUMBER" in text),
        "has_punct": int("PUNCT" in text)
    }

binary_feats = df["Preprocessed"].apply(extract_features)
binary_df = pd.DataFrame(list(binary_feats))
df = df.loc[:, ~df.columns.duplicated()]
df = pd.concat([df, binary_df], axis=1)
df = df.loc[:, ~df.columns.duplicated()]
df.head()

Unnamed: 0,Sentence,Label,Preprocessed,has_url,has_number,has_punct
0,Check out https://example.com for more info!,Inform,check out URL for more info PUNCT,0,0,0
1,"Order 3 items, get 1 free! Limited offer!!!",Promo,order NUMBER items PUNCT get NUMBER free PUNCT...,0,0,0
2,Your package #12345 will arrive tomorrow.,Inform,your package PUNCT NUMBER will arrive tomorrow...,0,0,0
3,"Win $1000 now, visit http://winbig.com!!!",Promo,win PUNCT NUMBER now PUNCT visit URL,0,0,0
4,"Meeting at 3pm, don't forget to bring the files.",Reminder,meeting at 3pm PUNCT don PUNCT t forget to bri...,0,0,0


In [17]:
def get_bigrams(tokens):
    return [(tokens[i], tokens[i+1]) for i in range(len(tokens)-1)]

# Build bigram counts per label
bigram_counts = defaultdict(Counter)
unigram_counts = defaultdict(Counter)
labels = df["Label"].unique()

In [18]:
for _, row in df.iterrows():
    tokens = row["Preprocessed"].split()
    label = row["Label"]
    for bg in get_bigrams(tokens):
        bigram_counts[label][bg] += 1
    for token in tokens:
        unigram_counts[label][token] += 1

In [19]:
vocab = set(token for label in labels for token in unigram_counts[label])
V = len(vocab)
K = 0.3

In [20]:
test_sentence = "You will get an exclusive offer in the meeting!"
test_prep = preprocess(test_sentence)
tokens = test_prep.split()
print("Preprocessed Test:", tokens)


Preprocessed Test: ['you', 'will', 'get', 'an', 'exclusive', 'offer', 'in', 'the', 'meeting', 'PUNCT']


In [25]:
def bigram_backoff_prob(tokens, label, K=0.3):
    """
    Compute bigram probability with unigram backoff.
    """
    prob = 1.0
    V = len(vocab)
    
    for i in range(len(tokens)):
        if i == 0:
            # First token: use unigram probability
            count_uni = unigram_counts[label][tokens[i]]
            total_uni = sum(unigram_counts[label].values())
            p = (count_uni + K) / (total_uni + K * V)
        else:
            bigram = (tokens[i-1], tokens[i])
            count_bg = bigram_counts[label][bigram]
            count_prev = unigram_counts[label][tokens[i-1]]
            
            if count_bg > 0:
                # Seen bigram
                p = (count_bg + K) / (count_prev + K * V)
            else:
                # Backoff to unigram
                count_uni = unigram_counts[label][tokens[i]]
                total_uni = sum(unigram_counts[label].values())
                p = (count_uni + K) / (total_uni + K * V)
                
        prob *= p
    return prob


In [26]:
label_priors = df["Label"].value_counts(normalize=True).to_dict()

In [27]:
def predict_label_backoff(tokens):
    scores = {}
    for label in labels:
        score = np.log(label_priors[label])
        score += np.log(bigram_backoff_prob(tokens, label))
        scores[label] = score
    return max(scores, key=scores.get), scores

In [None]:
test_sentence = "You will get an exclusive offer in the meeting!"
test_prep = preprocess(test_sentence)
tokens = test_prep.split()

In [29]:
pred_label, label_scores = predict_label_backoff(tokens)
print("Preprocessed Test:", tokens)
print("Predicted Label:", pred_label)
print("Label Scores:", label_scores)

Preprocessed Test: ['you', 'will', 'get', 'an', 'exclusive', 'offer', 'in', 'the', 'meeting', 'PUNCT']
Predicted Label: Promo
Label Scores: {'Inform': np.float64(-43.27516779107038), 'Promo': np.float64(-41.84566543246728), 'Reminder': np.float64(-42.13943206335457)}
