In [None]:
from IPython.display import display_html

# Inject a CSS rule that sets `.container` elements to 100% width.
# raw=True is required: without it display_html() treats the string as a Python
# object and looks for its HTML representation; a plain str has none, so nothing
# would be emitted and the style would never be applied.
display_html("<style>.container { width:100% !important; }</style>", raw=True)

# Lab | Natural Language Processing
### SMS: SPAM or HAM

### Let's prepare the environment

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

- Read Data for the Fraudulent Email Kaggle Challenge
- Reduce the training set to speed up development.

In [None]:
## Load the train and test CSV files of the Fraudulent Email Kaggle Challenge
# Both files are decoded as latin-1.
data_train = pd.read_csv("data/kg_train.csv", encoding='latin-1')
data_test = pd.read_csv("data/kg_test.csv", encoding='latin-1')

# No down-sampling is applied here; this is the place to subsample the
# training set for faster iteration (and to undo it for the final system).
print(data_train.shape)
# Replace missing values with empty strings (training frame only).
data_train = data_train.fillna("")

### Let's separate the training set into features (text) and labels

In [None]:
# Features: the raw message text. Target: the label column.
x_train = data_train["text"]
y_train = data_train["label"]

## Data Preprocessing

In [None]:
import string

from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Quick look at the building blocks used for cleaning: the ASCII punctuation
# characters and ten entries of NLTK's English stopword list.
# (presumably requires the NLTK "stopwords" corpus to be downloaded — verify)
print(string.punctuation)
print(stopwords.words("english")[100:110])

# Module-level English Snowball stemmer instance.
snowball = SnowballStemmer('english')

## Now we have to clean the text by removing the HTML markup

- First we remove inline JavaScript/CSS
- Then we remove html comments. This has to be done before removing regular tags since comments can contain '>' characters
- Next we can remove the remaining tags

In [None]:
def clean_html_column(series):
    """Strip HTML markup from every entry of a text column.

    The removal happens in three passes, in this order:

    1. ``<script>`` / ``<style>`` elements together with their content.
    2. HTML comments (``<!-- ... -->``), including comments that span
       several lines.
    3. Every remaining ``<...>`` tag (the text between tags is kept).

    Args:
        series (pd.Series): Column holding the raw text. Values are converted
            to ``str`` first, so non-string entries are processed as their
            string representation.

    Returns:
        pd.Series: A new Series with the markup removed; ``series`` itself is
        not modified.
    """
    import re

    # astype(str) already returns a new object, so the input is never mutated.
    cleaned_series = series.astype(str)

    # 1. Inline JavaScript/CSS. DOTALL lets `.` cross line breaks inside the
    #    element body; IGNORECASE also covers <SCRIPT>/<Style> spellings.
    js_css_pattern = r'<(script|style)\b[^>]*>.*?<\/\1>'
    cleaned_series = cleaned_series.str.replace(
        js_css_pattern, '', regex=True, flags=re.IGNORECASE | re.DOTALL
    )

    # 2. HTML comments. This must run before the generic tag removal because a
    #    comment may contain '>' characters. DOTALL is needed here as well:
    #    without it a multi-line comment is not matched, and the tag pattern
    #    below would cut it at its first '>' and leak the rest into the text.
    comment_pattern = r'<!--.*?-->'
    cleaned_series = cleaned_series.str.replace(
        comment_pattern, '', regex=True, flags=re.DOTALL
    )

    # 3. Any remaining tag: '<', one or more non-'>' characters, then '>'.
    tag_pattern = r'<[^>]+>'
    cleaned_series = cleaned_series.str.replace(tag_pattern, '', regex=True)

    return cleaned_series

# Rebind x_train to the HTML-stripped copy returned by clean_html_column.
x_train = clean_html_column(x_train)

- Remove all the special characters
    
- Remove numbers
    
- Remove all single characters
 
- Remove single characters from the start

- Substitute multiple spaces with single space

- Remove prefixed 'b'

- Convert to Lowercase

In [None]:
def normalize_text_series(text_series: pd.Series) -> pd.Series:
    """
    Performs multiple text cleaning and normalization steps on a Pandas Series:
    1. Converts text to lowercase.
    2. Removes a leading bytes-literal prefix (``b'`` or ``b"``), an artifact
       of text that was serialized from a ``bytes`` object.
    3. Replaces special characters with spaces.
    4. Replaces numbers with spaces.
    5. Removes every stand-alone single letter, wherever it occurs
       (start, middle or end of the text).
    6. Trims the text and substitutes multiple spaces with a single space.

    The result contains only the letters a-z separated by single spaces.

    Args:
        text_series (pd.Series): The Series containing the text to normalize.
            Values are converted to ``str`` first.

    Returns:
        pd.Series: A new Series with the normalized text.
    """

    # Ensure text is string type for .str operations (returns a new Series).
    normalized_series = text_series.astype(str)

    # 1. Convert to lowercase
    normalized_series = normalized_series.str.lower()

    # 2. Remove the bytes-literal prefix (e.g. b'text').
    # The quote is mandatory: with an optional quote the pattern would strip the
    # first letter of every message that merely starts with "b" ("buy" -> "uy").
    normalized_series = normalized_series.str.replace(r'^\s*b[\'"]', '', regex=True)

    # 3. Replace all special characters with a space
    # (keeps only a-z, digits and whitespace)
    normalized_series = normalized_series.str.replace(r'[^a-z0-9\s]', ' ', regex=True)

    # 4. Replace runs of digits with a space
    normalized_series = normalized_series.str.replace(r'\d+', ' ', regex=True)

    # 5. Remove all single characters.
    # At this point the text holds only a-z and whitespace, so a letter between
    # two word boundaries is exactly a stand-alone single letter. Using \b does
    # not consume the neighbouring whitespace, so runs such as "a b c" are
    # removed completely, as are single letters at the start or end.
    normalized_series = normalized_series.str.replace(r'\b[a-z]\b', ' ', regex=True)

    # 6. Trim leading/trailing whitespace, then collapse whitespace runs
    normalized_series = normalized_series.str.strip()
    normalized_series = normalized_series.str.replace(r'\s+', ' ', regex=True)

    return normalized_series

# Rebind x_train to the normalized text returned by normalize_text_series.
x_train = normalize_text_series(x_train)

## Now let's work on removing stopwords
Remove the stopwords.

In [None]:
STOPWORDS = stopwords.words("english")
# Frozen set built from STOPWORDS for constant-time membership tests; scanning
# the list for every token of every message is needlessly slow.
# It is a snapshot: rebuild it if STOPWORDS is modified afterwards.
_STOPWORD_SET = frozenset(STOPWORDS)


def remove_stopwords(text):
    """Return `text` without its stopwords.

    The text is split on whitespace, tokens that appear in the NLTK English
    stopword list are dropped, and the remaining tokens are joined back with
    single spaces. Matching is exact and case-sensitive.

    Args:
        text (str): The message to filter.

    Returns:
        str: The filtered message (empty string if nothing remains).
    """
    return " ".join(word for word in text.split() if word not in _STOPWORD_SET)

# Filter the stopwords out of every message, one message at a time.
x_train = x_train.apply(remove_stopwords)

## Tame Your Text with Lemmatization
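Break sentences into words, then use lemmatization to reduce them to their base form (e.g., "running" becomes "run"). See how this creates cleaner data for analysis! Note: the cell below uses NLTK's Snowball stemmer, a rule-based approximation of lemmatization that strips suffixes rather than looking words up in a dictionary.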

In [None]:
# Built once and reused: constructing a stemmer for every message is
# loop-invariant work when the function is applied over a whole Series.
_STEMMER = SnowballStemmer("english")


def lemmatize(text):
    """Reduce every word of `text` to its stem.

    Despite the name, this applies the English Snowball *stemmer* to each
    whitespace-separated token; it does not perform dictionary-based
    lemmatization.

    Args:
        text (str): The message to process.

    Returns:
        str: The stemmed tokens joined with single spaces.
    """
    return " ".join(_STEMMER.stem(word) for word in text.split())

# Replace every token with its stem. Re-running this cell stems the
# already-stemmed text again, which may change it further.
x_train = x_train.apply(lemmatize)

## Bag Of Words
Let's get the 10 top words in ham and spam messages (**EXPLORATORY DATA ANALYSIS**)

In [None]:
from collections import Counter

# Split the preprocessed messages by label: 0 is treated as ham, 1 as spam.
ham = x_train[y_train==0]
spam = x_train[y_train==1]

# Report the ten most frequent tokens per class (spam first, then ham).
for message_type, messages in {"spam": spam, "ham": ham}.items():
    # Count tokens message by message instead of building one huge string.
    token_counts = Counter(
        token for message in messages for token in message.split()
    )
    print(f"In {message_type} messages the most common 10 words and they frequencies are")
    for word, freq in token_counts.most_common(10):
        print(f"{word:10s}\t{freq}")
    print()

## Extra features

In [None]:
# We add to the original dataframe two additional indicators (money symbols and suspicious words).
# Both names hold a regex alternation ("a|b|c") built by joining the terms with "|".
# NOTE(review): normalize_text_series replaces every character outside a-z, 0-9
# and whitespace with a space, so the "â‚¬" and "\$" alternatives cannot match
# the preprocessed text — confirm whether these flags should be computed on the raw text.
money_simbol_list = "|".join(["euro","dollar","pound","â‚¬",r"\$"])
# NOTE(review): the terms are unanchored substrings (e.g. "win" also matches inside
# longer words), and the text has been stemmed by this point, so literals such as
# "transaction" may not occur in their stemmed form — verify.
suspicious_words = "|".join(["free","cheap","sex","money","account","bank","fund","transfer","transaction","win","deposit","password"])

from sklearn.model_selection import train_test_split

# Name the Series and assemble the modelling frame: preprocessed text plus label.
x_train.name = "preprocessed_text"
data = pd.DataFrame({"preprocessed_text": x_train, "label": y_train})

In [None]:
# Hold out a validation partition (default 75/25 split).
# random_state makes the split reproducible across kernel restarts; stratify
# keeps the spam/ham ratio equal in both partitions.
# Note: this rebinds data_train (previously the raw CSV frame) and y_train.
data_train, data_val, y_train, y_val = train_test_split(
    data.drop(columns="label"),
    data.loc[:, "label"],
    stratify=data.loc[:, "label"],
    random_state=42,
)

In [None]:
def _add_extra_features(frame):
    """Add the hand-crafted columns to `frame` in place.

    Derived from the 'preprocessed_text' column:
    - money_mark: 1 if the text matches the money-term pattern, else 0
    - suspicious_words: 1 if the text matches the suspicious-word pattern, else 0
    - text_len: number of characters in the text
    """
    text = frame['preprocessed_text']
    frame['money_mark'] = text.str.contains(money_simbol_list)*1
    frame['suspicious_words'] = text.str.contains(suspicious_words)*1
    frame['text_len'] = text.apply(len)


# Same feature engineering for the training and the validation partition.
for split_frame in (data_train, data_val):
    _add_extra_features(split_frame)

data_train.head()

## How would the Bag of Words concept work with CountVectorizer?

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of Words: learn the vocabulary of the training messages and count, for
# every message, how often each vocabulary term occurs.
vectorizer = CountVectorizer()
words = vectorizer.fit_transform(data_train["preprocessed_text"])
print(vectorizer.get_feature_names_out())
# The document-term matrix is sparse. Densifying all of it just to print it
# allocates rows x vocabulary cells, so show its shape and only the first rows.
print(words.shape)
print(words[:5].toarray())

## TF-IDF

- Load the vectorizer

- Vectorize all dataset

- Print the shape of the vectorized dataset

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on every preprocessed message in `data` and build the
# document-term matrix in one step.
# NOTE(review): the vectorizer is fitted on all rows, including those that are
# held out when x_tfidf is split afterwards — fit on the training rows only if
# an unbiased evaluation is required.
tfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(data["preprocessed_text"])
print(x_tfidf.shape)

## And then train a classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Split the TF-IDF matrix and the labels into train/test partitions.
# Dedicated names are used on purpose: y_train already holds the labels aligned
# with data_train from the earlier train/validation split, and overwriting it
# here would silently pair those rows with the labels of a different split.
# random_state makes the split (and the reported scores) reproducible.
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(
    x_tfidf,
    data["label"],
    stratify=data["label"],
    random_state=42,
)

# Multinomial Naive Bayes with default parameters.
model = MultinomialNB()
model.fit(X_tfidf_train, y_tfidf_train)

y_pred = model.predict(X_tfidf_test)

print(f"Accuracy: {accuracy_score(y_tfidf_test, y_pred):.4f}")
print(f"Classification report:\n{classification_report(y_tfidf_test, y_pred)}")

### Extra Task - Implement a SPAM/HAM classifier

https://www.kaggle.com/t/b384e34013d54d238490103bc3c360ce

The classifier cannot be changed!!! It must be MultinomialNB with default parameters!

Your task is to **find the most relevant features**.

For example, you can test the following options and check which of them performs better:
- Using "Bag of Words" only
- Using "TF-IDF" only
- Bag of Words + extra flags (money_mark, suspicious_words, text_len)
- TF-IDF + extra flags


You can work in teams of two people (recommended).

In [None]:
# Your code