In [1]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Widen the displayed column width so full text messages are visible in previews.
# The fully qualified option name is used instead of an abbreviated pattern, which
# pandas resolves by matching and which can break if a similarly named option appears.
pd.set_option('display.max_colwidth', 200)

# Import Gradio
import gradio as gr

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def sms_classification(sms_text_df):
    """
    Train an SMS classifier using a TF-IDF + Linear Support Vector Classification pipeline.

    Parameters:
    - sms_text_df (pd.DataFrame): DataFrame containing a 'text_message' column (the
      message text) and a 'label' column (the class of each message).

    Returns:
    - text_clf (Pipeline): Pipeline fitted on the training portion of the data.

    Raises:
    - KeyError: If 'text_message' or 'label' is missing from sms_text_df.

    The data is split 67% / 33% with a fixed seed and only the training portion is
    used to fit the pipeline; the held-out portion is not evaluated here.
    The fitted pipeline is returned to make future predictions.
    """
    # Fail early with a message naming every missing column, instead of a bare
    # KeyError on whichever column happens to be accessed first.
    required_columns = ('text_message', 'label')
    missing_columns = [col for col in required_columns if col not in sms_text_df]
    if missing_columns:
        raise KeyError(f"sms_text_df is missing required column(s): {missing_columns}")

    # Features are the raw message text; the target is the "label" column.
    X = sms_text_df['text_message']
    y = sms_text_df['label']

    # Split data into training and testing; test_size = 33%. Only the training
    # portion is needed in this function, so the held-out parts are discarded.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, random_state=42)

    # Build the pipeline: TF-IDF turns raw text into numeric features (dropping
    # English stop words), then a linear SVM classifies those features.
    # random_state is fixed so repeated fits on the same data are reproducible.
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', LinearSVC(random_state=42)),
    ])

    # Fit the vectorizer and classifier together on the training data.
    text_clf.fit(X_train, y_train)

    return text_clf

In [3]:
# Read the SMS spam collection CSV into a DataFrame.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV was
# saved together with its index; consider index_col=0 if that column is not needed.
sms_text_df = pd.read_csv(filepath_or_buffer='Resources/SMSSpamCollection.csv')

# Preview the first rows to confirm the expected columns are present.
sms_text_df.head()

Unnamed: 0,label,text_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [4]:
# Train the classifier on the loaded DataFrame.
# sms_classification returns a fitted pipeline; it is stored in the module-level
# "text_clf" variable, which sms_prediction reads when classifying new messages.
text_clf = sms_classification(sms_text_df)


In [5]:
# Create a function called `sms_prediction` that takes in the SMS text and predicts whether the text is "not spam" or "spam".
# The function should return the SMS message and say whether the text is "not spam" or "spam".
def sms_prediction(text):
    """
    Predict the spam/ham classification of a given text message.

    Parameters:
    - text (str): The text message to be classified.

    Returns:
    - str: A message indicating whether the text message is classified as spam or not,
      or a prompt asking for input when the message is missing or blank.

    This function does not receive the model as an argument: it reads the fitted
    pipeline from the module-level `text_clf` variable, so that variable must be
    defined before this function is called.
    """
    # Guard against missing or whitespace-only input (for example, submitting an
    # empty textbox) rather than passing it to the vectorizer.
    if text is None or not str(text).strip():
        return 'No text message was provided. Please enter a message to classify.'

    # predict() works on batches, so the single message is wrapped in a list and
    # the first (only) element of the result is the predicted label.
    predicted_label = text_clf.predict([text])[0]

    # Any label other than 'ham' is reported as spam.
    verdict = 'is not spam' if predicted_label == 'ham' else 'is spam'
    return f'The text message: "{text}", {verdict}.'
    

In [6]:
# Hand-written sample messages for manually testing the classifier's responses.
# Each list groups messages by the label the author expects them to receive.

# Messages written to look like spam (prizes, account warnings, delivery and offer lures).
spam_texts = [
'Congratulations! Youve won a $1,000 gift card. Click here to claim your prize www.urldays.com',
'URGENT: Your NetFlix account has been suspended. Verify your information here: www.netflix.corm/useracct',
'You have a package waiting for you. Confirm delivery details: www.uspschecker.org',
'Get a free iPhone now! Limited time offer, click here www.shorturl.com',
'You’ve been selected for a special offer. Reply with your bank details to claim.']

# Ordinary conversational messages that should not be flagged as spam.
non_spam_texts = [
'Hey just checking in. How you guys doing?',
'Dont forget about the meeting tomorrow at 10 AM.',
'Happy Birthday! Hope you have a fantastic day!',
'Can you grab some groceries on ur way home?',
'Sup! Let’s grab lunch this weekend. U free Saturday?']

# Non-spam messages written with slang, abbreviations, and emoji.
non_spam_genz_texts = [
'omg kings of leon show was lit. cant believe we saw them live. 🔥',
'Hey fam, wanna grab some brunch this weekend? lmk if ur down. 🥞',
'finished my workout.  feelin swole rn. no cap!',
'becoming ann on Netflix is a total vibe- Binge-watched whole thing lass night 😂',
'u hear latest song from WKND? so on point, yo']

# A shorter pair of spam-style messages that use a "[Link]" placeholder instead of a URL.
small_spam_texts_sample = [
    "Congratulations! You've won a $1,000 gift card. Click here to claim your prize: [Link]",
    "URGENT: Your account has been suspended. Verify your information here: [Link]"
]

In [7]:
# Run every slang-style non-spam sample through the classifier and print each result.
for prediction in map(sms_prediction, non_spam_genz_texts):
    print(prediction)

The text message: "omg kings of leon show was lit. cant believe we saw them live. 🔥", is not spam.
The text message: "Hey fam, wanna grab some brunch this weekend? lmk if ur down. 🥞", is not spam.
The text message: "finished my workout.  feelin swole rn. no cap!", is not spam.
The text message: "becoming ann on Netflix is a total vibe- Binge-watched whole thing lass night 😂", is not spam.
The text message: "u hear latest song from WKND? so on point, yo", is not spam.


In [11]:
# Build the SMS spam detector app: a labelled textbox for the message to classify
# and a labelled textbox that shows the result returned by sms_prediction.

# Input component: a two-line textbox with a placeholder hint.
message_box = gr.Textbox(
    lines=2,
    label="Input Text Message:",
    placeholder="Enter your text message here...",
)

# Output component: a textbox that displays the prediction string.
result_box = gr.Textbox(label="Prediction Result:")

# Wire the components to the prediction function.
iface = gr.Interface(
    fn=sms_prediction,
    inputs=message_box,
    outputs=result_box,
    title="SMS Spam Detector",
    description="Input text message content and press 'Submit' to get a prediction on whether or not it is Spam:",
)

# Launch the app; share=True also requests a temporary public URL alongside the local one.
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7863
Running on public URL: https://8c1e24b3f196a3e570.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




## Test the following text messages. 

---

1. You are a lucky winner of $5000!
2. You won 2 free tickets to the Super Bowl.
3. You won 2 free tickets to the Super Bowl text us to claim your prize.
4. Thanks for registering. Text 4343 to receive free updates on medicare.