In [39]:
# Import pandas
import pandas as pd
# Import the required dependencies from sklearn
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Set the column width to view the text message data.
pd.set_option('max_colwidth', 200)

# Import Gradio
import gradio as gr

# Load dataset
df = pd.read_csv("SMSSpamCollection.csv", sep="\t", names=["label", "message"])

# Print first 5 rows
print(df.head())

                                                                                                                                                              label  \
0                                                                                                                                                label,text_message   
1                                             ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."   
2                                                                                                                                 ham,Ok lar... Joking wif u oni...   
3  spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's   
4                                                                                                             ham,U dun say so early hor... U c already then say...  

In [57]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

def sms_classification(sms_text_df):
    """
    Perform SMS classification using a pipeline with TF-IDF vectorization and Linear Support Vector Classification.

    Parameters:
    - sms_text_df (pd.DataFrame): DataFrame containing 'text_message' and 'label' columns for SMS classification.

    Returns:
    - text_clf (Pipeline): Fitted pipeline model for SMS classification.

    This function takes a DataFrame with 'text_message' and 'label' columns, splits the data into
    training and testing sets, builds a pipeline with TF-IDF vectorization and Linear Support Vector
    Classification, and fits the model to the training data. 
    The fitted pipeline is returned to make future predictions.
    """
    # Set the features variable to the text message column.
    features = sms_text_df["text_message"]
    
    # Set the target variable to the "label" column.
    target = sms_text_df["label"]

    # Split data into training and testing and set the test_size = 33%
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.33, random_state=42, stratify=target)

    # Build a pipeline to transform the test set to compare to the training set.
    text_clf = Pipeline([
        ("tfidf", TfidfVectorizer()),
        ("svc", LinearSVC(dual="auto"))
    ])

    # Fit the model to the transformed training data and return model.
    text_clf.fit(X_train, y_train)
    
    return text_clf

# Define file path
file_path = "SMSSpamCollection.csv"

# Check if file exists
if not os.path.exists(file_path):
    print(f"Error: File '{file_path}' not found. Please check the file path and ensure it is in the correct directory.")
else:
    # Load dataset correctly with proper column names
    df = pd.read_csv(file_path, encoding="utf-8")

    # Rename columns to match expected format
    df.columns = ["label", "text_message"]

    # Drop any rows with missing values
    df.dropna(inplace=True)

    # Convert labels to lowercase for consistency
    df["label"] = df["label"].str.lower()

    # Check unique label counts
    label_counts = df["label"].value_counts()
    print("Unique label counts:", label_counts)

    # Display first few rows to verify content
    print(df.head())

    # Ensure dataset contains both 'ham' and 'spam'
    if len(label_counts) < 2:
        print("Error: The dataset contains only one class. Please check if the dataset is loaded correctly.")
        print("Possible issues:")
        print("- Ensure 'SMSSpamCollection.csv' is in the correct format.")
        print("- The dataset should have both 'spam' and 'ham' labels.")
        print("- Check if the file is being read correctly with the right separator.")
    else:
        # Train model
        text_clf = sms_classification(df)

        # Print model pipeline
        print(text_clf)

Unique label counts: label
ham     4825
spam     747
Name: count, dtype: int64
  label  \
0   ham   
1   ham   
2  spam   
3   ham   
4   ham   

                                                                                                                                                  text_message  
0                                              Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...  
1                                                                                                                                Ok lar... Joking wif u oni...  
2  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's  
3                                                                                                            U dun say so early hor... U c already then say...  
4                                                

In [59]:
# Load the dataset into a DataFrame
df = pd.read_csv("SMSSpamCollection.csv", encoding="utf-8")

# Rename columns to match expected format
df.columns = ["label", "text_message"]

# Display first few rows to verify content
print(df.head())

  label  \
0   ham   
1   ham   
2  spam   
3   ham   
4   ham   

                                                                                                                                                  text_message  
0                                              Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...  
1                                                                                                                                Ok lar... Joking wif u oni...  
2  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's  
3                                                                                                            U dun say so early hor... U c already then say...  
4                                                                                                Nah I don't think he goes to us

In [61]:
# Call the sms_classification function with the DataFrame and set the result to the "text_clf" variable
if len(df["label"].value_counts()) < 2:
    print("Error: The dataset contains only one class. Cannot train the model.")
else:
    text_clf = sms_classification(df)
    print("Model training completed successfully.")
    print(text_clf)


Model training completed successfully.
Pipeline(steps=[('tfidf', TfidfVectorizer()), ('svc', LinearSVC(dual='auto'))])


In [55]:
# Create a function called `sms_prediction` that takes in the SMS text and predicts the whether the text is "not spam" or "spam". 
# The function should return the SMS message, and say whether the text is "not spam" or "spam".
def sms_prediction(text):
    """
    Predict the spam/ham classification of a given text message using a pre-trained model.

    Parameters:
    - text (str): The text message to be classified.

    Returns:
    - str: A message indicating whether the text message is classified as spam or not.

    This function takes a text message and a pre-trained pipeline model, then predicts the
    spam/ham classification of the text. The result is a message stating whether the text is
    classified as spam or not.
    """
    # Create a variable that will hold the prediction of a new text.
    
    # Using a conditional if the prediction is "ham" return the message:
    # f'The text message: "{text}", is not spam.' Else, return f'The text message: "{text}", is spam.'
    

Unique label counts: label
ham     4825
spam     747
Name: count, dtype: int64
  label  \
0   ham   
1   ham   
2  spam   
3   ham   
4   ham   

                                                                                                                                                  text_message  
0                                              Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...  
1                                                                                                                                Ok lar... Joking wif u oni...  
2  Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's  
3                                                                                                            U dun say so early hor... U c already then say...  
4                                                

In [63]:
import gradio as gr

# Define prediction function
def sms_prediction(text):
    prediction = text_clf.predict([text])[0]
    return f"The text message: '{text}' is classified as '{prediction}'."

# Create the Gradio interface
sms_app = gr.Interface(
    fn=sms_prediction,
    inputs=gr.Textbox(label="Enter SMS Text"),
    outputs=gr.Textbox(label="Prediction"),
    title="SMS Spam Detector",
    description="Enter a text message to check if it is classified as spam or ham."
)

# Launch the app
sms_app.launch()


* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




## Test the following text messages. 

---

1. You are a lucky winner of $5000!
2. You won 2 free tickets to the Super Bowl.
3. You won 2 free tickets to the Super Bowl text us to claim your prize.
4. Thanks for registering. Text 4343 to receive free updates on medicare.