In [90]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/email-classification-nlp/SMS_train.csv
/kaggle/input/email-classification-nlp/SMS_test.csv


In [91]:
import pandas as pd


df = pd.read_csv('/kaggle/input/email-classification-nlp/SMS_test.csv', encoding='latin-1')


print(df.head())

   S. No.                                       Message_body Label
0       1  UpgrdCentre Orange customer, you may now claim...  Spam
1       2  Loan for any purpose £500 - £75,000. Homeowner...  Spam
2       3  Congrats! Nokia 3650 video camera phone is you...  Spam
3       4  URGENT! Your Mobile number has been awarded wi...  Spam
4       5  Someone has contacted our dating service and e...  Spam


# **1.** LOWERCASING

In [92]:
df['Message_body']=df['Message_body'].str.lower()

# **2. REMOVE HTML TAGS**

In [93]:


import re

def remove_html_tags(text):
    """
    Removes HTML tags from a string using regular expressions.

    Args:
        text (str): The input string potentially containing HTML tags.

    Returns:
        str: The string with HTML tags removed.
    """
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

In [94]:
df['Message_body']=df['Message_body'].apply(remove_html_tags)

# **3. REMOVE URLS**

In [95]:
def remove_url(text):
    pattern=re.compile(r'https?://\S+|www\.S+')
    return pattern.sub(r'',text)

In [96]:
df['Message_body']=df['Message_body'].apply(remove_url)

In [97]:
print(df.head())

   S. No.                                       Message_body Label
0       1  upgrdcentre orange customer, you may now claim...  Spam
1       2  loan for any purpose £500 - £75,000. homeowner...  Spam
2       3  congrats! nokia 3650 video camera phone is you...  Spam
3       4  urgent! your mobile number has been awarded wi...  Spam
4       5  someone has contacted our dating service and e...  Spam


# **3. REMOVE CHAT-WORDS/SHORT HAND NOTATIONS**

In [98]:
def chat_conversion(text):
    """
    Converts common chat acronyms/abbreviations in a string to their full forms.

    Args:
        text (str): The input text containing potential chat acronyms.

    Returns:
        str: The text with chat acronyms converted.
    """
    # Define a dictionary of chat words and their full forms
    # You can expand this dictionary with more acronyms as needed
    chat_words = {
        'LOL': 'LAUGHING OUT LOUD',
        'BRB': 'BE RIGHT BACK',
        'IMHO': 'IN MY HUMBLE OPINION',
        'ATM': 'AT THE MOMENT',
        'BTW': 'BY THE WAY',
        'FYI': 'FOR YOUR INFORMATION',
        'GTG': 'GOT TO GO',
        'IDK': 'I DON\'T KNOW',
        'IMO': 'IN MY OPINION',
        'IRL': 'IN REAL LIFE',
        'J/K': 'JUST KIDDING',
        'LMAO': 'LAUGHING MY ASS OFF',
        'ROFL': 'ROLLING ON FLOOR LAUGHING',
        'SMH': 'SHAKING MY HEAD',
        'THX': 'THANKS',
        'TTYL': 'TALK TO YOU LATER',
        'OMG': 'OH MY GOD',
        'NVM': 'NEVERMIND',
        'FWIW': 'FOR WHAT IT\'S WORTH',
        'OOTD': 'OUTFIT OF THE DAY',
        'POV': 'POINT OF VIEW',
        'YOLO': 'YOU ONLY LIVE ONCE',
        # Add more chat words here!
    }

    new_text = [] # Initialize an empty list to store processed words
    for w in text.split(): # Split the input text into words
        if w.upper() in chat_words: # Check if the uppercase version of the word is in our dictionary
            new_text.append(chat_words[w.upper()]) # If yes, append its full form
        else:
            new_text.append(w) # If no, append the original word
            
    # Join the processed words back into a single string, separated by spaces
    return " ".join(new_text)

# Example of how to use the function:
input_sentence = "IMHO he is the best. LOL this is cool BTW."
converted_sentence = chat_conversion(input_sentence)
print(f"Original: {input_sentence}")
print(f"Converted: {converted_sentence}")

input_sentence_2 = "OMG, I'll BRB, just GTG grab something. FYI, I'm feeling SMH today."
converted_sentence_2 = chat_conversion(input_sentence_2)
print(f"Original: {input_sentence_2}")
print(f"Converted: {converted_sentence_2}")

Original: IMHO he is the best. LOL this is cool BTW.
Converted: IN MY HUMBLE OPINION he is the best. LAUGHING OUT LOUD this is cool BTW.
Original: OMG, I'll BRB, just GTG grab something. FYI, I'm feeling SMH today.
Converted: OMG, I'll BRB, just GOT TO GO grab something. FYI, I'm feeling SHAKING MY HEAD today.


# **4.SPELL CHECKING**

In [99]:
from textblob import TextBlob

In [100]:
def remove_txtblob(text):
    txtblb=TextBlob(text)
    return str(txtblb.correct())

In [101]:
df['Message_body']=df['Message_body'].apply(remove_txtblob)

In [102]:
print(df.head())

   S. No.                                       Message_body Label
0       1  upgrdcentre orange customer, you may now claim...  Spam
1       2  loan for any purpose £500 - £75,000. homeowner...  Spam
2       3  congress! nikita 3650 video camera phone is yo...  Spam
3       4  urgent! your mobile number has been awarded wi...  Spam
4       5  someone has contracted our dating service and ...  Spam


# **5. Remove Punctuations**

In [103]:
import re
import string

def remove_punctuations(text):
    """
    Removes punctuation from a string.

    Args:
        text (str): The input string from which to remove punctuations.

    Returns:
        str: The string with punctuations removed.
    """
   
    if not isinstance(text, str):
        return text 
   
    return re.sub(r'[^\w\s]', '', text)




In [104]:
df['Message_body']=df['Message_body'].apply(remove_punctuations)