In [None]:
import kaggle
import pandas as pd
pd.set_option('display.max_colwidth', None)


import matplotlib.pyplot as plt
import seaborn as sns
import re


import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords,wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [None]:
# NLTK resources fetched up front for the tokenizer, stop-word list,
# WordNet lemmatizer and POS tagger imported above.
downloads = [
    'punkt', 'punkt_tab',
    'stopwords',
    'wordnet', 'omw-1.4',
    'averaged_perceptron_tagger_eng',
]
for resource_name in downloads:
    nltk.download(resource_name)

In [None]:
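# One-time dataset download into ./data (calls the Kaggle API, so credentials are presumably required).
# Left commented out so the notebook can run offline; uncomment for the first run.
# kaggle.api.authenticate()
# kaggle.api.dataset_download_files('team-ai/spam-text-message-classification',unzip=True,path='./data')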

In [None]:
# Load the raw spam/ham messages from the CSV under ./data.
csv_path = './data/SPAM text message 20170820 - Data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the loaded frame as this cell's output.
df

In [None]:
# Print pandas' summary of the frame via DataFrame.info().
df.info()

In [None]:
# Number of messages per class label.
df['Category'].value_counts()

In [None]:
# Bar chart of messages per class; each bar is labelled with its share of the total.
counts = df['Category'].value_counts()
total = counts.sum()

ax = sns.barplot(x=counts.index, y=counts.values)
for bar_position, bar_height in enumerate(counts.values):
    share = bar_height / total
    # Anchor the percentage label at the top centre of the bar.
    ax.text(bar_position, bar_height, f"{share:.1%}", ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
# Encode the target column as integers: ham -> 0, spam -> 1.
# Series.map turns every value missing from the mapping into NaN, so re-running
# this cell on the already-encoded column would silently wipe all labels.
# Only map while the original string labels are still present, which keeps the
# cell safe to re-run.
label_codes = {'ham': 0, 'spam': 1}
if df['Category'].isin(list(label_codes)).any():
    df['Category'] = df['Category'].map(label_codes)


In [None]:
# Check whether any message contains an emoji before cleaning: clean_text strips
# all non-ASCII characters, so emoji would be lost as a potential spam signal.
emoji_ranges = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map
    "\U0001F700-\U0001F77F",  # alchemical symbols
    "\U0001F780-\U0001F7FF",  # geometric shapes extended
    "\U0001F800-\U0001F8FF",  # supplemental arrows-C
    "\U0001F900-\U0001F9FF",  # supplemental symbols & pictographs
    "\U0001FA00-\U0001FAFF",  # chess symbols, symbols & pictographs extended-A
)
# One character class covering every range; matches a run of one or more emoji.
emoji_pattern = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)


def _has_emoji(message):
    """Return True when the message, coerced to str, contains an emoji."""
    return bool(emoji_pattern.search(str(message)))


# Count of messages with (True) and without (False) an emoji.
df["Message"].apply(_has_emoji).value_counts()


In [None]:
# No emoji are present in the messages.

In [None]:
# Text normalisation applied to every message before tokenization.
def clean_text(text):
    """Normalise a raw message into lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase, strip HTML-like tags, drop digits, drop
    punctuation (including underscores), drop non-ASCII characters such as
    emoji, then collapse whitespace.

    Parameters
    ----------
    text : object
        The raw message. ``None`` and float NaN (missing values) yield an
        empty string; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    # Missing values would crash on .lower(); treat them as empty messages.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)          # HTML-like tags
    text = re.sub(r'\d+', '', text)            # numbers
    # \w matches '_', so it has to be removed explicitly with the punctuation.
    text = re.sub(r'[^\w\s]|_', '', text)      # punctuation
    text = re.sub(r'[^\x00-\x7F]+', '', text)  # non-ASCII characters (emoji, accents)
    text = re.sub(r'\s+', ' ', text).strip()   # collapse runs of whitespace
    return text

In [None]:
# Store the normalised message in a new 'text' column, then remove the raw
# 'Message' column from the frame in place.
cleaned_messages = df['Message'].map(clean_text)
df['text'] = cleaned_messages
del df['Message']

In [None]:
# Tokenization: replace each cleaned string with its list of word tokens.
df['text'] = df['text'].map(word_tokenize)

In [None]:
# English stop words as a set for constant-time membership checks.
stop_words = set(stopwords.words('english'))


def _without_stop_words(tokens):
    """Return the tokens whose lowercase form is not an English stop word."""
    kept = []
    for token in tokens:
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept


df['text'] = df['text'].apply(_without_stop_words)
df

In [None]:

# Single WordNetLemmatizer instance, reused for every token by lemmatize_with_pos.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    The first letter of the tag decides the result: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Every other tag, including nouns ('N')
    and the empty string, maps to noun.
    """
    # Table of attribute names so only the selected constant is read from wordnet.
    pos_attr_by_initial = {'J': 'ADJ', 'V': 'VERB', 'R': 'ADV'}
    attr_name = pos_attr_by_initial.get(tag[:1], 'NOUN')
    return getattr(wordnet, attr_name)

def lemmatize_with_pos(tokens):
    """Lemmatize each token using the part of speech assigned by pos_tag.

    Returns a new list of lemmas in the same order as the input tokens.
    """
    lemmas = []
    for token, tag in pos_tag(tokens):
        wordnet_pos = get_wordnet_pos(tag)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return lemmas

# Replace every token list with its POS-aware lemmas, then show the frame.
df['text'] = df['text'].map(lemmatize_with_pos)
df


In [None]:
# Turn each token list back into one space-separated string, then show the frame.
df['text'] = df['text'].apply(' '.join)
df