Case folding is a technique that we use to reduce case insensivity, normalizing text and enhace consistency in the dataset

In [1]:
txt = "Hello, And Welcome to my world"
print(txt)

Hello, And Welcome to my world


In [2]:
x = txt.casefold()
print(x)

hello, and welcome to my world


casefold is mainly lowercasing the text for training

We need to remove the special characters like $,etc for noise elimation, improve text quality and better tokenization

In [1]:
import re
#input string
input_str = "Hello how are you$!!"

#clean the string
clean_str = re.sub(r"[^a-zA-Z0-9\s]","",input_str)

In [2]:
print(clean_str)

Hello how are you


## Use libraries:
- NLTK
- SpaCy

download the model

In [4]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [None]:
import spacy
#load model
nlp = spacy.load("en_core_web_sm")

#input str
input_str = "Hello how are you$!!"

def clean_text(text):
    cleaned_text = ''.join(char for char in text if char.isalpha() or char.isspace()) #filter out the unneeded character and space
    doc = nlp(cleaned_text)
    return ' '.join(token.text for token in doc) #add the spaces to the beginning text

clean_str = clean_text(input_str)
print(clean_str)


Hello how are you


In [17]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /Users/mymac/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
input_str = "Hello how are you$!!"
# Tokenize the input
tokens = nltk.word_tokenize(input_str)
# If the token is alpha or num then we use it
clean_tokens = [token for token in tokens if token.isalnum()]
# add spaces
clean_str = " ".join(clean_tokens)

print(clean_str)

Hello how are you


Handling contractions:

Ex: isn't -> is not

We can use the contraction library for handling this contraction

In [19]:
import contractions

txt = "I can't believe that I'm the winner."

expanded_txt = contractions.fix(txt)

print(expanded_txt)

I cannot believe that I am the winner.


In [None]:
import re

def expand_contractions(text):
    contractions_pattern = {
        r"(?i)can't": "cannot",
        r"(?i)isn't": "is not",   #(?i) is for the case-sensitive, which mean it works for both lowercase and uppercase
        r"(?i)aren't": "are not",
        r"(?i)won't": "will not",
        r"(?i)weren't": "were not",
        r"(?i)I'm": "I am"
    }
    for contraction, expansion in contractions_pattern.items():
        text = re.sub(contraction,expansion,text)
    return text

txt = "I can't believe that I'm the winner."
print(expand_contractions(txt))

I cannot believe that I am the winner.


# Tokenization

In [1]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
sample = "Lam is so handsome. He is gonna be a coder."
words = word_tokenize(sample)
print(words)

['Lam', 'is', 'so', 'handsome', '.', 'He', 'is', 'gon', 'na', 'be', 'a', 'coder', '.']


In [3]:
setences = sent_tokenize(sample)
print(setences)

['Lam is so handsome.', 'He is gonna be a coder.']


In [7]:
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/mymac/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
sentence = "This is a sample sentence, showing off my technical skills."

In [11]:
stop_words = set(stopwords.words('english'))
words = word_tokenize(sentence)
new_sentence = [word for word in words if word.lower() not in stop_words]
print(new_sentence)

['sample', 'sentence', ',', 'showing', 'technical', 'skills', '.']


In [None]:
from nltk import ngrams

In [13]:
def generate_tokens(text,n):
    tokens = word_tokenize(text)
    n_grams = list(ngrams(tokens,n))
    return n_grams

In [14]:
txt = "N-Grams is a high technique that you should learn in Artificial Intelligence."
unigrams = generate_tokens(txt,1)
bigrams = generate_tokens(txt,2)
trigrams = generate_tokens(txt,3)
print(unigrams)
print(bigrams)
print(trigrams)

[('N-Grams',), ('is',), ('a',), ('high',), ('technique',), ('that',), ('you',), ('should',), ('learn',), ('in',), ('Artificial',), ('Intelligence',), ('.',)]
[('N-Grams', 'is'), ('is', 'a'), ('a', 'high'), ('high', 'technique'), ('technique', 'that'), ('that', 'you'), ('you', 'should'), ('should', 'learn'), ('learn', 'in'), ('in', 'Artificial'), ('Artificial', 'Intelligence'), ('Intelligence', '.')]
[('N-Grams', 'is', 'a'), ('is', 'a', 'high'), ('a', 'high', 'technique'), ('high', 'technique', 'that'), ('technique', 'that', 'you'), ('that', 'you', 'should'), ('you', 'should', 'learn'), ('should', 'learn', 'in'), ('learn', 'in', 'Artificial'), ('in', 'Artificial', 'Intelligence'), ('Artificial', 'Intelligence', '.')]


# Vectorization

## Bag of words

In [1]:
import nltk
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('spam.csv')

In [3]:
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [6]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [10]:
data = pd.read_csv('spam.csv')

In [11]:
data['Category'].replace({'ham': 0, 'spam':1},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Category'].replace({'ham': 0, 'spam':1},inplace=True)
  data['Category'].replace({'ham': 0, 'spam':1},inplace=True)


In [12]:
data

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will ü b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [13]:
data.shape

(5572, 2)

In [14]:
df.shape

(5572, 3)

### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam,test_size = 0.2)

In [17]:
x_train.shape

(4457,)

In [18]:
x_test.shape

(1115,)

### Bag of words representation using CountVectorizer 

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
v = CountVectorizer()
x_train_cv = v.fit_transform(x_train.values)

In [22]:
x_train_cv

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 59111 stored elements and shape (4457, 7670)>

In [23]:
x_train_cv.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7670))

In [24]:
x_train_cv.shape

(4457, 7670)

In [25]:
x_train_np = x_train_cv.toarray()
x_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(4457, 7670))

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
model = MultinomialNB()
model.fit(x_train_cv,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [28]:
x_test_cv = v.transform(x_test)

In [29]:
y_pred = model.predict(x_test_cv)

In [30]:
from sklearn.metrics import classification_report

In [35]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       977
           1       0.98      0.90      0.94       138

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [36]:
message = {"Exclusive Offer. Meal to one"}
message_cnt = v.transform(message)

In [37]:
model.predict(message_cnt)

array([0])