<a href="https://colab.research.google.com/github/thanujamaddika/NLP/blob/main/AD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Importing necessary libraries from the NLTK toolkit
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize  # For tokenizing text into words and sentences

# Importing stopwords from NLTK to remove common words that add little value
from nltk.corpus import stopwords

# Downloading required NLTK datasets
nltk.download('punkt')  # Tokenizer models for sentence and word tokenization
nltk.download('punkt_tab')  # Optional: Extra support for tokenization
nltk.download('stopwords')  # Predefined stopword lists for various languages

# Downloading the dataset from Kaggle using Kaggle CLI
!kaggle datasets download -d abdallahwagih/spam-emails  # Dataset containing spam emails
!unzip spam-emails.zip  # Extracting the downloaded dataset

# Importing pandas for working with data in tabular format
import pandas as pd

# Loading the CSV dataset into a DataFrame
df = pd.read_csv("spam.csv")  # CSV contains columns like 'Message' and labels indicating spam or not

# Step to clean the text data:
# - Removing punctuation, special characters, and multiple spaces
# - Preparing data for tokenization and further text processing

import re  # Regular expressions for text cleaning

cleaned = []  # List to store cleaned text
for text in df['Message']:  # Looping through each message in the 'Message' column
    cleaned_text = re.sub(r'[^\w\s]', '', text)  # Removing all characters except words and spaces
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replacing multiple spaces with a single space
    cleaned_data = cleaned_text.strip()  # Stripping leading and trailing whitespace
    cleaned.append(cleaned_data)  # Adding the cleaned text to the list

# Tokenizing the cleaned text into words
# This step splits each cleaned text into a list of words
tokens = [word_tokenize(x) for x in cleaned]

# Removing stopwords from tokenized words
# Stopwords are commonly used words like "is", "the", "and", etc., which are removed to reduce noise
stop = set(stopwords.words('english'))  # Fetching the list of English stopwords
stpktn = []  # List to store stopword-removed tokens
for k in range(len(df['Message'])):  # Loop through the tokenized text
    p = [i for i in tokens[k] if i not in stop]  # Filter out tokens that are in the stopword list
    stpktn.append(p)  # Append the filtered tokens to the list

# Summary of steps:
# 1. Dataset is downloaded and loaded into a pandas DataFrame.
# 2. Text messages are cleaned by removing punctuation, special characters, and extra spaces.
# 3. The cleaned text is tokenized into words.
# 4. Stopwords are removed to focus on meaningful words.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Dataset URL: https://www.kaggle.com/datasets/abdallahwagih/spam-emails
License(s): apache-2.0
Downloading spam-emails.zip to /content
  0% 0.00/207k [00:00<?, ?B/s]
100% 207k/207k [00:00<00:00, 51.8MB/s]
Archive:  spam-emails.zip
  inflating: spam.csv                


In [None]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [None]:
ps.stem('studied')

'studi'

In [None]:
[ps.stem(word) for word in stpktn[0]]

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

In [None]:
stemed_data=[]
for message in stpktn:
    stm=[ps.stem(word) for word in message]
    stemed_data.append(stm)

In [None]:
stemed_data

[['go',
  'jurong',
  'point',
  'crazi',
  'avail',
  'bugi',
  'n',
  'great',
  'world',
  'la',
  'e',
  'buffet',
  'cine',
  'got',
  'amor',
  'wat'],
 ['ok', 'lar', 'joke', 'wif', 'u', 'oni'],
 ['free',
  'entri',
  '2',
  'wkli',
  'comp',
  'win',
  'fa',
  'cup',
  'final',
  'tkt',
  '21st',
  'may',
  '2005',
  'text',
  'fa',
  '87121',
  'receiv',
  'entri',
  'questionstd',
  'txt',
  'ratetc',
  'appli',
  '08452810075over18'],
 ['u', 'dun', 'say', 'earli', 'hor', 'u', 'c', 'alreadi', 'say'],
 ['nah', 'i', 'dont', 'think', 'goe', 'usf', 'live', 'around', 'though'],
 ['freemsg',
  'hey',
  'darl',
  '3',
  'week',
  'word',
  'back',
  'id',
  'like',
  'fun',
  'still',
  'tb',
  'ok',
  'xxx',
  'std',
  'chg',
  'send',
  '150',
  'rcv'],
 ['even',
  'brother',
  'like',
  'speak',
  'they',
  'treat',
  'like',
  'aid',
  'patent'],
 ['as',
  'per',
  'request',
  'mell',
  'mell',
  'oru',
  'minnaminungint',
  'nurungu',
  'vettam',
  'set',
  'callertun',
  'call

In [None]:
#lets apply pos_tags
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [None]:
pos_tag(stemed_data[0])

[('go', 'VB'),
 ('jurong', 'JJ'),
 ('point', 'NN'),
 ('crazi', 'NN'),
 ('avail', 'NN'),
 ('bugi', 'NN'),
 ('n', 'RB'),
 ('great', 'JJ'),
 ('world', 'NN'),
 ('la', 'NN'),
 ('e', 'VBP'),
 ('buffet', 'JJ'),
 ('cine', 'NN'),
 ('got', 'VBD'),
 ('amor', 'JJ'),
 ('wat', 'NN')]

In [None]:
pos_token = [pos_tag(message) for message in stemed_data]

In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
lm=WordNetLemmatizer()

In [None]:
lm.lemmatize("studied", 'v')

'study'

In [None]:
pos_token[0]

[('go', 'VB'),
 ('jurong', 'JJ'),
 ('point', 'NN'),
 ('crazi', 'NN'),
 ('avail', 'NN'),
 ('bugi', 'NN'),
 ('n', 'RB'),
 ('great', 'JJ'),
 ('world', 'NN'),
 ('la', 'NN'),
 ('e', 'VBP'),
 ('buffet', 'JJ'),
 ('cine', 'NN'),
 ('got', 'VBD'),
 ('amor', 'JJ'),
 ('wat', 'NN')]

In [None]:
# Function to convert nltk's POS tags to WordNet's POS tags
#import wordnet
from nltk.corpus import wordnet
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ  # Adjective
    elif tag.startswith('V'):
        return wordnet.VERB  # Verb
    elif tag.startswith('N'):
        return wordnet.NOUN  # Noun
    elif tag.startswith('R'):
        return wordnet.ADV  # Adverb
    else:
        return wordnet.NOUN  # Default to noun if

In [None]:
pos_token[0][0][1]

'VB'

In [None]:
get_wordnet_pos('VB')


'v'

In [None]:
get_wordnet_pos(pos_token[0][0][1])

'v'

In [None]:
lm.lemmatize(pos_token[0][0][0],get_wordnet_pos(pos_token[0][0][1]))

'go'

In [None]:
pos_token[3][2][0]

'say'

In [None]:
lemed_data=[]
for i in range(len(pos_token)):
  lem=[lm.lemmatize(word[0],get_wordnet_pos(word[1])) for word in pos_token[i]]
  lemed_data.append(lem)

In [None]:
lemed_data=[]
for message in pos_token:
    lem=[]
    for word in message:
        lem.append(lm.lemmatize(word[0],get_wordnet_pos(word[1])))
    lemed_data.append(lem)

In [None]:
#applying countvectorization
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()

In [None]:
stemed_data[0]

['go',
 'jurong',
 'point',
 'crazi',
 'avail',
 'bugi',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amor',
 'wat']

In [None]:
[' '.join(message) for message in stemed_data]

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah i dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chg send 150 rcv',
 'even brother like speak they treat like aid patent',
 'as per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner as valu network custom select receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour',
 'had mobil 11 month u r entitl updat latest colour mobil camera free call the mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash from 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc ap

In [None]:
stem_vec=[' '.join(message) for message in stemed_data]

In [None]:
stem_vec

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri 2 wkli comp win fa cup final tkt 21st may 2005 text fa 87121 receiv entri questionstd txt ratetc appli 08452810075over18',
 'u dun say earli hor u c alreadi say',
 'nah i dont think goe usf live around though',
 'freemsg hey darl 3 week word back id like fun still tb ok xxx std chg send 150 rcv',
 'even brother like speak they treat like aid patent',
 'as per request mell mell oru minnaminungint nurungu vettam set callertun caller press 9 copi friend callertun',
 'winner as valu network custom select receivea 900 prize reward to claim call 09061701461 claim code kl341 valid 12 hour',
 'had mobil 11 month u r entitl updat latest colour mobil camera free call the mobil updat co free 08002986030',
 'im gon na home soon dont want talk stuff anymor tonight k ive cri enough today',
 'six chanc win cash from 100 20000 pound txt csh11 send 87575 cost 150pday 6day 16 tsandc ap

In [None]:
cv.fit_transform(stem_vec)

<5572x8153 sparse matrix of type '<class 'numpy.int64'>'
	with 51383 stored elements in Compressed Sparse Row format>

In [None]:
x_vec = cv.fit_transform(stem_vec).toarray()

In [None]:
x_vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
x_vec[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
len(x_vec[0])

8153

In [None]:
#import multinominalnb
from sklearn.naive_bayes import MultinomialNB


In [None]:
# Assuming your target variable column is named 'Category' or 'v1' based on your CSV
y = df['Category']  # Or y = df['v1'] if that's the correct column name

In [None]:
y

Unnamed: 0,Category
0,ham
1,ham
2,spam
3,ham
4,ham
...,...
5567,spam
5568,ham
5569,ham
5570,ham


In [None]:
mb = MultinomialNB()

In [None]:
#data fitted in ml
mb.fit(x_vec,y)


In [None]:
x_vec[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
df['Message'][0]

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [None]:
mb.predict([x_vec[0]])

array(['ham'], dtype='<U4')

In [None]:
#1-DO TRAIN TEST &SPLIT
#create a logistic regression model
a=mb.predict(x_vec)

In [None]:
# 1. Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 2. Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(x_vec, y, test_size=0.25, random_state=0)

# 3. Create and train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# 4. Make predictions on the test set
predictions = model.predict(X_test)

# 5. (Optional) Evaluate the model
# Import metrics for evaluation (e.g., accuracy, precision, recall)
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, pos_label='spam') # Assuming 'spam' is your positive class
recall = recall_score(y_test, predictions, pos_label='spam')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

Accuracy: 0.9834888729361091
Precision: 0.9879518072289156
Recall: 0.8864864864864865


In [None]:
lrg = LogisticRegression()

In [None]:
lrg

In [None]:
# Fit the logistic regression model before calculating the score
lrg.fit(x_vec, y)  # This line is added to train the model

# Now you can calculate the score
lrg.score(x_vec, y)

0.9956927494615937

In [None]:
lrg.score(X_test,y_test)

0.9971284996410624