In [1]:
import pandas as pd

In [2]:
# Absolute, machine-specific location of the spam dataset; adjust before running elsewhere.
DATA_PATH = r"C:\Users\vikas\OneDrive\Desktop\Machine Learning\NLP FOR MACHINE LEARNING\spam.csv"
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
messages = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

In [3]:
# Show the freshly loaded DataFrame as the cell output.
messages

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [4]:
# List the column labels of the loaded frame.
messages.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
# Drop the 'Unnamed: 2' column. Reassigning instead of using inplace=True, and
# passing errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
messages = messages.drop(columns=['Unnamed: 2'], errors='ignore')

In [6]:
# Drop the 'Unnamed: 3' column. Reassigning instead of using inplace=True, and
# passing errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
messages = messages.drop(columns=['Unnamed: 3'], errors='ignore')

In [7]:
# Drop the 'Unnamed: 4' column. Reassigning instead of using inplace=True, and
# passing errors='ignore', keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
messages = messages.drop(columns=['Unnamed: 4'], errors='ignore')

In [8]:
# Show the frame after the 'Unnamed: 2/3/4' columns have been dropped.
messages

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [9]:

# Give the two remaining columns descriptive names, then display the frame.
column_names = {'v1': 'label', 'v2': 'message'}
messages.rename(columns=column_names, inplace=True)
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### **Data Cleaning And Preprocessing**

In [10]:
import re
import nltk
# NOTE(review): stopwords.words('english') is used by the preprocessing loops;
# presumably this needs the NLTK 'stopwords' corpus available locally
# (nltk.download('stopwords')) — confirm on a fresh environment.
from nltk.corpus import stopwords
# NOTE(review): word_tokenize and sent_tokenize are not used in the cells visible here.
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
# NOTE(review): binds the name PorterStemmer a second time; the import above already provides it.
from nltk.stem.porter import PorterStemmer
# Single stemmer instance reused by the stemming loop.
ps = PorterStemmer()

# Code Explanation

**Step 1: Initialize Corpus**
- `corpus = []`
  - Initializes an empty list called `corpus`. This list will store the processed messages, one cleaned string per message.

**Step 2: Loop Through Messages**
- `for i in range(0, len(messages))`
  - Begins a loop that iterates over the indices of the `messages` DataFrame, from `0` to `len(messages) - 1`. The variable `i` represents the current index in the loop.

**Step 3: Clean the Message Text**
- `review = re.sub('[^a-zA-Z]', ' ', messages['message'][i])`
  - Uses a regular expression (`re.sub`) to clean the text of the current message. It replaces any character that is not a letter (a-z or A-Z) with a space. `messages['message'][i]` retrieves the message text at index `i`.

**Step 4: Convert to Lowercase**
- `review = review.lower()`
  - Converts the entire cleaned message (`review`) to lowercase. This helps standardize the text by eliminating case sensitivity.

**Step 5: Split into Words**
- `review = review.split()`
  - Splits the cleaned and lowercased text into individual words (tokens), creating a list of words from the string.

**Step 6: Remove Stopwords and Apply Stemming**
- `review = [ps.stem(word) for word in review if not word in stopwords.words('english')]`
  - This line performs two operations using a list comprehension:
    - **Stopword Removal:** Filters out common words (stopwords) that may not add meaningful value to the analysis (e.g., "the", "and", "is"). The `stopwords.words('english')` function retrieves a list of English stopwords from the NLTK library.
    - **Stemming:** Applies stemming to each remaining word using `ps.stem(word)`, where `ps` is the `PorterStemmer` instance created in the import cell above. Stemming reduces words to a root form (e.g., "running" becomes "run").

**Step 7: Join Words Back into String**
- `review = ' '.join(review)`
  - Joins the list of processed words back into a single string, with words separated by spaces. This results in a cleaned, tokenized version of the original message.

**Step 8: Append to Corpus**
- `corpus.append(review)`
  - Appends the processed review (cleaned and tokenized message) to the `corpus` list, accumulating all the processed messages in the `corpus`.

## Summary
- The code iterates through each message in the `messages` DataFrame.
- It cleans the text by removing non-alphabetical characters, converting it to lowercase, splitting it into words, removing stopwords, and applying stemming.
- Each processed message is added to the `corpus` list, resulting in a collection of cleaned text data that is ready for further analysis, such as training a machine learning model for text classification or sentiment analysis.


In [11]:
# Build the stemmed corpus: one cleaned, space-joined string per message.
# The stop-word list is fetched once and kept as a set; the original called
# stopwords.words('english') for every single token, repeating the same lookup
# thousands of times and testing membership against a list.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of messages['message'][i], which
# is a label lookup and only works while the index is exactly 0..n-1.
for text in messages['message']:
    # Replace everything that is not an ASCII letter with a space. The class
    # must be a-zA-Z: the earlier `A-z` range also covers [ \ ] ^ _ and the
    # backtick, so those symbols were silently kept in the text.
    review = re.sub('[^a-zA-Z]', ' ', text)
    # Lower-case, split on whitespace, drop stop words, stem what is left.
    review = [ps.stem(word) for word in review.lower().split() if word not in stop_words]
    corpus.append(' '.join(review))

In [12]:
# Display the stemmed corpus. The whole list is echoed; a slice such as
# corpus[:10] would give a shorter preview.
corpus

['go jurong point crazi avail bugi n great world la e buffet cine got amor wat',
 'ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'six chanc win cash pound txt csh send cost p day day tsandc appli repli hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw',
 'search right word thank breather

### **Create Bag of Words**

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer limited to 2500 vocabulary entries; binary=True stores
# 1/0 for presence/absence of a term instead of its count.
cv = CountVectorizer(max_features=2500,binary=True)

In [15]:
# Fit the vectorizer on the stemmed corpus and convert the result to a dense array.
X = cv.fit_transform(corpus).toarray()

In [16]:
import numpy as np

# Widen NumPy's array printing: up to 300 items are shown at each edge of a
# dimension, rows wrap only after 100000 characters, and floats are rendered
# with three significant digits.
np.set_printoptions(
    edgeitems=300,
    linewidth=100000,
    formatter={'float': lambda value: "%.3g" % value},
)

In [17]:
# Echo the dense document-term matrix.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [18]:
# (rows, columns) of the matrix: one row per corpus entry, one column per vocabulary term.
X.shape

(5572, 2500)

# **Lemmatization**

In [19]:
from nltk.stem import WordNetLemmatizer
# Lemmatizer instance used by the next preprocessing loop in place of the Porter stemmer.
# NOTE(review): presumably needs the NLTK 'wordnet' corpus available locally — confirm on a fresh environment.
lem = WordNetLemmatizer()

In [20]:
# Rebuild the corpus with lemmatization instead of stemming: one cleaned,
# space-joined string per message.
# The stop-word list is fetched once and kept as a set; the original called
# stopwords.words('english') for every single token, repeating the same lookup
# thousands of times and testing membership against a list.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly instead of messages['message'][i], which
# is a label lookup and only works while the index is exactly 0..n-1.
for text in messages['message']:
    # Replace everything that is not an ASCII letter with a space. The class
    # must be a-zA-Z: the earlier `A-z` range also covers [ \ ] ^ _ and the
    # backtick, so those symbols were silently kept in the text.
    review = re.sub('[^a-zA-Z]', ' ', text)
    # Lower-case, split on whitespace, drop stop words, lemmatize what is left.
    review = [lem.lemmatize(word) for word in review.lower().split() if word not in stop_words]
    corpus.append(' '.join(review))

In [21]:
# Display the lemmatized corpus. The whole list is echoed; a slice such as
# corpus[:10] would give a shorter preview.
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

### **Create Bag of Words**

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
# Fresh bag-of-words vectorizer for the lemmatized corpus: at most 2500
# vocabulary entries, binary (presence/absence) features.
cv = CountVectorizer(max_features=2500,binary=True)

In [24]:
# Fit on the lemmatized corpus and convert the result to a dense array (overwrites the earlier X).
X = cv.fit_transform(corpus).toarray()

In [25]:
# Echo the dense document-term matrix built from the lemmatized corpus.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [26]:
# (rows, columns) of the matrix: one row per corpus entry, one column per vocabulary term.
X.shape

(5572, 2500)

# **N-Grams**

In [27]:
# Mapping from each vocabulary term to its column index in X.
cv.vocabulary_

{'go': 829,
 'point': 1574,
 'crazy': 459,
 'available': 140,
 'bugis': 275,
 'great': 853,
 'world': 2437,
 'la': 1113,
 'cine': 377,
 'got': 844,
 'wat': 2340,
 'ok': 1458,
 'lar': 1121,
 'joking': 1068,
 'wif': 2398,
 'oni': 1466,
 'free': 765,
 'entry': 630,
 'wkly': 2423,
 'comp': 415,
 'win': 2404,
 'cup': 472,
 'final': 716,
 'tkts': 2170,
 'st': 1963,
 'may': 1283,
 'text': 2108,
 'receive': 1688,
 'question': 1645,
 'std': 1978,
 'txt': 2230,
 'rate': 1667,
 'apply': 97,
 'dun': 589,
 'say': 1784,
 'early': 594,
 'already': 64,
 'nah': 1392,
 'think': 2132,
 'usf': 2274,
 'life': 1163,
 'around': 113,
 'though': 2140,
 'freemsg': 767,
 'hey': 911,
 'darling': 492,
 'week': 2363,
 'word': 2433,
 'back': 154,
 'like': 1169,
 'fun': 787,
 'still': 1980,
 'tb': 2073,
 'xxx': 2465,
 'send': 1818,
 'rcv': 1670,
 'even': 644,
 'brother': 265,
 'speak': 1945,
 'treat': 2208,
 'per': 1524,
 'request': 1725,
 'melle': 1300,
 'oru': 1486,
 'minnaminunginte': 1321,
 'nurungu': 1445,
 'vet

## **Unigram-Unigrams**

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1,1): only single words (unigrams) become features, which is
# also what CountVectorizer does when ngram_range is omitted.
cv = CountVectorizer(max_features=2500,binary=True,ngram_range=(1,1))
# Fit on the current corpus and store the dense document-term matrix in X.
X = cv.fit_transform(corpus).toarray()

In [29]:
# Unigram vocabulary: term -> column index.
cv.vocabulary_

{'go': 829,
 'point': 1574,
 'crazy': 459,
 'available': 140,
 'bugis': 275,
 'great': 853,
 'world': 2437,
 'la': 1113,
 'cine': 377,
 'got': 844,
 'wat': 2340,
 'ok': 1458,
 'lar': 1121,
 'joking': 1068,
 'wif': 2398,
 'oni': 1466,
 'free': 765,
 'entry': 630,
 'wkly': 2423,
 'comp': 415,
 'win': 2404,
 'cup': 472,
 'final': 716,
 'tkts': 2170,
 'st': 1963,
 'may': 1283,
 'text': 2108,
 'receive': 1688,
 'question': 1645,
 'std': 1978,
 'txt': 2230,
 'rate': 1667,
 'apply': 97,
 'dun': 589,
 'say': 1784,
 'early': 594,
 'already': 64,
 'nah': 1392,
 'think': 2132,
 'usf': 2274,
 'life': 1163,
 'around': 113,
 'though': 2140,
 'freemsg': 767,
 'hey': 911,
 'darling': 492,
 'week': 2363,
 'word': 2433,
 'back': 154,
 'like': 1169,
 'fun': 787,
 'still': 1980,
 'tb': 2073,
 'xxx': 2465,
 'send': 1818,
 'rcv': 1670,
 'even': 644,
 'brother': 265,
 'speak': 1945,
 'treat': 2208,
 'per': 1524,
 'request': 1725,
 'melle': 1300,
 'oru': 1486,
 'minnaminunginte': 1321,
 'nurungu': 1445,
 'vet

In [30]:
# Repeats the fit already done when cv was created (cv and corpus are unchanged),
# then prints the dense matrix.
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## **Unigram-Bigrams**

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(1,2): both single words and two-word sequences become features;
# the vocabulary is capped at 500 entries.
cv = CountVectorizer(max_features=500,binary=True,ngram_range=(1,2))
# Fit on the current corpus and store the dense document-term matrix in X.
X = cv.fit_transform(corpus).toarray()

In [32]:
# Unigram + bigram vocabulary: term -> column index.
cv.vocabulary_

{'go': 152,
 'point': 330,
 'great': 162,
 'world': 483,
 'got': 160,
 'wat': 462,
 'ok': 296,
 'lar': 210,
 'wif': 473,
 'free': 139,
 'entry': 122,
 'win': 475,
 'st': 397,
 'may': 249,
 'text': 412,
 'receive': 349,
 'question': 342,
 'txt': 446,
 'rate': 344,
 'apply': 18,
 'dun': 112,
 'say': 364,
 'early': 114,
 'already': 9,
 'think': 419,
 'life': 225,
 'around': 20,
 'though': 422,
 'hey': 183,
 'week': 466,
 'word': 480,
 'back': 31,
 'like': 226,
 'fun': 145,
 'still': 401,
 'xxx': 490,
 'send': 371,
 'even': 123,
 'brother': 46,
 'speak': 395,
 'per': 311,
 'set': 375,
 'friend': 141,
 'network': 279,
 'customer': 87,
 'selected': 370,
 'prize': 336,
 'claim': 68,
 'call': 51,
 'code': 73,
 'valid': 451,
 'hour': 190,
 'mobile': 262,
 'month': 265,
 'latest': 215,
 'colour': 76,
 'camera': 57,
 'co': 71,
 'free call': 140,
 'gonna': 156,
 'home': 187,
 'soon': 391,
 'want': 460,
 'talk': 409,
 'stuff': 404,
 'tonight': 434,
 'enough': 121,
 'today': 428,
 'chance': 62,
 'ca

In [33]:
# Repeats the fit already done when cv was created (cv and corpus are unchanged),
# then prints the dense matrix.
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## **Bigram-Bigrams**

In [34]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,2): only two-word sequences become features; the vocabulary
# is capped at 100 entries.
cv = CountVectorizer(max_features=100,binary=True,ngram_range=(2,2))
# Fit on the current corpus and store the dense document-term matrix in X.
X = cv.fit_transform(corpus).toarray()

In [35]:
# Bigram vocabulary: term -> column index.
cv.vocabulary_

{'free entry': 34,
 'claim call': 18,
 'call claim': 3,
 'free call': 33,
 'chance win': 17,
 'txt word': 88,
 'let know': 54,
 'please call': 67,
 'lt gt': 58,
 'want go': 97,
 'like lt': 55,
 'sorry call': 80,
 'call later': 9,
 'ur awarded': 89,
 'call free': 5,
 'call customer': 4,
 'customer service': 25,
 'cash prize': 16,
 'trying contact': 85,
 'draw show': 30,
 'show prize': 79,
 'prize guaranteed': 73,
 'guaranteed call': 43,
 'valid hr': 95,
 'selected receive': 76,
 'private account': 71,
 'account statement': 0,
 'statement show': 81,
 'call identifier': 6,
 'identifier code': 50,
 'code expires': 22,
 'urgent mobile': 94,
 'caller prize': 13,
 'call landline': 8,
 'wat time': 98,
 'ur mob': 92,
 'gud ni': 44,
 'new year': 63,
 'send stop': 78,
 'ur mobile': 93,
 'co uk': 21,
 'nice day': 64,
 'lt decimal': 57,
 'decimal gt': 27,
 'txt nokia': 86,
 'good morning': 38,
 'ur friend': 90,
 'good night': 39,
 'reply call': 75,
 'po box': 69,
 'last night': 53,
 'camera phone':

In [36]:
# Repeats the fit already done when cv was created (cv and corpus are unchanged),
# then prints the dense matrix.
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## **Trigram-Trigrams**

In [37]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(3,3): only three-word sequences become features; the vocabulary
# is capped at 100 entries.
cv = CountVectorizer(max_features=100,binary=True,ngram_range=(3,3))
# Fit on the current corpus and store the dense document-term matrix in X.
X = cv.fit_transform(corpus).toarray()

In [38]:
# Trigram vocabulary: term -> column index.
cv.vocabulary_

{'like lt gt': 45,
 'sorry call later': 81,
 'please call customer': 67,
 'call customer service': 7,
 'customer service representative': 24,
 'guaranteed cash prize': 37,
 'draw show prize': 25,
 'show prize guaranteed': 79,
 'prize guaranteed call': 73,
 'specially selected receive': 83,
 'speak live operator': 82,
 'live operator claim': 47,
 'private account statement': 71,
 'account statement show': 0,
 'call identifier code': 8,
 'identifier code expires': 42,
 'bonus caller prize': 5,
 'match please call': 56,
 'urgent trying contact': 97,
 'lt decimal gt': 49,
 'secret admirer looking': 78,
 'admirer looking make': 1,
 'looking make contact': 48,
 'make contact find': 55,
 'contact find reveal': 21,
 'find reveal think': 30,
 'reveal think ur': 76,
 'think ur special': 87,
 'ur special call': 95,
 'congratulation ur awarded': 20,
 'draw txt music': 26,
 'www ldew com': 99,
 'anytime network min': 2,
 'camcorder reply call': 13,
 'cant pick phone': 14,
 'pick phone right': 66,
 

In [39]:
# Repeats the fit already done when cv was created (cv and corpus are unchanged),
# then prints the dense matrix.
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

## **Bigrams-Trigram**

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2,3): two-word and three-word sequences become features; the
# vocabulary is capped at 500 entries.
cv = CountVectorizer(max_features=500,binary=True,ngram_range=(2,3))
# Fit on the current corpus and store the dense document-term matrix in X.
X = cv.fit_transform(corpus).toarray()

In [41]:
# Bigram + trigram vocabulary: term -> column index.
cv.vocabulary_

{'free entry': 134,
 'rate apply': 342,
 'claim call': 61,
 'call claim': 26,
 'claim code': 62,
 'call claim code': 27,
 'update latest': 438,
 'latest colour': 216,
 'free call': 130,
 'call mobile': 38,
 'free call mobile': 131,
 'chance win': 59,
 'win cash': 483,
 'chance win cash': 60,
 'txt word': 431,
 'dont miss': 105,
 'let know': 221,
 'feel like': 125,
 'go home': 152,
 'anything lor': 5,
 'call reply': 44,
 'nokia mobile': 289,
 'mobile free': 264,
 'free camcorder': 132,
 'please call': 317,
 'delivery tomorrow': 101,
 'lt gt': 240,
 'missed call': 261,
 'want go': 471,
 'first time': 129,
 'like lt': 224,
 'like lt gt': 225,
 'sm ac': 378,
 'bx ip': 23,
 'sorry call': 382,
 'call later': 36,
 'later meeting': 213,
 'sorry call later': 383,
 'awarded bonus': 14,
 'prize call': 335,
 'ur awarded': 439,
 'call free': 30,
 'thats cool': 413,
 'hi hi': 186,
 'call customer': 28,
 'customer service': 90,
 'service representative': 367,
 'guaranteed cash': 172,
 'cash prize': 5

In [42]:
# Repeats the fit already done when cv was created (cv and corpus are unchanged),
# then prints the dense matrix.
X = cv.fit_transform(corpus).toarray()
print(X)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [43]:
# Print a closing marker.
print("The End")

The End
