In [None]:
"""
                ◘ Vectorization
        Process of encoding text as integers to create feature Vectors
        Feature Vector : Vector of numerical features that represent an object.
"""
"""
                ◘ Types of Vectorization
                    • Count Vectorization
                    • N-grams
                    • TF-IDF
"""


### 1. Reading Data

In [2]:
import os
os.chdir("D:\\python\\2.NLTK")
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth',100)
stopwods = nltk.corpus.stopwords.words('english')
data = pd.read_csv('SMSSpamCollection',sep='\t',header=None)
data.columns = ['label','msg']
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


### Text Cleanig 

In [3]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\W+',txt)
    txt = [word for word in tokens if word not in stopwods]
    return txt

### CountVectorizer

In [9]:
# in order to understand countVectoriation please study below example
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

corpus = ["This is a sentence is",
          "This is another sentence",
          "third document is here"]
x = cv.fit(corpus)
print(x.vocabulary_)
print(cv.get_feature_names())

x = cv.transform(corpus)
#x = cv.fit_transform(corpus)
print(x.shape)
print(x)
print(x.toarray())


{'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
['another', 'document', 'here', 'is', 'sentence', 'third', 'this']
(3, 7)
  (0, 3)	2
  (0, 4)	1
  (0, 6)	1
  (1, 0)	1
  (1, 3)	1
  (1, 4)	1
  (1, 6)	1
  (2, 1)	1
  (2, 2)	1
  (2, 3)	1
  (2, 5)	1
[[0 0 0 2 1 0 1]
 [1 0 0 1 1 0 1]
 [0 1 1 1 0 1 0]]


In [10]:
df = pd.DataFrame(x.toarray(),columns=cv.get_feature_names())
print(df)

   another  document  here  is  sentence  third  this
0        0         0     0   2         1      0     1
1        1         0     0   1         1      0     1
2        0         1     1   1         0      1     0


### CountVectorization on SMSSpamCollection

In [13]:
cv1 = CountVectorizer(analyzer=clean_text)
x = cv1.fit_transform(data['msg'])
print(x.shape)

(5572, 11525)


In [16]:
data_sample = data[0:10]
cv2 = CountVectorizer(analyzer=clean_text)
x = cv2.fit_transform(data_sample['msg'])
print(x.shape)

(10, 137)


In [17]:
df = pd.DataFrame(x.toarray(),columns=cv2.get_feature_names())
df.head(10)

Unnamed: 0,08002986030,08452810075over18s,09061701461,11,12,150,2,2005,21st,3,...,u,usf,valued,wat,weeks,wif,win,wkly,word,world
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,1,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## N-Grams

#### 01. Reading Data

In [18]:
import os
os.chdir("D:\\python\\2.NLTK")
import pandas as pd
import re
import string
import nltk
pd.set_option('display.max_colwidth',100)
stopwods = nltk.corpus.stopwords.words('english')
data = pd.read_csv('SMSSpamCollection',sep='\t',header=None)
data.columns = ['label','msg']
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


#### 02. Cleaning Data

In [27]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\W+',txt)
    txt = " ".join([word for word in tokens if word not in stopwods])
    return txt
data['msg_clean'] = data['msg'].apply(lambda x: clean_text(x))
data.head()

Unnamed: 0,label,msg,msg_clean
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...",Go jurong point crazy Available bugis n great world la e buffet Cine got amore wat
1,ham,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...,Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 Text FA 87121 receive entry questions...
3,ham,U dun say so early hor... U c already then say...,U dun say early hor U c already say
4,ham,"Nah I don't think he goes to usf, he lives around here though",Nah I dont think goes usf lives around though


In [28]:
cv1 = CountVectorizer(ngram_range=(2,2))
x = cv1.fit_transform(data['msg_clean'])
print(x.shape)

(5572, 34931)


In [30]:
data_sample = data[0:10]
cv2 = CountVectorizer(ngram_range=(2,2))
x = cv2.fit_transform(data_sample['msg_clean'])
print(x.shape)

(10, 126)


In [31]:
df = pd.DataFrame(x.toarray(),columns=cv2.get_feature_names())
df.head(10)

Unnamed: 0,09061701461 claim,11 months,12 hours,150 rcv,2005 text,21st may,87121 receive,900 prize,aids patent,already say,...,valued network,vettam set,weeks word,wif oni,win fa,winner as,wkly comp,word back,world la,xxx std
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,1,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,0,1
6,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
8,1,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
9,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## TF-IDF Vectorizer

In [None]:
"""
                ◘ TF-IDF
    Creates document-term matrix
        • columns are individual unique words
        • cells contain a weight whcih signifies how important a word
          is for an individual text message
    
"""

### 01. Read raw text

In [32]:
import pandas as pd
import re
import string
import nltk

stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

data = pd.read_csv('SMSSpamCollection',sep='\t',header=None)
data.columns = ['label','msg']
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


### 02. Text Cleaning 

In [33]:
def clean_text(txt):
    txt = "".join([c for c in txt if c not in string.punctuation])
    tokens = re.split('\w+',txt)
    txt = [ps.stem(word) for word in tokens if word not in stopwods]
    return txt

### 03. TFIDF Vectorizer 

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()

corpus = ["This is a sentence is",
          "This is another sentence",
          "third document is here"]
x = tfidf_vect.fit(corpus)
print(x.vocabulary_)
print(tfidf_vect.get_feature_names())

x = tfidf_vect.transform(corpus)
print(x.shape)
print(x)
print(x.toarray())


{'this': 6, 'is': 3, 'sentence': 4, 'another': 0, 'third': 5, 'document': 1, 'here': 2}
['another', 'document', 'here', 'is', 'sentence', 'third', 'this']
(3, 7)
  (0, 6)	0.4760629392767929
  (0, 4)	0.4760629392767929
  (0, 3)	0.7394106813498714
  (1, 6)	0.4804583972923858
  (1, 4)	0.4804583972923858
  (1, 3)	0.3731188059313277
  (1, 0)	0.6317450542765208
  (2, 5)	0.546454011634009
  (2, 3)	0.3227445421804912
  (2, 2)	0.546454011634009
  (2, 1)	0.546454011634009
[[0.         0.         0.         0.73941068 0.47606294 0.
  0.47606294]
 [0.63174505 0.         0.         0.37311881 0.4804584  0.
  0.4804584 ]
 [0.         0.54645401 0.54645401 0.32274454 0.         0.54645401
  0.        ]]


In [42]:
df = pd.DataFrame(x.toarray(),columns=tfidf_vect.get_feature_names())
print(df)

    another  document      here        is  sentence     third      this
0  0.000000  0.000000  0.000000  0.739411  0.476063  0.000000  0.476063
1  0.631745  0.000000  0.000000  0.373119  0.480458  0.000000  0.480458
2  0.000000  0.546454  0.546454  0.322745  0.000000  0.546454  0.000000
