In [1]:
import nltk
from nltk.tokenize import word_tokenize
import pandas as pd
import string
import re
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Sample sentence used to try out the NLTK word tokenizer.
input_text = "I am learning NLP and using NLTK"

In [None]:
# Tokenize the sample sentence with NLTK's word_tokenize and display the result.
word_tokenize(input_text)

In [2]:
# Read the whole SMS spam collection file into one string.
# The context manager guarantees the file handle is closed once the read
# finishes, instead of leaving it open until garbage collection.
# NOTE(review): no encoding is passed, so the platform default applies —
# consider pinning it once the file's encoding has been confirmed.
with open('data/SMSSpamCollection') as sms_file:
    raw_text = sms_file.read()

# Peek at the first few characters to see the raw layout.
raw_text[:5]

'ham\tG'

In [3]:
# Break the raw text on every tab or newline in a single pass, giving one
# flat list of fields (the preview below shows labels and messages alternating).
parsed_data = re.split(r'[\t\n]', raw_text)
parsed_data[:5]

['ham',
 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'ham',
 'Ok lar... Joking wif u oni...',
 'spam']

In [4]:
# De-interleave the flat field list: even positions go to the labels,
# odd positions go to the message bodies.
label_list, msg_list = parsed_data[0::2], parsed_data[1::2]

In [5]:
# Show up to 100 characters per cell so the messages are readable in previews.
pd.set_option('display.max_colwidth', 100)

# The last entry of label_list is dropped before building the frame
# (presumably the empty string left after the file's trailing newline — verify).
data = pd.DataFrame(
    {'label': label_list[:-1], 'msg': msg_list},
    columns=['label', 'msg'],
)
data.head()

Unnamed: 0,label,msg
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


## Remove Punctuation

In [None]:
def remove_punctuation(text):
    """Return `text` with every character found in `string.punctuation` removed."""
    kept_chars = (ch for ch in text if ch not in string.punctuation)
    return "".join(kept_chars)

In [None]:
# Strip punctuation from every message; the function is passed directly to
# apply, so no wrapping lambda is needed.
data['msg_clean'] = data['msg'].apply(remove_punctuation)
data.head()

## Tokenization

In [None]:
def tokenize(text):
    """Split `text` into word tokens on runs of non-word characters.

    Parameters
    ----------
    text : str
        The string to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order. Empty input (or input
        made up only of separators) yields an empty list.
    """
    # Raw string so that `\W` reaches the regex engine as a character class
    # instead of being read as an (invalid) string escape sequence.
    pieces = re.split(r'\W+', text)
    # re.split emits '' when the text starts or ends with a separator, and for
    # empty input; those are not real tokens, so drop them.
    return [piece for piece in pieces if piece]

In [None]:
# Lowercase the punctuation-free messages with the vectorised string
# accessor, then split each one into tokens.
data['msg_clean_tokenized'] = data['msg_clean'].str.lower().apply(tokenize)
data.head()

## Remove stop words

In [None]:
# English stop-word list from NLTK (needs the NLTK 'stopwords' corpus to be
# available locally, e.g. via nltk.download('stopwords')).
stopwords = nltk.corpus.stopwords.words('english')

In [None]:
def remove_stopwords(text):
    """Return the tokens of `text` that are not in the module-level `stopwords`.

    `text` is iterated token by token, so it is expected to be a sequence of
    tokens rather than one raw string.
    """
    kept_tokens = []
    for token in text:
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [None]:
# Drop stop words from every tokenized message.
data['msg_no_sw'] = data['msg_clean_tokenized'].apply(remove_stopwords)
data.head()

## Porter stemmer

In [None]:
# Shared Porter stemmer instance, used by the stemming helpers below.
ps = PorterStemmer()

In [None]:
def stemming(tokenized_text):
    """Stem each token in `tokenized_text` with the module-level Porter stemmer `ps`.

    Returns a new list holding the stems in the same order as the input tokens.
    """
    return list(map(ps.stem, tokenized_text))

In [None]:
# Stem the stop-word-free tokens of every message.
data['msg_stem'] = data['msg_no_sw'].apply(stemming)
data.head()

## Lemmatizer

In [None]:
# Shared WordNet lemmatizer instance (needs the NLTK 'wordnet' corpus to be
# available locally, e.g. via nltk.download('wordnet')).
wn = nltk.WordNetLemmatizer()

In [None]:
def lemmatization(token_text):
    """Lemmatize each token in `token_text` with the module-level lemmatizer `wn`.

    Returns a new list holding the lemmas in the same order as the input tokens.
    """
    lemmas = []
    for token in token_text:
        lemmas.append(wn.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize the stop-word-free tokens of every message.
data['msg_lemmatized'] = data['msg_no_sw'].apply(lemmatization)
data.head()

## Count vectorization

In [None]:
# Count vectorizer with default settings, used for the toy corpus in the next cell.
cv = CountVectorizer()

In [None]:
# Tiny three-document corpus to illustrate what CountVectorizer produces.
corpus = ['This is a sentence is', 'This is another sentence', 'Third document is here']

# fit() learns the vocabulary and returns the vectorizer itself, so the
# fitted attributes are read straight from `cv` rather than via a second name.
cv.fit(corpus)
print(cv.vocabulary_)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement and returns an array of terms.
print(cv.get_feature_names_out())

# transform() encodes the documents as a sparse document-term count matrix.
x = cv.transform(corpus)
print(x.shape)
print(x.toarray())

In [None]:
# English stop-word list from NLTK, read by clean_text below
# (rebinds the `stopwords` name already set in the stop-word section).
stopwords = nltk.corpus.stopwords.words('english')

def clean_text(text):
    """Turn a raw message into a list of stemmed, stop-word-free tokens.

    Steps: strip punctuation, lowercase, split on runs of non-word characters,
    drop empty tokens and stop words, then stem what is left with the
    module-level Porter stemmer `ps`.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    # Remove punctuation characters and normalise case.
    no_punct = "".join(ch for ch in text if ch not in string.punctuation).lower()
    # Raw string so that `\W` reaches the regex engine as a character class
    # instead of being read as an (invalid) string escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split emits '' for leading/trailing separators and for empty input;
    # skipping those keeps an empty-string term out of the token list.
    return [ps.stem(tok) for tok in tokens if tok and tok not in stopwords]

In [None]:
# Count vectorizer that uses clean_text as its analyzer, so each raw message
# is cleaned and tokenized by that function.
cv_1 = CountVectorizer(analyzer=clean_text)

In [None]:
# Learn the vocabulary from the raw messages and build the document-term
# matrix in one step; the cell then displays the matrix shape.
X = cv_1.fit_transform(data['msg'])
X.shape

In [None]:
# Dense view of the document-term matrix with one column per vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df = pd.DataFrame(X.toarray(), columns=cv_1.get_feature_names_out())
df.head()

## N-Grams vectorization

In [6]:
# Stop-word list and Porter stemmer read by the n-gram clean_text below
# (both names are also assigned in earlier sections of this notebook).
stopwords = nltk.corpus.stopwords.words('english')
ps = PorterStemmer()

In [None]:
def clean_text(text):
    """Turn a raw message into one space-separated string of stemmed tokens.

    Steps: strip punctuation, lowercase, split on runs of non-word characters,
    drop empty tokens and stop words, stem what is left with the module-level
    Porter stemmer `ps`, and join the stems with single spaces.

    Note: this rebinds the name `clean_text`; the count-vectorization section
    defines a function of the same name that returns a token list instead.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Remove punctuation characters and normalise case.
    no_punct = "".join(ch for ch in text if ch not in string.punctuation).lower()
    # Raw string so that `\W` reaches the regex engine as a character class
    # instead of being read as an (invalid) string escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split emits '' for leading/trailing separators and for empty input;
    # skipping those avoids stray leading/trailing spaces in the joined result.
    stems = [ps.stem(tok) for tok in tokens if tok and tok not in stopwords]
    return " ".join(stems)

In [None]:
# Preview the frame before the cleaned-text column is recomputed.
data.head()

In [None]:
# Replace the 'msg_clean' column with the fully cleaned, space-joined text
# returned by clean_text (this overwrites the punctuation-only version
# assigned earlier in the notebook).
data['msg_clean'] = data['msg'].apply(clean_text)
data.head()

In [None]:
# N-gram vectorizer with ngram_range=(2,3), i.e. n-grams of length 2 to 3
# (rebinds `cv` from the earlier section).
cv = CountVectorizer(ngram_range=(2,3))

In [None]:
# Fit the n-gram vectorizer on the cleaned message strings and display the
# shape of the resulting matrix (rebinds `X`).
X = cv.fit_transform(data['msg_clean'])
X.shape

In [None]:
# Restrict to the first ten messages and vectorize them with a
# bigram-only vectorizer (ngram_range=(2, 2)).
data_sample = data.iloc[:10]
cv1 = CountVectorizer(ngram_range=(2, 2))

x = cv1.fit_transform(data_sample['msg_clean'])
x.shape

In [None]:
# Dense bigram count table for the ten-message sample, one column per bigram.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement.
df = pd.DataFrame(x.toarray(), columns=cv1.get_feature_names_out())
df.head()

## TF-IDF vectorization