In [24]:
import re
from collections import Counter


from urlextract import URLExtract
import nltk
import pandas as pd

In [5]:
def load_dataset(path):
    """Read a labelled message CSV and return its rows as ``(label, text)`` pairs.

    The file is parsed as comma-separated, latin-1 encoded text. The label is
    taken from column ``v1`` and the message body from column ``v2``.
    """
    frame = pd.read_csv(path, encoding='latin-1', delimiter=',')
    texts = frame.v2
    labels = frame.v1
    return list(zip(labels, texts))

In [6]:
# Load the corpus from spam.csv as a list of (label, text) pairs.
docs = load_dataset("spam.csv")

In [7]:
# Peek at the first five (label, text) pairs.
docs[:5]

[('ham',
  'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'),
 ('ham', 'Ok lar... Joking wif u oni...'),
 ('spam',
  "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"),
 ('ham', 'U dun say so early hor... U c already then say...'),
 ('ham', "Nah I don't think he goes to usf, he lives around here though")]

In [8]:
# Class balance: how many documents carry each label.
Counter(label for label, _ in docs)

Counter({'ham': 4825, 'spam': 747})

In [9]:
# Print the first 25 spam messages, each followed by a blank line.
spam_texts = [text for label, text in docs if label == "spam"]
for text in spam_texts[:25]:
    print(text, end="\n\n")

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv

WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.

Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030

SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info

URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18

XXXMobileMovieClub: To use your credit, click the WAP link in the next txt messag

In [27]:
class TwitterNLTKTokenizer:
    """Normalising tokenizer built from NLTK's Punkt and TweetTokenizer.

    For each input text the tokenizer:

    1. collapses every run of digits into a single ``'0'``;
    2. splits the text into sentences with the Punkt sentence detector;
    3. replaces every URL found in a sentence with the placeholder ``<href>``;
    4. tokenizes the sentence with ``TweetTokenizer`` (case is not preserved,
       ``reduce_len`` is enabled).
    """

    # Matches any run of ASCII digits; each run is replaced by a single '0'.
    DIGITS_RE = re.compile(r'[0-9]+')
    # Placeholder substituted for every detected URL.
    URL_PLACEHOLDER = "<href>"

    def __init__(self):
        """Load the Punkt sentence model and build the word and URL helpers."""
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)
        self.url_extractor = URLExtract()

    def tokenize(self, text):
        """Return the flat list of normalised tokens for ``text``.

        Args:
            text: Raw document string.

        Returns:
            A list of token strings from all sentences, in document order.
        """
        # The pattern is a class attribute, so it must be reached through
        # ``self``; a bare ``DIGITS_RE`` only resolves if a global of that
        # name happens to exist in the kernel.
        text = self.DIGITS_RE.sub('0', text)
        tokens = []
        for sent in self.sent_detector.tokenize(text):
            urls = self.url_extractor.find_urls(sent, only_unique=True)
            # Replace longer URLs first so that a URL which is a prefix of
            # another one does not leave a dangling tail such as "<href>/path".
            for url in sorted(urls, key=len, reverse=True):
                sent = sent.replace(url, self.URL_PLACEHOLDER)
            tokens.extend(self.tokenizer.tokenize(sent))

        return tokens

# Build one shared tokenizer and run it over every document, keeping each label
# paired with its token list.
tokenizer = TwitterNLTKTokenizer()
tokenized_docs = list(
    (doc_label, tokenizer.tokenize(doc_text)) for doc_label, doc_text in docs
)

In [28]:
# Show each of the first five raw messages followed by its token list and a blank line.
for (_, text), (_, tokenized_text) in zip(docs[:5], tokenized_docs):
    print(text, tokenized_text, "", sep="\n")

Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']

Ok lar... Joking wif u oni...
['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
['free', 'entry', 'in', '0', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '0st', 'may', '0', '.', 'text', 'fa', 'to', '0', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', "c's", 'apply', '0over0', "'", 's']

U dun say so early hor... U c already then say...
['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...']

Nah I don't think he goes to usf, he lives around he