In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import acquire
import prepare
import env

In [2]:
url = env.get_db_url('spam_db')
query = 'SELECT * FROM spam'

In [3]:
df = pd.read_sql(query, url, index_col='id')

In [4]:
df.head()

Unnamed: 0_level_0,label,text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
df = df.rename(columns={'text': 'content'})
df = prepare.prep_data(df)

In [6]:
df.head()

Unnamed: 0_level_0,label,original,clean,stemmed,lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail onli bugi n great ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,free entri 2 wkli comp win fa cup final tkt 21...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though,nah ' think goe usf live around though,nah ' think go usf life around though


In [8]:
words = ' '.join(df.clean).split()

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cv = CountVectorizer()
bag_of_words = cv.fit_transform(words)

In [15]:
bag_of_words.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [16]:
cv.get_feature_names_out()

array(['008704050406', '0089my', '0121', ..., 'zoom', 'zouk', 'zyada'],
      dtype=object)

In [17]:
cv.vocabulary_

{'go': 3766,
 'jurong': 4651,
 'point': 6371,
 'crazy': 2491,
 'available': 1412,
 'bugis': 1874,
 'great': 3863,
 'world': 9073,
 'la': 4808,
 'buffet': 1872,
 'cine': 2207,
 'got': 3823,
 'amore': 1181,
 'wat': 8842,
 'ok': 5937,
 'lar': 4847,
 'joking': 4619,
 'wif': 8969,
 'oni': 5969,
 'free': 3558,
 'entry': 3146,
 'wkly': 9025,
 'comp': 2323,
 'win': 8983,
 'fa': 3280,
 'cup': 2546,
 'final': 3404,
 'tkts': 8285,
 '21st': 430,
 'may': 5291,
 '2005': 417,
 'text': 8129,
 '87121': 826,
 'receive': 6766,
 'questionstd': 6658,
 'txt': 8491,
 'ratetc': 6710,
 'apply': 1269,
 '08452810075over18': 71,
 'dun': 2997,
 'say': 7119,
 'early': 3018,
 'hor': 4191,
 'already': 1155,
 'nah': 5628,
 'think': 8198,
 'goes': 3780,
 'usf': 8637,
 'lives': 5009,
 'around': 1319,
 'though': 8217,
 'freemsg': 3566,
 'hey': 4088,
 'darling': 2610,
 'week': 8889,
 'word': 9059,
 'back': 1461,
 'like': 4959,
 'fun': 3629,
 'still': 7774,
 'tb': 8059,
 'xxx': 9196,
 'std': 7749,
 'chgs': 2153,
 'send': 7

In [18]:
pprint(words)

['go',
 'jurong',
 'point',
 'crazy',
 'available',
 'bugis',
 'n',
 'great',
 'world',
 'la',
 'e',
 'buffet',
 'cine',
 'got',
 'amore',
 'wat',
 'ok',
 'lar',
 'joking',
 'wif',
 'u',
 'oni',
 'free',
 'entry',
 '2',
 'wkly',
 'comp',
 'win',
 'fa',
 'cup',
 'final',
 'tkts',
 '21st',
 'may',
 '2005',
 'text',
 'fa',
 '87121',
 'receive',
 'entry',
 'questionstd',
 'txt',
 'ratetc',
 "'",
 'apply',
 '08452810075over18',
 "'",
 'u',
 'dun',
 'say',
 'early',
 'hor',
 'u',
 'c',
 'already',
 'say',
 'nah',
 "'",
 'think',
 'goes',
 'usf',
 'lives',
 'around',
 'though',
 'freemsg',
 'hey',
 'darling',
 "'",
 '3',
 'week',
 "'",
 'word',
 'back',
 "'",
 'like',
 'fun',
 'still',
 'tb',
 'ok',
 'xxx',
 'std',
 'chgs',
 'send',
 'a150',
 'rcv',
 'even',
 'brother',
 'like',
 'speak',
 'treat',
 'like',
 'aids',
 'patent',
 'per',
 'request',
 "'",
 'melle',
 'melle',
 'oru',
 'minnaminunginte',
 'nurungu',
 'vettam',
 "'",
 'set',
 'callertune',
 'callers',
 'press',
 '9',
 'copy',
 'fri

 'pleassssssseeeeee',
 'tel',
 'v',
 'avent',
 'done',
 'sportsx',
 'okay',
 'shining',
 'meant',
 'signing',
 'sounds',
 'better',
 'although',
 'told',
 'u',
 'dat',
 "'",
 'baig',
 'face',
 'watches',
 'really',
 'like',
 'e',
 'watch',
 'u',
 'gave',
 'cos',
 "'",
 'fr',
 'u',
 'thanx',
 '4',
 'everything',
 'dat',
 'u',
 "'",
 'done',
 'today',
 "'",
 'touched',
 'u',
 "'",
 'remember',
 'old',
 'commercial',
 'late',
 'said',
 'website',
 "'",
 'dont',
 'slippers',
 'asked',
 'call',
 'ok',
 'kallis',
 'wont',
 'bat',
 '2nd',
 'innings',
 'didnt',
 'work',
 'oh',
 'ok',
 'goodnight',
 'ill',
 'fix',
 'ready',
 'time',
 'wake',
 'dearly',
 'missed',
 'good',
 'night',
 'sleep',
 'congratulations',
 'ur',
 'awarded',
 '500',
 'cd',
 'vouchers',
 '125gift',
 'guaranteed',
 'free',
 'entry',
 '2',
 '100',
 'wkly',
 'draw',
 'txt',
 'music',
 '87066',
 'tncs',
 'wwwldewcom1win150ppmx3age16',
 'ranjith',
 'cal',
 'drpd',
 'deeraj',
 'deepak',
 '5min',
 'hold',
 'wen',
 'ur',
 'lovable'

 '3',
 'kusruthi',
 '4',
 'lovable',
 '5',
 'silent',
 '6',
 'spl',
 'character',
 '7',
 'matured',
 '8',
 'stylish',
 '9',
 'simple',
 'pls',
 'reply',
 '8',
 'latest',
 'g',
 "'",
 'still',
 'scrounge',
 'ammo',
 'want',
 'give',
 'new',
 'ak',
 'try',
 'prabhai',
 "'",
 'sorydarealyfrm',
 'heart',
 "'",
 'sory',
 'lol',
 'ok',
 'forgiven',
 'nojst',
 'change',
 'tat',
 'guaranteed',
 'latest',
 'nokia',
 'phone',
 '40gb',
 'ipod',
 'mp3',
 'player',
 'a500',
 'prize',
 'txt',
 'word',
 'collect',
 '83355',
 'ibhltd',
 'ldnw15h',
 '150pmtmsgrcvd18',
 'sno',
 'competition',
 'boltblue',
 'tones',
 '150p',
 'reply',
 'poly',
 'mono',
 'eg',
 'poly3',
 '1',
 'cha',
 'cha',
 'slide',
 '2',
 'yeah',
 '3',
 'slow',
 'jamz',
 '6',
 'toxic',
 '8',
 'come',
 'stop',
 '4',
 'tones',
 'txt',
 'credits',
 'topped',
 'httpwwwbubbletextcom',
 'renewal',
 'pin',
 'tgxxrz',
 'way',
 'transport',
 'less',
 'problematic',
 'sat',
 'night',
 'way',
 'u',
 'want',
 'ask',
 'n',
 'join',
 'bday',
 'feel'

 'tomorrow',
 'got',
 'sing',
 'guy',
 'gave',
 'card',
 'xxx',
 'happy',
 'new',
 'year',
 'dear',
 'brother',
 'really',
 'miss',
 'got',
 'number',
 'decided',
 'send',
 'text',
 'wishing',
 'happiness',
 'abiola',
 'means',
 'get',
 'door',
 'opinion',
 '1',
 '2',
 'jada',
 '3',
 'kusruthi',
 '4',
 'lovable',
 '5',
 'silent',
 '6',
 'spl',
 'character',
 '7',
 'matured',
 '8',
 'stylish',
 '9',
 'simple',
 'pls',
 'reply',
 'hmmm',
 'thought',
 'said',
 '2',
 'hours',
 'slave',
 '3',
 'late',
 'punish',
 'beerage',
 'important',
 'customer',
 'service',
 'announcement',
 'premier',
 'call',
 'freephone',
 '0800',
 '542',
 '0578',
 'dont',
 'think',
 'turns',
 'like',
 'randomlly',
 'within',
 '5min',
 'opening',
 'supposed',
 "'",
 'make',
 "'",
 'still',
 'town',
 'though',
 'time',
 'fixes',
 'spelling',
 'sometimes',
 'gets',
 'completely',
 'diff',
 'word',
 'go',
 'figure',
 'ever',
 'thought',
 'living',
 'good',
 'life',
 'perfect',
 'partner',
 'txt',
 'back',
 'name',
 'ag

 'tonight',
 'may',
 'call',
 'later',
 'pls',
 "'",
 'pattern',
 'recently',
 'crap',
 'weekends',
 'sore',
 'throat',
 "'",
 'scratches',
 'talk',
 'yes',
 'da',
 'plm',
 'ur',
 'office',
 'around',
 'still',
 'asleep',
 'v',
 'lol',
 'forgot',
 'eh',
 'yes',
 "'",
 'bring',
 'babe',
 'good',
 "'",
 'find',
 'way',
 'use',
 'foreign',
 'stamps',
 'country',
 'good',
 'lecture',
 'yup',
 'bathe',
 'liao',
 'happy',
 'new',
 'year',
 'no1',
 'man',
 'oh',
 'mr',
 'sheffield',
 'wanna',
 'play',
 'game',
 'okay',
 "'",
 'boss',
 "'",
 'nanny',
 'give',
 'raise',
 "'",
 'give',
 'one',
 'zoe',
 'hit',
 '2',
 'im',
 'fucking',
 'shitin',
 'il',
 'defo',
 'try',
 'hardest',
 '2',
 'cum',
 '2morow',
 'luv',
 'u',
 'millions',
 'lekdog',
 'hello',
 'baby',
 'get',
 'back',
 'mom',
 "'",
 'setting',
 'computer',
 'filling',
 'belly',
 'goes',
 'loverboy',
 'miss',
 'already',
 'sighs',
 'blankets',
 'sufficient',
 'thx',
 'naughty',
 'little',
 'thought',
 "'",
 'better',
 'flirt',
 'flirt',


 'thanks',
 'wishes',
 'happy',
 'birthday',
 'may',
 'ur',
 'dreams',
 'come',
 'true',
 'aiyah',
 'u',
 'ok',
 'already',
 'lar',
 'e',
 'nydc',
 'wheellock',
 'tell',
 'said',
 'eat',
 'shit',
 'sure',
 'driving',
 'reach',
 'destination',
 'soon',
 'k',
 'much',
 '8th',
 'fifty',
 'daily',
 'text',
 'uo',
 'favour',
 'time',
 'great',
 'hear',
 'settling',
 'well',
 "'",
 'happenin',
 'wit',
 'ola',
 'cocksuckers',
 'makes',
 'feel',
 'better',
 'ipads',
 'worthless',
 'garbage',
 'novelty',
 'items',
 'feel',
 'bad',
 'even',
 'wanting',
 'one',
 'tot',
 'u',
 'reach',
 'liao',
 'said',
 'tshirt',
 'fran',
 'decided',
 '2',
 'go',
 'n',
 'e',
 'way',
 'im',
 'completely',
 'broke',
 'knackered',
 'got',
 'bout',
 '3',
 'c',
 'u',
 '2mrw',
 'love',
 'janx',
 'ps',
 'dads',
 'fone',
 'credit',
 'cant',
 'pick',
 'phone',
 'right',
 'pls',
 'send',
 'message',
 'right',
 "'",
 'make',
 'appointment',
 'right',
 'designation',
 'software',
 'developer',
 'may',
 'get',
 'chennai',
 'e

 'south',
 'tampa',
 'preferably',
 'kegger',
 'e',
 'msg',
 'jus',
 'u',
 'said',
 'thanks',
 'gift',
 'u',
 'ok',
 'dear',
 'call',
 'chechi',
 'yeah',
 'totes',
 'u',
 'wanna',
 'ok',
 'found',
 'dis',
 'pierre',
 'cardin',
 'one',
 'looks',
 'normal',
 'costs',
 '20',
 'sale',
 'good',
 'sleep',
 'rhythm',
 'person',
 'establish',
 'rhythm',
 'body',
 'learn',
 'use',
 'want',
 'know',
 'wat',
 'r',
 'u',
 'message',
 'truro',
 'hospital',
 'ext',
 'phone',
 'phone',
 'side',
 'single',
 'line',
 'big',
 'meaning',
 'miss',
 'anything',
 '4',
 'ur',
 'best',
 'life',
 'got',
 'gas',
 'money',
 'chance',
 'gang',
 'want',
 'go',
 'grand',
 'nature',
 'adventure',
 'dnt',
 'worryuse',
 'ice',
 'pieces',
 'cloth',
 'packalso',
 'take',
 '2',
 'tablets',
 'dude',
 'saw',
 'parked',
 'car',
 'sunroof',
 'popped',
 'sux',
 'get',
 'ready',
 'put',
 'excellent',
 'sub',
 'face',
 'tmrw',
 'im',
 'finishing',
 '9',
 'doors',
 'ltgt',
 'g',
 'saw',
 'days',
 'ago',
 'guy',
 'wants',
 'sell'

 'motorola',
 'upto',
 '12mths',
 '12price',
 'linerental',
 '500',
 'free',
 'xnet',
 'mins100txtmth',
 'free',
 'b',
 "'",
 'tooth',
 'call',
 'mobileupd8',
 '08001950382',
 'call',
 '2optoutd3wv',
 'dont',
 'want',
 'hear',
 'philosophy',
 'say',
 'happen',
 'got',
 'job',
 'wiproyou',
 'get',
 'every',
 'thing',
 'life',
 '2',
 '3',
 'years',
 'cant',
 'get',
 'da',
 'laptop',
 'matric',
 'card',
 'wif',
 'lei',
 'dunno',
 'da',
 'next',
 'show',
 'aft',
 '6',
 '850',
 'toa',
 'payoh',
 'got',
 '650',
 '2nd',
 'time',
 'tried',
 '2',
 'contact',
 'u',
 'u',
 '750',
 'pound',
 'prize',
 '2',
 'claim',
 'easy',
 'call',
 '08718726970',
 '10p',
 'per',
 'min',
 'btnationalrate',
 'made',
 'payments',
 'dont',
 'much',
 'sorry',
 'would',
 'want',
 'fedex',
 'way',
 "'",
 'play',
 'one',
 'day',
 'last',
 'year',
 'know',
 'even',
 'though',
 'good',
 'team',
 'like',
 'india',
 'kyou',
 'girl',
 'waiting',
 'reception',
 'ah',
 'say',
 'slowly',
 'godi',
 'love',
 'amp',
 'need',
 'yo

 'birthday',
 'vikky',
 'u',
 'win',
 'a100',
 'music',
 'gift',
 'vouchers',
 'every',
 'week',
 'starting',
 'txt',
 'word',
 'draw',
 '87066',
 'tscs',
 'wwwidewcom',
 'skillgame',
 '1winaweek',
 'age16',
 '150ppermesssubscription',
 'hope',
 'know',
 "'",
 'still',
 'mad',
 'argh',
 '3g',
 'spotty',
 'anyway',
 'thing',
 'remember',
 'research',
 'province',
 'sterling',
 'problemfree',
 'places',
 'looked',
 'xam',
 'hall',
 'boy',
 'asked',
 'girl',
 'tell',
 'starting',
 'term',
 'dis',
 'answer',
 'den',
 'manage',
 'lot',
 'hesitation',
 'n',
 'lookin',
 'around',
 'silently',
 'said',
 'intha',
 'ponnungale',
 'ipaditan',
 'know',
 'result',
 '123',
 'congratulations',
 'week',
 "'",
 'competition',
 'draw',
 'u',
 'a1450',
 'prize',
 'claim',
 'call',
 '09050002311',
 'b4280703',
 'tcsstop',
 'sms',
 '08718727868',
 '18',
 '150ppm',
 'beautiful',
 'truth',
 'gravity',
 'read',
 'carefully',
 'heart',
 'feels',
 'light',
 'someone',
 'feels',
 'heavy',
 'someone',
 'leaves',


 'rite',
 'sary',
 'asusual1',
 'u',
 'cheered',
 'love',
 'u',
 'franyxxxxx',
 "'",
 'way',
 'home',
 'went',
 'change',
 'batt',
 '4',
 'watch',
 'go',
 'shop',
 'bit',
 'lor',
 'yes',
 'place',
 'town',
 'meet',
 'exciting',
 'adult',
 'singles',
 'uk',
 'txt',
 'chat',
 '86688',
 '150pmsg',
 'hi',
 'mobile',
 'ltgt',
 'added',
 'contact',
 'list',
 'wwwfullonsmscom',
 'great',
 'place',
 'send',
 'free',
 'sms',
 'people',
 'visit',
 'fullonsmscom',
 'good',
 'evening',
 'sir',
 'hope',
 'nice',
 'day',
 'wanted',
 'bring',
 'notice',
 'late',
 'paying',
 'rent',
 'past',
 'months',
 'pay',
 'ltgt',
 'charge',
 'felt',
 'would',
 'inconsiderate',
 'nag',
 'something',
 'give',
 'great',
 'cost',
 "'",
 'didnt',
 'speak',
 'however',
 'recession',
 'wont',
 'able',
 'pay',
 'charge',
 'month',
 'hence',
 'askin',
 'well',
 'ahead',
 'month',
 "'",
 'end',
 'please',
 'help',
 'thank',
 'everything',
 'let',
 'want',
 'house',
 '8am',
 'best',
 'line',
 'said',
 'love',
 'wait',
 'ti

 'come',
 'anna',
 'nagar',
 'go',
 'afternoon',
 "'",
 'okay',
 'chasing',
 'dream',
 "'",
 'good',
 'next',
 'yupz',
 "'",
 'oredi',
 'booked',
 'slots',
 '4',
 'weekends',
 'liao',
 'urgent',
 'trying',
 'contact',
 'u',
 'todays',
 'draw',
 'shows',
 'a800',
 'prize',
 'guaranteed',
 'call',
 '09050003091',
 'land',
 'line',
 'claim',
 'c52',
 'valid',
 '12hrs',
 'r',
 'many',
 'modelsony',
 'ericson',
 'also',
 'der',
 'ltgt',
 'luks',
 'good',
 'bt',
 'forgot',
 'modl',
 'okie',
 'yes',
 'know',
 'cheesy',
 'songs',
 'frosty',
 'snowman',
 'ya',
 'ok',
 'vikky',
 'vl',
 'c',
 'witin',
 'ltgt',
 'mins',
 'il',
 'reply',
 'u',
 'sports',
 'fans',
 'get',
 'latest',
 'sports',
 'news',
 'str',
 '2',
 'ur',
 'mobile',
 '1',
 'wk',
 'free',
 'plus',
 'free',
 'tone',
 'txt',
 'sport',
 '8007',
 'wwwgetzedcouk',
 '0870141701216',
 'norm',
 '4txt120p',
 'hey',
 'tmr',
 'meet',
 'bugis',
 '930',
 'urgent',
 'urgent',
 '800',
 'free',
 'flights',
 'europe',
 'give',
 'away',
 'call',
 'b4

 'u',
 'always',
 'ignorant',
 'nope',
 "'",
 'b',
 'going',
 '2',
 'sch',
 'fri',
 'quite',
 'early',
 'lor',
 'cos',
 'mys',
 'sis',
 'got',
 'paper',
 'da',
 'morn',
 'bruce',
 'b',
 'downs',
 'amp',
 'fletcher',
 'said',
 'would',
 'woke',
 'hey',
 'free',
 'call',
 'tell',
 'whos',
 'pls',
 'urgent',
 'mobile',
 'awarded',
 'a1500',
 'bonus',
 'caller',
 'prize',
 '27603',
 'final',
 'attempt',
 '2',
 'contact',
 'u',
 'call',
 '08714714011',
 'think',
 'might',
 'give',
 'miss',
 'teaching',
 'til',
 'twelve',
 'lecture',
 'two',
 'damn',
 'working',
 'thing',
 'id',
 'check',
 "'",
 'like',
 '1',
 'bowls',
 'worth',
 'left',
 'yes',
 'many',
 'sweets',
 'would',
 "'",
 'still',
 'cozy',
 'exhausted',
 'last',
 'nightnobody',
 'went',
 'school',
 'work',
 'everything',
 'closed',
 'u',
 'secret',
 'admirer',
 'reveal',
 'thinks',
 'u',
 'r',
 'special',
 'call',
 '09065174042',
 'opt',
 'reply',
 'reveal',
 'stop',
 '150',
 'per',
 'msg',
 'recd',
 'cust',
 'care',
 '07821230901'

 'good',
 'r',
 'u',
 'working',
 'oh',
 'yes',
 "'",
 'little',
 'weather',
 "'",
 'kind',
 'coccooning',
 'home',
 'home',
 'also',
 'phone',
 'weirdest',
 'auto',
 'correct',
 'oops',
 'phone',
 'died',
 "'",
 'even',
 'know',
 'yeah',
 'like',
 'better',
 'havent',
 'mus',
 'ask',
 'u',
 '1st',
 'wat',
 'meet',
 '4',
 'lunch',
 'den',
 'u',
 'n',
 'meet',
 'already',
 'lor',
 'u',
 'wan',
 '2',
 'go',
 'ask',
 'da',
 'ge',
 '1st',
 'confirm',
 'w',
 'asap',
 'said',
 "'",
 "'",
 'u',
 'mind',
 'go',
 'bedroom',
 'minute',
 "'",
 "'",
 "'",
 "'",
 'ok',
 "'",
 "'",
 'sed',
 'sexy',
 'mood',
 'came',
 '5',
 'minuts',
 'latr',
 'wid',
 'caken',
 'wife',
 'oh',
 'yeahand',
 'hav',
 'great',
 'time',
 'newquaysend',
 'postcard',
 '1',
 'look',
 'girls',
 'im',
 'goneu',
 'know',
 '1im',
 'talkin',
 'boutxx',
 'got',
 'divorce',
 'lol',
 'shes',
 "'",
 'ur',
 'pin',
 'babe',
 'got',
 'enough',
 'money',
 'pick',
 'bread',
 'milk',
 "'",
 'give',
 'back',
 'get',
 'home',
 'want',
 'snow'

 'still',
 'checked',
 'da',
 "'",
 'also',
 'came',
 'room',
 'huh',
 'got',
 'lesson',
 '4',
 'lei',
 'n',
 'thinkin',
 'going',
 'sch',
 'earlier',
 'n',
 'tot',
 'parkin',
 'kent',
 'vale',
 'ok',
 'reach',
 'office',
 'around',
 'ltdecimalgt',
 'amp',
 'mobile',
 'problem',
 'cann',
 "'",
 'get',
 'voice',
 'call',
 'asa',
 "'",
 'free',
 'cool',
 'text',
 'head',
 'contacted',
 'dating',
 'service',
 'someone',
 'know',
 'find',
 'call',
 'land',
 'line',
 '09050000878',
 'pobox45w2tg150p',
 'wan2',
 'win',
 'meetgreet',
 'westlife',
 '4',
 'u',
 'm8',
 'currently',
 'tour',
 '1unbreakable',
 '2untamed',
 '3unkempt',
 'text',
 '12',
 '3',
 '83049',
 'cost',
 '50p',
 'std',
 'text',
 'happy',
 'birthday',
 'may',
 'u',
 'find',
 'ur',
 'prince',
 'charming',
 'soon',
 'n',
 'dun',
 'work',
 'hard',
 'oh',
 'grand',
 'bit',
 'party',
 "'",
 'mention',
 'cover',
 'charge',
 "'",
 'probably',
 'first',
 'come',
 'first',
 'served',
 'said',
 'went',
 'back',
 'bed',
 "'",
 'sleep',
 

 'hostel',
 'going',
 'sleep',
 'plz',
 'call',
 'class',
 'hrishi',
 'ok',
 'bag',
 'hi',
 'spoke',
 'maneesha',
 'v',
 "'",
 'like',
 'know',
 'satisfied',
 'experience',
 'reply',
 'toll',
 'free',
 'yes',
 'ok',
 'lor',
 'msg',
 'b4',
 'u',
 'call',
 'mila',
 'age23',
 'blonde',
 'new',
 'uk',
 'look',
 'sex',
 'uk',
 'guys',
 'u',
 'like',
 'fun',
 'text',
 'mtalk',
 '6986618',
 '30pptxt',
 '1st',
 '5free',
 'a150',
 'increments',
 'help08718728876',
 'fishrman',
 'woke',
 'early',
 'mrng',
 'dark',
 'waited',
 'amp',
 'found',
 'sack',
 'ful',
 'stones',
 'strtd',
 'throwin',
 'thm',
 'in2',
 'sea',
 '2',
 'pass',
 'time',
 'atlast',
 'jus',
 '1stone',
 'sun',
 'rose',
 'amp',
 'found',
 'tht',
 'r',
 'nt',
 'stones',
 'diamonds',
 'moraldont',
 'wake',
 'early',
 'mrng',
 "'",
 "'",
 'good',
 'night',
 'claim',
 '200',
 'shopping',
 'spree',
 'call',
 '08717895698',
 'mobstorequiz10ppm',
 'ur',
 'physics',
 'get',
 'dear',
 'friends',
 'sorry',
 'late',
 'information',
 'today',

 'first',
 'name',
 '82277unsub',
 'stop',
 'a150',
 '008704050406',
 'sp',
 'weeks',
 'savamob',
 'member',
 'offers',
 'accessible',
 'call',
 '08709501522',
 'details',
 'savamob',
 'pobox',
 '139',
 'la3',
 '2wu',
 'a150week',
 'savamob',
 'offers',
 'mobile',
 'aight',
 "'",
 'set',
 'free',
 'think',
 'could',
 'text',
 'blake',
 "'",
 'address',
 'occurs',
 "'",
 'quite',
 'sure',
 "'",
 'thought',
 'hi',
 'dear',
 'saw',
 'dear',
 'happy',
 'battery',
 'low',
 'ages',
 "'",
 'abj',
 'prof',
 'passed',
 'papers',
 'sem',
 'congrats',
 'student',
 'enna',
 'kalaachutaarama',
 'prof',
 'gud',
 'mrng',
 'dont',
 'kick',
 'coco',
 "'",
 'fyi',
 "'",
 'gonna',
 'call',
 'sporadically',
 'starting',
 'like',
 'ltgt',
 'bc',
 'doin',
 'shit',
 'contacted',
 'dating',
 'service',
 'someone',
 'know',
 'find',
 'call',
 'mobile',
 'landline',
 '09064017305',
 'pobox75ldns7',
 'tbspersolvo',
 'chasing',
 'us',
 'since',
 'sept',
 'fora38',
 'definitely',
 'paying',
 'thanks',
 'informatio

 'hey',
 'mr',
 'going',
 'sea',
 'view',
 'couple',
 'gays',
 'mean',
 'games',
 'give',
 'bell',
 'ya',
 'finish',
 'k',
 'jason',
 'says',
 "'",
 'gonna',
 'around',
 "'",
 'around',
 'ltgt',
 'sorry',
 'able',
 'get',
 'see',
 'morning',
 'aight',
 'well',
 'keep',
 'informed',
 'number',
 'sir',
 'searching',
 'good',
 'dual',
 'sim',
 'mobile',
 'pa',
 'seems',
 'unnecessarily',
 'hostile',
 'dude',
 'got',
 'haircut',
 'breezy',
 'congrats',
 '2',
 'mobile',
 '3g',
 'videophones',
 'r',
 'call',
 '09061744553',
 'videochat',
 'wid',
 'ur',
 'mates',
 'play',
 'java',
 'games',
 'dload',
 'polyh',
 'music',
 'noline',
 'rentl',
 'bx420',
 'ip4',
 '5we',
 '150pm',
 '1appledayno',
 'doctor',
 '1tulsi',
 'leafdayno',
 'cancer',
 '1lemondayno',
 'fat',
 '1cup',
 'milkdayno',
 'bone',
 'problms',
 '3',
 'litres',
 'watrdayno',
 'diseases',
 'snd',
 'ths',
 '2',
 'u',
 'care',
 'thought',
 'king',
 'hill',
 'thing',
 'nope',
 "'",
 'come',
 'online',
 'also',
 'tell',
 'said',
 'happy'

 'til',
 'late',
 'thanx',
 '4',
 'puttin',
 'da',
 'fone',
 'need',
 '8th',
 "'",
 'campus',
 'atm',
 'could',
 'pick',
 'hour',
 'two',
 'oh',
 'haha',
 'den',
 'shld',
 'went',
 'today',
 'gee',
 'nvm',
 'la',
 'kaiez',
 'dun',
 'mind',
 'goin',
 'jazz',
 'oso',
 'scared',
 'hiphop',
 'open',
 'cant',
 'catch',
 'running',
 'managed',
 '5',
 'minutes',
 'needed',
 'oxygen',
 'might',
 'resort',
 'roller',
 'option',
 'live',
 'next',
 'ltgt',
 'mins',
 'de',
 'asking',
 'like',
 'glad',
 'talking',
 'wat',
 'time',
 'finish',
 'sorry',
 'da',
 'gone',
 'mad',
 'many',
 'pending',
 'works',
 'much',
 'got',
 'cleaning',
 'hows',
 'favourite',
 'person',
 'today',
 'r',
 'u',
 'workin',
 'hard',
 "'",
 'sleep',
 'last',
 'nite',
 'nearly',
 'rang',
 'u',
 '430',
 'sunshine',
 'quiz',
 'win',
 'super',
 'sony',
 'dvd',
 'recorder',
 'canname',
 'capital',
 'australia',
 'text',
 'mquiz',
 '82277',
 'b',
 'ii',
 'called',
 'dad',
 'oredi',
 'good',
 'think',
 'could',
 'send',
 'pix',
 

 'msgticketkioskvalid',
 '4712',
 'c',
 'tc',
 'kiosk',
 'reply',
 'sony',
 '4',
 'mre',
 'film',
 'offers',
 'come',
 'online',
 'today',
 'night',
 'anything',
 'special',
 "'",
 'solihull',
 'want',
 'anything',
 'good',
 'day',
 'regret',
 'inform',
 'u',
 'nhs',
 'made',
 'mistakeu',
 'never',
 'actually',
 'bornplease',
 'report',
 '2',
 'yor',
 'local',
 'hospital',
 '2b',
 'terminatedwe',
 'r',
 'sorry',
 '4',
 'inconvenience',
 'love',
 'holiday',
 'monday',
 'feeling',
 'even',
 'go',
 'dentists',
 'hour',
 'way',
 'tirupur',
 'goal',
 'arsenal',
 '4',
 'henry',
 '7',
 'v',
 'liverpool',
 '2',
 'henry',
 'scores',
 'simple',
 'shot',
 '6',
 'yards',
 'pass',
 'bergkamp',
 'give',
 'arsenal',
 '2',
 'goal',
 'margin',
 '78',
 'mins',
 "'",
 'already',
 'got',
 'flaky',
 'parent',
 "'",
 'snot',
 'supposed',
 'child',
 "'",
 'job',
 'support',
 'parentnot',
 "'",
 'ride',
 'age',
 'anyway',
 "'",
 'supposed',
 'support',
 "'",
 'hurt',
 'unintentional',
 'hurt',
 'nonetheless',

 'meet',
 '4',
 'dinner',
 'cos',
 'later',
 'u',
 'leave',
 'xy',
 'feel',
 'awkward',
 'den',
 'u',
 'meet',
 '4',
 'lunch',
 'lor',
 'spook',
 'mob',
 'halloween',
 'collection',
 'logo',
 'pic',
 'message',
 'plus',
 'free',
 'eerie',
 'tone',
 'txt',
 'card',
 'spook',
 '8007',
 'zed',
 '08701417012150p',
 'per',
 'logopic',
 'like',
 'cheap',
 'ium',
 'happy',
 'splash',
 'wine',
 'makes',
 'feel',
 'better',
 'shes',
 'fine',
 'difficulties',
 'phone',
 'works',
 'mine',
 'pls',
 'send',
 'another',
 'friend',
 'request',
 'ugh',
 'leg',
 'hurts',
 'musta',
 'overdid',
 'mon',
 'call',
 'germany',
 '1',
 'pence',
 'per',
 'minute',
 'call',
 'fixed',
 'line',
 'via',
 'access',
 'number',
 '0844',
 '861',
 '85',
 '85',
 'prepayment',
 'direct',
 'access',
 'wwwtelediscountcouk',
 '4',
 'costa',
 'del',
 'sol',
 'holiday',
 'a5000',
 'await',
 'collection',
 'call',
 '09050090044',
 'toclaim',
 'sae',
 'tc',
 'pobox334',
 'stockport',
 'sk38xh',
 'costa150pm',
 'max10mins',
 'wot

 'effect',
 'irritation',
 'ignore',
 'one',
 'think',
 'tantrum',
 "'",
 'finished',
 'yeah',
 "'",
 'point',
 'compliments',
 'away',
 'system',
 'side',
 'happened',
 'adventuring',
 'hey',
 'chief',
 'give',
 'bell',
 'get',
 'need',
 'talk',
 'royal',
 'visit',
 '1st',
 'june',
 'ok',
 'another',
 'number',
 'know',
 'thinkin',
 'malaria',
 'relax',
 'children',
 'cant',
 'handle',
 'malaria',
 'would',
 'worse',
 'gastroenteritis',
 'takes',
 'enough',
 'replace',
 'loss',
 'temp',
 'reduce',
 'give',
 'malaria',
 'meds',
 'vomit',
 'self',
 'limiting',
 'illness',
 'means',
 'days',
 'completely',
 'stop',
 'aiyah',
 'ok',
 'wat',
 'long',
 'got',
 'improve',
 'already',
 'wat',
 'want',
 'explicit',
 'sex',
 '30',
 'secs',
 'ring',
 '02073162414',
 'costs',
 '20pmin',
 'gsex',
 'pobox',
 '2667',
 'wc1n',
 '3xx',
 "'",
 'believe',
 'attached',
 'seeing',
 'every',
 'day',
 'know',
 'best',
 'get',
 'babe',
 'go',
 'teach',
 'class',
 'midnight',
 'sleepingand',
 'surfing',
 'ask

In [19]:
bow = pd.DataFrame(bag_of_words.todense(), columns=cv.get_feature_names_out())

In [20]:
bow

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52755,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
52758,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
bow.apply(lambda row: row / row.sum(), axis=1)

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
bag_of_words = tfidf.fit_transform(words)

In [23]:
pd.DataFrame(bag_of_words.todense(), columns=tfidf.get_feature_names_out())

Unnamed: 0,008704050406,0089my,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zebra,zed,zeros,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
pd.Series(dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))).sort_values()

call         5.515685
get          5.915103
ur           5.920284
go           6.242316
ok           6.245906
              ...    
havn        11.180380
haventcn    11.180380
hava        11.180380
healthy     11.180380
zyada       11.180380
Length: 9319, dtype: float64

In [34]:
# Model
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [35]:
df.head()

Unnamed: 0_level_0,label,original,clean,stemmed,lemmatized
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail onli bugi n great ...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,free entri 2 wkli comp win fa cup final tkt 21...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think goes usf lives around though,nah ' think goe usf live around though,nah ' think go usf life around though


In [36]:
x = df.clean
y = df.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .2, random_state = 123)

In [37]:
cv = CountVectorizer()
x_bow = cv.fit_transform(x_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_bow, y_train)

tree.score(x_bow, y_train)

0.9273053623513574

In [38]:
dfidf = TfidfVectorizer()
x_tfidf = tfidf.fit_transform(x_train)
tree.fit(x_tfidf, y_train)
tree.score(x_tfidf, y_train)

0.9517612743998205

In [41]:
pd.Series(dict(zip(cv.get_feature_names_out(), tree.feature_importances_))).sort_values().tail(15)

stop       0.002599
hates      0.002671
mobile     0.002815
quite      0.002876
18         0.003971
oh         0.005283
address    0.005382
service    0.008589
hi         0.012790
message    0.013126
86688      0.029548
reply      0.071883
text       0.109761
txt        0.262313
call       0.466394
dtype: float64

In [42]:
x_train.head()

id
385                                   took mr owl 3 licks
4003    well ' pattern emerging friends telling drive ...
1283                                   yes thought thanks
2327    urgent mobile number a2000 bonus caller prize ...
1103    aiyah sorry lor watch tv watch forgot 2 check ...
Name: clean, dtype: object

In [43]:
y_train.head()

id
385      ham
4003     ham
1283     ham
2327    spam
1103     ham
Name: label, dtype: object

In [44]:
y_train.value_counts()

ham     3843
spam     614
Name: label, dtype: int64

In [46]:
# Bigram modeling
cv = CountVectorizer(ngram_range=(2,2))
x_bow = cv.fit_transform(x_train)
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(x_bow, y_train)
tree.score(x_bow, y_train)

0.8871438187121382

In [47]:
# 
tfidf = TfidfVectorizer(ngram_range=(2,2))
x_tfidf = tfidf.fit_transform(x_train)
tree.fit(x_tfidf, y_train)
tree.score(x_tfidf, y_train)

0.8871438187121382

In [48]:
cv = CountVectorizer(ngram_range=(1,2))
x_bow = cv.fit_transform(x_train)
tree = DecisionTreeClassifier(max_depth = 5)
tree.fit(x_bow, y_train)
tree.score(x_bow, y_train)

0.9273053623513574