In [1]:
import pandas as pd
import pickle

In [2]:
df = pd.read_csv('SMSSpamCollection',sep='\t',header=None)

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.columns=['Label','Text']

In [5]:
df.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Data cleaning & Text pre processing

In [6]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Error loading stopwords: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


In [7]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
stopwords_list = ['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'only',
 'own',
 'same',
 'so',
 'than',
 'too',
 'very',
 's',
 't',
 'can',
 'will',
 'just',
 'should',
 "should've",
 'now',
 'd',
 'll',
 'm',
 'o',
 're',
 've',
 'y',
 'ain',
 'aren',
 'ma',
 'mightn',
 "mightn't",
 'mustn',
 "mustn't",
 'needn',
 "needn't",
 'shan',
 "shan't",
 'shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't"]

In [9]:
from nltk.stem.porter import PorterStemmer

In [10]:
ps = PorterStemmer()

In [11]:
corpus = []
for i in range(0,len(df['Text'])):
    review = re.sub('[^a-zA-Z0-9]',' ',df['Text'][i])
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopwords_list]
    review = ' '.join(review)
    corpus.append(review)

In [12]:
len(corpus)

5572

# Creating BOW and TF -IDF

In [13]:
## independet features and dependent

y = pd.get_dummies(df['Label'],drop_first=True)

In [14]:
y.head()

Unnamed: 0,spam
0,0
1,0
2,1
3,0
4,0


In [15]:
# train test 

from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test = train_test_split(corpus,y,test_size=0.2,random_state=42)

In [17]:
X_train

['repli win 100 weekli 2006 fifa world cup held send stop 87239 end servic',
 'hello sort town alreadi dont rush home eat nacho let know eta',
 'come guoyang go n tell u told',
 'hey sathya till dint meet not even singl time saw situat sathya',
 'orang bring rington time chart hero free hit week go rington pic wap stop receiv tip repli stop',
 'sit mu wait everyon get suit take shower',
 'finish liao u',
 'urgent mobil no 07808726822 award 2 000 bonu caller prize 02 09 03 2nd attempt contact call 0871 872 9758 box95qu',
 'probabl not still go stuff',
 'wah lucki man save money hee',
 'hey u still gym',
 'oh lk tt den take e one tt end cine lor dun wan yogasana oso',
 'ok lor',
 'still havent collect dough pl let know go place sent get control number',
 'stupid not possibl',
 'u secret admir reveal think u r special call 09065174042 opt repli reveal stop 1 50 per msg recd cust care 07821230901',
 'amaz rearrang letter give mean dormitori dirti room astronom moon starer eye see elect res

In [18]:
X_test

['squeeeeez christma hug u lik frndshp den hug back u get 3 u r cute 6 u r luvd 9 u r lucki none peopl hate u',
 'also sorta blown coupl time recent id rather not text blue look weed',
 'mmm that better got roast b better drink 2 good indian',
 'mm kanji dont eat anyth heavi ok',
 'ring come guy costum gift futur yowif hint hint',
 'sari need tim bollox hurt lot tol',
 'love isn decis feel could decid love life would much simpler less magic',
 'supervisor find 4 one lor thk student havent ask yet tell u aft ask',
 'dear good morn',
 'chennai velacheri',
 'lol grr mom take forev prescript pharmaci like 2 minut away ugh',
 'no valentin huh proof fb page ugh glad realli didn watch rupaul show tool',
 'wif buy tix lar',
 'er hello thing didn quit go plan limp slowli home follow aa exhaust hang',
 'free rington text first 87131 poli text get 87131 true tone help 0845 2814032 16 1st free tone 3x 150pw e nd txt stop',
 'sir wait call',
 'crazi ar marri like gd look guy not fren like say korea

In [19]:
y.value_counts()

spam
0       4825
1        747
dtype: int64

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X_train = cv.fit_transform(X_train).toarray()

In [21]:
X_test = cv.transform(X_test).toarray()

In [22]:
X_test.shape

(1115, 2500)

In [23]:
cv.vocabulary_

{'repli': 1804,
 'win': 2416,
 '100': 43,
 'weekli': 2387,
 'world': 2450,
 'cup': 656,
 'held': 1023,
 'send': 1884,
 'stop': 2082,
 '87239': 195,
 'end': 792,
 'servic': 1893,
 'hello': 1027,
 'sort': 2023,
 'town': 2254,
 'alreadi': 267,
 'dont': 750,
 'rush': 1843,
 'home': 1057,
 'eat': 777,
 'let': 1309,
 'know': 1258,
 'come': 588,
 'go': 958,
 'tell': 2183,
 'told': 2237,
 'hey': 1033,
 'till': 2226,
 'dint': 728,
 'meet': 1449,
 'not': 1569,
 'even': 814,
 'singl': 1949,
 'time': 2227,
 'saw': 1858,
 'situat': 1957,
 'orang': 1611,
 'bring': 456,
 'rington': 1823,
 'chart': 540,
 'hero': 1032,
 'free': 907,
 'hit': 1041,
 'week': 2385,
 'pic': 1662,
 'wap': 2369,
 'receiv': 1778,
 'sit': 1954,
 'mu': 1520,
 'wait': 2358,
 'everyon': 818,
 'get': 946,
 'suit': 2111,
 'take': 2154,
 'shower': 1929,
 'finish': 873,
 'liao': 1311,
 'urgent': 2312,
 'mobil': 1487,
 'no': 1558,
 'award': 355,
 '000': 1,
 'bonu': 431,
 'caller': 489,
 'prize': 1723,
 '02': 2,
 '03': 4,
 '2nd': 112,
 

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
Classifier = RandomForestClassifier().fit(X_train,y_train)

  Classifier = RandomForestClassifier().fit(X_train,y_train)


In [26]:
y_pred = Classifier.predict(X_test)

In [27]:
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [28]:
confusion_matrix(y_test,y_pred)

array([[966,   0],
       [ 18, 131]], dtype=int64)

In [29]:
accuracy_score(y_test,y_pred)

0.9838565022421525

In [30]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.88      0.94       149

    accuracy                           0.98      1115
   macro avg       0.99      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [31]:
pickle.dump(cv,open('vectorize.pkl','wb'))

In [32]:
pickle.dump(Classifier,open('model.pkl','wb'))

In [33]:
text = pickle.load(open('vectorize.pkl','rb'))

In [37]:
vec = text.transform(["'Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's'"]).toarray()

In [38]:
model = pickle.load(open('model.pkl','rb'))

In [40]:
model.predict(vec)[0]

1