In [0]:
!pip install rasa

In [0]:
import io
import regex as re
import nltk
from zipfile import ZipFile
from urllib.request import urlopen
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.cluster import KMeans

In [0]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'

In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
stoplist = set(stopwords.words('english'))

In [5]:
resp = urlopen(url)
zipfile = ZipFile(io.BytesIO(resp.read()))
zipfile.namelist()

['SMSSpamCollection', 'readme']

In [0]:
smspam = pd.read_csv(zipfile.open('SMSSpamCollection'),sep = '\t',header=None)

In [16]:
smspam

Unnamed: 0,0,1,processd
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,nd time tried contact u u pound prize claim ea...
5568,ham,Will ü b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


In [0]:
def prepipe(text):
  text=re.sub('[^a-zA-Z]', " ", str(text))
  text=text.lower()
  data = ' '.join([w for w in word_tokenize(text) if w not in stoplist])
  return data

In [0]:
smspam['processd'] = smspam[1].apply(prepipe)

In [9]:
smspam['processd']

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry wkly comp win fa cup final tkts st ...
3                     u dun say early hor u c already say
4                  nah think goes usf lives around though
                              ...                        
5567    nd time tried contact u u pound prize claim ea...
5568                            b going esplanade fr home
5569                                pity mood suggestions
5570    guy bitching acted like interested buying some...
5571                                       rofl true name
Name: processd, Length: 5572, dtype: object

In [0]:
rawcorpus = ['{}'.format(i) for i in smspam['processd']]

In [0]:
rawcorpus

In [0]:
vectorizer = TfidfVectorizer()

In [0]:
X = vectorizer.fit_transform(rawcorpus)

In [0]:
ft_names = vectorizer.get_feature_names()

In [0]:
dense = X.todense().tolist()

In [0]:
df = pd.DataFrame(dense,columns=ft_names)

In [0]:
dframe = pd.concat([df,smspam[0]],1,sort=False)

In [0]:
dframe = dframe.rename(columns = {0 :"Class"},errors = 'raise')

In [0]:
rfc = RandomForestClassifier()
X = dframe.drop(columns=['Class'])
y = dframe["Class"]

In [0]:
X_train,X_test,y_train,y_test = tts(X,y,test_size = 0.3)  

In [0]:
rfc.fit(X_train,y_train)
y_pred = rfc.predict(X_test)

In [44]:
print(classification_report(y_test,y_pred))
accuracy_score(y_test,y_pred)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1449
        spam       1.00      0.78      0.87       223

    accuracy                           0.97      1672
   macro avg       0.98      0.89      0.93      1672
weighted avg       0.97      0.97      0.97      1672



0.9700956937799043

In [45]:
print(rfc.predict(X.iloc[[1]]))

['ham']


In [0]:
unseen = vectorizer.transform(['This is the 2nd time we have tried 2 contact u. U have won the £750 Pound prize. 2 claim is easy, call 087187272008 NOW1! Only 10p per minute. BT-national-rate.'])

In [47]:
rfc.fit(X,smspam[0]).predict(unseen)

array(['spam'], dtype=object)