In [1]:
import pandas as pd
import numpy as np
import nltk
import sklearn

## 1. Load the Dataset

In [22]:
# load the dataset of sms messages
df = pd.read_table("SMSSpamCollection", header=None, encoding="utf-8")

In [23]:
# print useful information about the dataset

print(df.info())
print(df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
           0                       1
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


In [24]:
# check the class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Preprocess the Data

In [25]:
# переведем названия классов в бинарный формат: 0 = ham, 1 = spam

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

# преобразуем в словарь, чтобы убедиться в однозначном соответствии меток классов и их начальных подписей
print(dict(zip(df.iloc[:,0][:10], Y[:10])))  

{'ham': 0, 'spam': 1}


In [26]:
df.loc[5572] = "spam", "Girls around you want to have a sex. Follow the link to see Yilia’s message 'Honey, let’s go out and...'"
df.loc[5573] = "spam", "Play chess with world champions! Use all the potential of the app and take a chance to play with best world players! Follow the link to"
df.loc[5574] = "spam", "'Pikelny club fans to tickle point' Hello! Last month, you purchased goods for the amount of ₽5648.73, accumulated 89 points. Have time to use them on black Friday!"
df.tail()

Unnamed: 0,0,1
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name
5572,spam,Girls around you want to have a sex. Follow th...
5573,spam,Play chess with world champions! Use all the p...
5574,spam,'Pikelny club fans to tickle point' Hello! Las...


Теперь необходимо избавиться от ненужной информации. Что она из себя представляет?
Например, почтовые адреса. Классификатору не важно, какая именно электронная почта указана в письме; важно, что она просто есть. Если мы оставим адреса в письмах (адреса, вероятнее всего, у всех разные), мы будем обладать теми признаками каждого отдельного письма, которые не присущи всем остальным. Это нам ничего не даст - напротив, лучше те места, которые различны у всех, сделать одинаковыми, чтобы не при обучении не отвлекаться на неиформативные отличия.
Примерами подобных 'лишних' элементов в письмах помимо email-адресов являются также web-адреса, номера телефонов, знаки валют, знаки пунктуации. Будем заменять их конкретные значения на обобщающие слова (например, **'svolkov@mail.ru'** - **'emailaddress'**) с помощью регулярных выражений.

In [27]:
text_messages = df[1]
print(text_messages[:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object


In [28]:
import re

processed = text_messages
for i in range(len(processed)):
    # почта
    processed[i] = re.sub(r'^.+@[^\.].*\.[a-z]{2,}$', 'emailaddress', processed[i])
    # URL
    processed[i] = re.sub(r'^http\://[a-zA-Z8-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress', processed[i])
    # денежные символы
    processed[i] = re.sub(r'[$€£₽]', 'moneysym', processed[i])
    # номера телефонов
    processed[i] = re.sub(r'[+]{,1}[(0-9)-]{11,17}', 'phonenum', processed[i])
    # знаки пунктуации
    processed[i] = re.sub(r'[^\w\s\d]', '', processed[i])
    # пробелы между токенами
    processed[i] = re.sub(r'\s+', ' ', processed[i])
    # пробелы, символы табуляции
    processed[i] = processed[i].strip()
#     замена цифр
#     processed[i] = re.sub(r'\d+(\.\d+)?', 'numbr', processed[i])
    

In [29]:
processed = processed.str.lower()
print(processed.head())

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in 2 a wkly comp to win fa cup fina...
3          u dun say so early hor u c already then say
4    nah i dont think he goes to usf he lives aroun...
Name: 1, dtype: object


In [30]:
from nltk.corpus import stopwords

# удаляем стоп-слова
stop_words = set(stopwords.words("english"))
processed = processed.apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

In [31]:
# находим основы слов (осуществим стемминг методом Портера)

ps = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))
processed

0       go jurong point crazi avail bugi n great world...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkt 21...
3                     u dun say earli hor u c alreadi say
4               nah dont think goe usf live around though
                              ...                        
5570    guy bitch act like id interest buy someth els ...
5571                                       rofl true name
5572    girl around want sex follow link see yilia mes...
5573    play chess world champion use potenti app take...
5574    pikelni club fan tickl point hello last month ...
Name: 1, Length: 5575, dtype: object

In [32]:
from nltk.tokenize import word_tokenize

all_words = []
for message in processed:
    words = word_tokenize(message)
    for word in words:
        all_words.append(word)
    
    
all_words = nltk.FreqDist(all_words)  # список кортежей с токенами и их частотами

In [33]:
len(all_words)

7897

In [34]:
# определим клюяевые признаки, по которым будем определять принадлежность к классу

word_features = [i[0] for i in all_words.most_common(1500)]
# word_features[-1]
word_features

['u',
 'call',
 '2',
 'im',
 'go',
 'get',
 'phonenum',
 'ur',
 'come',
 'dont',
 '4',
 'ok',
 'ltgt',
 'free',
 'know',
 'like',
 'got',
 'love',
 'want',
 'ill',
 'day',
 'time',
 'good',
 'text',
 'send',
 'need',
 'one',
 'txt',
 'see',
 'today',
 'ü',
 'think',
 'home',
 'take',
 'lor',
 'stop',
 'repli',
 'tell',
 'sorri',
 'still',
 'r',
 'back',
 'mobil',
 'make',
 'n',
 'phone',
 'say',
 'new',
 'work',
 'pleas',
 'well',
 'week',
 'later',
 'hi',
 'da',
 'ask',
 'miss',
 'cant',
 'hope',
 'meet',
 'happi',
 'night',
 'tri',
 'give',
 'claim',
 'wait',
 'thing',
 'oh',
 'much',
 'great',
 'hey',
 'pl',
 'dear',
 'wat',
 'messag',
 'number',
 'na',
 'friend',
 'thank',
 'that',
 'way',
 'prize',
 'right',
 'feel',
 'let',
 'msg',
 'wan',
 'even',
 'pick',
 'alreadi',
 'tomorrow',
 'said',
 'ye',
 'realli',
 'yeah',
 'min',
 'e',
 'amp',
 'leav',
 'care',
 'co',
 'didnt',
 'babe',
 '1',
 'morn',
 'win',
 'last',
 'c',
 'life',
 'sure',
 'servic',
 'ive',
 'anyth',
 'would',
 'ke

In [35]:
def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

features = find_features(processed[0])
for key, value in features.items():
    if value==True:
        print(key)

go
got
n
great
wat
e
world
point
avail
crazi
bugi
la
cine


In [36]:
messages = zip(processed, Y)

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(list(messages))

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in zip(processed, Y)]

In [42]:
from sklearn import model_selection

# split the data into training and testing datasets
# training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=seed)
training, testing = featuresets[:4500], featuresets[4500:]

In [43]:
print(len(training))
print(len(testing))

4500
1072


In [44]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

model = SklearnClassifier(SVC(kernel = 'linear'))

# train the model on the training data
model.train(training)

# and test on the testing dataset!
accuracy = nltk.classify.accuracy(model, testing)*100
print("SVC Accuracy: {}".format(accuracy))

SVC Accuracy: 98.32089552238806


In [48]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))


K Nearest Neighbors Accuracy: 93.28358208955224
Decision Tree Accuracy: 97.10820895522389
Random Forest Accuracy: 98.0410447761194
Logistic Regression Accuracy: 98.32089552238806
SGD Classifier Accuracy: 98.0410447761194
Naive Bayes Accuracy: 97.48134328358209
SVM Linear Accuracy: 98.32089552238806


In [67]:
dir(nltk_model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_clf',
 '_encoder',
 '_make_probdist',
 '_vectorizer',
 'classify',
 'classify_many',
 'labels',
 'prob_classify',
 'prob_classify_many',
 'train']

In [69]:
nltk_model.classify(df[-1])

KeyError: -1

In [75]:
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
# accuracy = nltk.classify.accuracy(nltk_model, testing)*100
# print("Voting Classifier: Accuracy: {}".format(accuracy))