In [4]:
import pandas as pd
import numpy as np

In [5]:
ds = pd.read_csv("smsspamcollection/SMSSpamCollection", 
                 delimiter="\t", names=("is_positive", "text"))

In [6]:
ds.head()

Unnamed: 0,is_positive,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
for index, row in ds.iterrows():
    if row[0] == "ham":
        row[0] = 0
    else:
        row[0] = 1

In [8]:
ds.is_positive.value_counts()

0    4825
1     747
Name: is_positive, dtype: int64

In [13]:
texts = []
labels = []
for index, row in ds.iterrows():
    labels.append(row[0])
    texts.append(row[1])

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vectorizer = CountVectorizer()#ngram_range=(1,2))
X = vectorizer.fit_transform(texts)

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [17]:
cls = LogisticRegression()
res = cross_val_score(cls, X, labels, scoring="f1", cv=10)
print(np.mean(res))

0.932640298361


In [18]:
cls.fit(X, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
messages = ["FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! Subscribe6GB", "FreeMsg: Txt: claim your reward of 3 hours talk time", "Have you visited the last lecture on physics?", "Have you visited the last lecture on physics? Just buy this book and you will have all materials! Only 99$", "Only 99$"]
sample = vectorizer.transform(messages)
cls.predict(sample)

array([1, 1, 0, 0, 0])

Классификатор предсказал: 1 - спам, 2 - спам, 3 - не спам, 4 - не спам, 5 - не спам

In [22]:
def get_accuracy(estimator,ngram1,ngram2):
    vectorizer = CountVectorizer(ngram_range=(ngram1,ngram2))
    X = vectorizer.fit_transform(texts)
    res = cross_val_score(estimator, X, labels, scoring="f1", cv=10)
    print("{:.2f}".format(np.mean(res)))

In [23]:
get_accuracy(LogisticRegression(),3,3)
get_accuracy(LogisticRegression(),1,3)
get_accuracy(LogisticRegression(),2,2)

0.73
0.93
0.82


In [25]:
from sklearn.naive_bayes import MultinomialNB
get_accuracy(MultinomialNB(),3,3)
get_accuracy(MultinomialNB(),1,3)
get_accuracy(MultinomialNB(),2,2)

0.38
0.89
0.65


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(ds.text)

In [28]:
cls = LogisticRegression()
res = cross_val_score(cls, X_tfidf, labels, scoring="f1", cv=10)
print(np.mean(res))

0.852859955417


Вывод: Качество на кросс-валидации по сравнению с CountVectorizer на униграммах понизилось. Это можно объяснить тем, что признаки tf * idf учитывают значимость конкретного слова в данном документе, а CountVectorizer учитывает частоту употребления данного слова во всех документах. Конкретно для классификации на спам/не спам это оказалось важным фактором.