In [126]:
## ref: https://zhuanlan.zhihu.com/p/28059124

In [127]:
import pandas as pd
import numpy as np
import jieba

## load data

In [58]:
data = pd.read_csv(r"https://raw.githubusercontent.com/wandouqiang/RubbishMessage/master/data/80w.txt",encoding='utf-8',sep='\t',header=None)

In [59]:
## 最后一列为短信的内容，倒数第二列则是短信的类型，0表示正常短信，1表示垃圾短信。

In [60]:
data.head()

Unnamed: 0,0,1,2
0,1,0,商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一
1,2,1,南口阿玛施新春第一批限量春装到店啦   春暖花开淑女裙、冰蓝色公主衫 ...
2,3,0,带给我们大常州一场壮观的视觉盛宴
3,4,0,有原因不明的泌尿系统结石等
4,5,0,23年从盐城拉回来的麻麻的嫁妆


In [61]:
data.shape

(754843, 3)

## preprocessing

In [41]:
import jieba
spam = data[data[1] == 1]
spam[2] = spam[2].map(lambda x:' '.join(jieba.cut(x)))
spam.head()
# 正常短信
normal = data[data[1] == 0]
normal[2] = normal[2].map(lambda x:' '.join(jieba.cut(x)))
normal.head()

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.147 seconds.
Prefix dict has been built succesfully.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,0,1,2
0,1,0,商业秘密 的 秘密性 那 是 维系 其 商业价值 和 垄断 地位 的 前提条件 之一
2,3,0,带给 我们 大 常州 一场 壮观 的 视觉 盛宴
3,4,0,有 原因 不明 的 泌尿系统 结石 等
4,5,0,23 年 从 盐城 拉回来 的 麻麻 的 嫁妆
5,6,0,感到 自 减肥 、 跳 减肥 健美操 、


In [42]:
spam.to_csv('spam.csv',encoding='utf-8',header=False,index=False,columns=[2])
normal.to_csv('normal.csv',encoding='utf-8',header=False,index=False,columns=[2])

## training

In [2]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import PlaintextCorpusReader
import random

In [3]:
message_corpus = PlaintextCorpusReader('./',['spam.csv','normal.csv'])
all_message = message_corpus.words()

In [4]:
def massage_feature(word,num_letter=1):
    return {'feature':word[-num_letter:]}

In [5]:
labels_name = ([(massage,'垃圾') for massage in message_corpus.words('spam.csv')]+[(massage,'正常') for massage in message_corpus.words('normal.csv')])
random.seed(7)
random.shuffle(labels_name)

In [None]:
from nltk.classify import accuracy as nltk_accuracy
featuresets = [(massage_feature(n),massage) for (n,massage) in labels_name]
train_set,test_set = featuresets[2000:],featuresets[:2000]
classifier = NaiveBayesClassifier.train(train_set)

In [None]:
print('结果准确率：',str(100*nltk_accuracy(classifier,test_set))+str('%'))

## implementation using scikit-learn

In [128]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score

In [129]:
data = pd.read_csv(r"https://raw.githubusercontent.com/wandouqiang/RubbishMessage/master/data/80w.txt",encoding='utf-8',sep='\t',header=None, nrows=10000)

In [130]:
data.shape

(10000, 3)

In [131]:
target, corpus = data[1], data[2].map(lambda x:' '.join(jieba.cut(x)))

In [133]:
cv=CountVectorizer(analyzer='word', min_df = 3)
transformer=TfidfTransformer()
corpus=transformer.fit_transform(cv.fit_transform(corpus))

In [134]:
corpus.shape

(10000, 5343)

In [135]:
## target imbalanced
np.sum(target.values) * 1.0 / 10000

0.096600000000000005

In [136]:
X_train, X_test, y_train, y_test = train_test_split(corpus.toarray(), target)

In [137]:
nb = GaussianNB()

In [138]:
nb.fit(X_train, y_train)

GaussianNB(priors=None)

In [139]:
accuracy_score(y_test, nb.predict(X_test))  ## accuracy is meaningless

0.86639999999999995

In [140]:
roc_auc_score(y_test, nb.predict(X_test))

0.81616458046355633