In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
#1.讀入 spam.csv
#資料來源:https://www.kaggle.com/uciml/sms-spam-collection-dataset
filepath='spam.csv'
data_rawSMS = pd.read_csv(filepath,usecols=[0,1],encoding='latin-1')
data_rawSMS.columns=['label','content']
data_rawSMS

Unnamed: 0,label,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
#2.計算 tfidf分數，取字詞特徵
def generate_key_list(size_table=200,ignore=3):
    dict_spam_raw = dict()
    dict_ham_raw = dict()
    dict_IDF = dict()
    
    # ignore all other than letters.
    for i in range(data_rawSMS.shape[0]):
        finds = re.findall('[A-Za-z]+', data_rawSMS.iloc[i].content)
        if data_rawSMS.iloc[i].label == 'spam':
            for find in finds:
                if len(find)<ignore: continue
                    
                find = find.lower()
                try:
                    dict_spam_raw[find] = dict_spam_raw[find] + 1
                except:
                    dict_spam_raw[find] = dict_spam_raw.get(find,1)
                    dict_ham_raw[find] = dict_ham_raw.get(find,0)
        else:
            for find in finds:
                if len(find)<ignore: continue
                    
                find = find.lower()
                try:
                    dict_ham_raw[find] = dict_ham_raw[find] + 1
                except:
                    dict_ham_raw[find] = dict_ham_raw.get(find,1)
                    dict_spam_raw[find] = dict_spam_raw.get(find,0)

        word_set = set()
        for find in finds:
            if len(find)<ignore: continue
                
            find = find.lower()
            if not(find in word_set):
                try:
                    dict_IDF[find] = dict_IDF[find] + 1
                except:
                    dict_IDF[find] = dict_IDF.get(find,1)
            word_set.add(find)
    
    word_df = pd.DataFrame(list(zip(dict_ham_raw.keys(),
            dict_ham_raw.values(),dict_spam_raw.values(),dict_IDF.values())))
    word_df.columns = ['keyword','ham','spam','IDF']
    
    word_df['ham'] = word_df['ham'].astype('float') / data_rawSMS[data_rawSMS['label']=='ham'].shape[0]
    word_df['spam'] = word_df['spam'].astype('float') / data_rawSMS[data_rawSMS['label']=='spam'].shape[0]
    word_df['IDF'] = np.log10(word_df.shape[0] / word_df['IDF'].astype('float'))
    word_df['ham_IDF'] = word_df['ham'] * word_df['IDF']
    word_df['spam_IDF'] = word_df['spam'] * word_df['IDF']
    word_df['diff']=word_df['spam_IDF'] - word_df['ham_IDF']
    
    selected_spam_key = word_df.sort_values('diff',ascending=False)
    keyword_dict = dict()
    i = 0
    for word in selected_spam_key.head(size_table).keyword:
        keyword_dict.update({word.strip():i})
        i+=1
    return keyword_dict

In [4]:
#取 300個特徵，忽略 3個字數的單字
size_table = 300
word_len_ignored = 3
keyword_dict=generate_key_list(size_table, word_len_ignored)

In [5]:
#3.資料轉成特徵向量
def convert_Content(content, keyword):
    m = len(keyword)
    res = np.int_(np.zeros(m))
    finds = re.findall('[A-Za-z]+', content)
    for find in finds:
        find=find.lower()
        try:
            i = keyword[find]
            res[i]=1
        except:
            continue
    return res

def raw2feature(data,keyword):
    n_train = data.shape[0]
    m = len(keyword)
    x_data = np.zeros((n_train,m))
    y_data = np.int_(data.label=='spam')
    for i in range(n_train):
        x_data[i,:] = convert_Content(data.iloc[i].content, keyword)
    
    return [x_data,y_data]

In [6]:
data_rawSMS=raw2feature(data_rawSMS,keyword_dict)

In [7]:
print(len(data_rawSMS))
print(data_rawSMS[0])
print(data_rawSMS[0].shape)
print(data_rawSMS[1])
print(data_rawSMS[1].shape)

2
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(5572, 300)
[0 0 1 ... 0 0 0]
(5572,)


In [8]:
print(len(data_rawSMS))
print(data_rawSMS[0][2])
print(data_rawSMS[1][2])

2
[0. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
1


In [9]:
df_sms = pd.DataFrame(data_rawSMS[0],columns=['V'+str(x) for x in range(300)])
df_sms['label']=data_rawSMS[1]
df_sms

Unnamed: 0,V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V291,V292,V293,V294,V295,V296,V297,V298,V299,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
5570,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

In [11]:
spark = SparkSession.builder \
        .appName('sms') \
        .master('local') \
        .getOrCreate()

In [12]:
df_sms = spark.createDataFrame(df_sms)
df_sms.printSchema()

root
 |-- V0: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nullabl

In [13]:
feature = VectorAssembler(inputCols=df_sms.columns[:-1],outputCol="features")
feature_vector= feature.transform(df_sms)

In [14]:
(trainData, testData) = feature_vector.randomSplit([0.8, 0.2],seed = 42)

In [15]:
#4.訓練分類器
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(labelCol="label", featuresCol="features")
nb_model = nb.fit(trainData)
nb_prediction = nb_model.transform(testData)
nb_prediction.select("prediction", "label", "features").show()

+----------+-----+-----------+
|prediction|label|   features|
+----------+-----+-----------+
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
|       0.0|    0|(300,[],[])|
+----------+-----+-----------+
only showing top 20 rows



In [16]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

In [17]:
nb_accuracy = evaluator.evaluate(nb_prediction)
print("Accuracy of NaiveBayes is  = %g"% (nb_accuracy))
print("Test Error of NaiveBayes  = %g " % (1.0 - nb_accuracy))

Accuracy of NaiveBayes is  = 0.968313
Test Error of NaiveBayes  = 0.0316869 
