In [1]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics



# 初始化SC与logger

In [2]:
# Build the Spark configuration and obtain the SparkContext.
# getOrCreate() returns the context already running in this kernel if there is
# one, so re-executing this cell does not fail the way a second call to the
# SparkContext constructor does.
sparkConf = SparkConf().setAppName("NaiveBayes_Test")
sc = SparkContext.getOrCreate(conf = sparkConf)
sc.master

'local[*]'

In [3]:
# Set the log4j level to ERROR for the "org" and "akka" loggers and for the
# root logger, via the JVM gateway of the SparkContext.
logger = sc._jvm.org.apache.log4j
error_level = logger.Level.ERROR
for logger_name in ("org", "akka"):
    logger.LogManager.getLogger(logger_name).setLevel(error_level)
logger.LogManager.getRootLogger().setLevel(error_level)

# 导入文件

In [4]:
# Base directory of the project, given as a file:// URI.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
path = 'file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/'
# Read data/train.tsv as an RDD of text lines and mark it for caching,
# because the following cells read it several times.
data = sc.textFile(path+'data/train.tsv')
data.cache()

file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/data/train.tsv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

Each line of train.tsv is tab-separated: page url \t numeric id \t page text (JSON with title/body) \t 23 feature columns \t label

# 处理数据

In [5]:
# Build the list of feature names from the header line: strip the double
# quotes, split on tabs and drop the first three columns.
header = data.first().replace('"','').split('\t')[3:]
header

['alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label']

In [6]:
# Drop the header: keep only the lines that differ from the first line of the file.
line_first = data.first()
rowdata = data.filter(lambda x:x!=line_first)
rowdata.first()

'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"\t"4042"\t"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest 

In [7]:
# Remove every double-quote character (") from the data lines.
data_train = rowdata.map(lambda x:x.replace('"','')
                        )#.map(lambda x:x.replace('?','0'))
data_train.first()

'http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html\t4042\t{title:IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries,body:A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest provider of compute

In [8]:
# Split each line on '\t', discard the first 3 columns and convert the
# remaining fields to an np.array (of strings at this point).
data_train = data_train.map(lambda x:x.split('\t')[3:]
                           ).map(lambda x:np.array(x))
data_train.first()

array(['business', '0.789131', '2.055555556', '0.676470588', '0.205882353',
       '0.047058824', '0.023529412', '0.443783175', '0', '0', '0.09077381',
       '0', '0.245831182', '0.003883495', '1', '1', '24', '0', '5424',
       '170', '8', '0.152941176', '0.079129575', '0'], 
      dtype='<U11')

In [9]:
# Number of data rows.
data_train.count()

7395

In [10]:
# Extract the labels (last field of each row, converted to int); they are
# used later when the LabeledPoint objects are built.
labels = data_train.map(lambda x:int(x[-1]))
labels.first()

0

In [11]:
# For every column, count its distinct values and report whether the
# unknown-value marker '?' occurs among them.
t_size = len(data_train.first())
for i in range(t_size):
    # The distinct values of column i are collected to the driver as a set.
    line_dis = set(data_train.map(lambda x:x[i]).distinct().collect())
    print('特征: {2}--{0} \t 有[{1}]个取值'.format(header[i],len(line_dis),i))
    if '?' in line_dis:
        print('特征: ---{0} \t 有 ? ---------'.format(header[i]))

特征: 0--alchemy_category 	 有[14]个取值
特征: ---alchemy_category 	 有 ? ---------
特征: 1--alchemy_category_score 	 有[4806]个取值
特征: ---alchemy_category_score 	 有 ? ---------
特征: 2--avglinksize 	 有[5710]个取值
特征: 3--commonlinkratio_1 	 有[4476]个取值
特征: 4--commonlinkratio_2 	 有[4038]个取值
特征: 5--commonlinkratio_3 	 有[3266]个取值
特征: 6--commonlinkratio_4 	 有[2695]个取值
特征: 7--compression_ratio 	 有[6453]个取值
特征: 8--embed_ratio 	 有[366]个取值
特征: 9--framebased 	 有[1]个取值
特征: 10--frameTagRatio 	 有[5911]个取值
特征: 11--hasDomainLink 	 有[2]个取值
特征: 12--html_ratio 	 有[7376]个取值
特征: 13--image_ratio 	 有[5418]个取值
特征: 14--is_news 	 有[2]个取值
特征: ---is_news 	 有 ? ---------
特征: 15--lengthyLinkDomain 	 有[2]个取值
特征: 16--linkwordscore 	 有[101]个取值
特征: 17--news_front_page 	 有[3]个取值
特征: ---news_front_page 	 有 ? ---------
特征: 18--non_markup_alphanum_characters 	 有[5301]个取值
特征: 19--numberOfLinks 	 有[702]个取值
特征: 20--numwords_in_url 	 有[23]个取值
特征: 21--parametrizedLinkRatio 	 有[3922]个取值
特征: 22--spelling_errors_ratio 	 有[4219]个取值
特征: 23--label 	 

In [40]:
# Show the value range of one column after discarding the unknown marker '?'.
def min_max(_id):
    """Print the minimum and maximum of column `_id` of `data_train`.

    The column's distinct values are collected to the driver, the
    unknown-value marker '?' is dropped if present, and the remaining
    strings are converted to float before min/max are printed on one line.
    """
    distinct_values = set(data_train.map(lambda row: row[_id]).distinct().collect())
    # discard() is a no-op when '?' is absent.
    distinct_values.discard('?')

    numeric_values = np.array(list(distinct_values)).astype(float)
    print('min =', numeric_values.min(), end='\t')
    print('max =', numeric_values.max())

In [41]:
# Print the value range of every column except column 0, ignoring '?' entries.
t_size = len(data_train.first())
for i in range(1, t_size):
    print('特征: {0}--{1}'.format(i, header[i]))
    min_max(i)

特征: 1--alchemy_category_score
min = 0.0708333	max = 0.999426
特征: 2--avglinksize
min = 0.0	max = 363.0
特征: 3--commonlinkratio_1
min = 0.0	max = 1.0
特征: 4--commonlinkratio_2
min = 0.0	max = 1.0
特征: 5--commonlinkratio_3
min = 0.0	max = 0.980392157
特征: 6--commonlinkratio_4
min = 0.0	max = 0.980392157
特征: 7--compression_ratio
min = 0.0	max = 21.0
特征: 8--embed_ratio
min = -1.0	max = 0.25
特征: 9--framebased
min = 0.0	max = 0.0
特征: 10--frameTagRatio
min = 0.0	max = 0.444444444
特征: 11--hasDomainLink
min = 0.0	max = 1.0
特征: 12--html_ratio
min = 0.045564223	max = 0.716883117
特征: 13--image_ratio
min = -1.0	max = 113.3333333
特征: 14--is_news
min = 1.0	max = 1.0
特征: 15--lengthyLinkDomain
min = 0.0	max = 1.0
特征: 16--linkwordscore
min = 0.0	max = 100.0
特征: 17--news_front_page
min = 0.0	max = 1.0
特征: 18--non_markup_alphanum_characters
min = 0.0	max = 207952.0
特征: 19--numberOfLinks
min = 1.0	max = 4997.0
特征: 20--numwords_in_url
min = 0.0	max = 22.0
特征: 21--parametrizedLinkRatio
min = 0.0	max = 1.0
特征: 22-

In [43]:
# Helper for the numeric features: replaces the unknown-value marker with 0,
# converts the row to float and shifts every value by +1.
def extract_num_features(data, _id, _val,):
    """Convert one row of numeric features from strings to shifted floats.

    Args:
        data: 1-D array (or sequence) holding the numeric fields of one row
            as strings.
        _id: position in `data` that may hold the unknown-value marker.
        _val: the unknown-value marker (e.g. '?'); when found at `_id` it is
            replaced by 0 before conversion.

    Returns:
        A new float np.ndarray with the cleaned values plus 1. The input is
        left unmodified.
    """
    # Work on a copy so the caller's row is never modified in place.
    values = np.array(data, copy=True)
    if values[_id] == _val:
        values[_id] = 0
    # Use the builtin `float`: the `np.float` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    # NOTE(review): the +1 shift presumably keeps the features non-negative
    # for NaiveBayes — confirm.
    return values.astype(float) + 1

In [44]:
# Indices (within each parsed row) of the columns treated as numeric features.
num_id = [1,2,3,4,5,6,7,8, 10, 12,13, 16, 18,19, 21,22]
# Select the numeric columns of every row.
num_features = data_train.map(lambda x:x[num_id])
# Clean and convert them; position 0 of the selection (row column 1) is the
# one checked for the '?' marker.
num_features = num_features.map(lambda x:extract_num_features(x,0,'?'))
num_features.first()

array([  1.78913100e+00,   3.05555556e+00,   1.67647059e+00,
         1.20588235e+00,   1.04705882e+00,   1.02352941e+00,
         1.44378318e+00,   1.00000000e+00,   1.09077381e+00,
         1.24583118e+00,   1.00388349e+00,   2.50000000e+01,
         5.42500000e+03,   1.71000000e+02,   1.15294118e+00,
         1.07912958e+00])

withMean=False

In [45]:
# Scale the numeric features with StandardScaler. withMean=False, so the mean
# is NOT subtracted; only the withStd=True scaling is applied.
std_scaler = StandardScaler(withMean=False, withStd=True
                           ).fit(num_features)
# Apply the fitted scaler and turn each result back into an np.array.
num_features = std_scaler.transform(num_features
                                   ).map(lambda x:np.array(x))
num_features.first()

array([  5.40075942,   0.35448133,   8.2530532 ,   8.21766191,
        10.90933855,  14.09262299,   0.25310378,   3.26216293,
        26.31818022,  23.73619231,   0.52304126,   1.22590479,
         0.61123782,   0.95282567,   6.2904057 ,  13.62006239])

In [16]:
# Build one shared index over all categorical values. Every (column, value)
# pair becomes a key '<column>_<value>', so equal raw values coming from
# different columns get different keys.
cat_id = [0,9,11,14,15,17,20]
catgories_list = []
for col_idx in cat_id:
    # Distinct raw values of this column, each prefixed with the column index.
    catgories_list += [
        '{0}_{1}'.format(col_idx, raw_value)
        for raw_value in data_train.map(lambda row: row[col_idx]).distinct().collect()
    ]
# Map every key to its position in the list: {key: index}. The index of a key
# therefore follows the order in which distinct().collect() returned it.
catgories_dict = dict(zip(catgories_list, range(len(catgories_list))))
catgories_dict

{'0_?': 2,
 '0_arts_entertainment': 0,
 '0_business': 6,
 '0_computer_internet': 1,
 '0_culture_politics': 10,
 '0_gaming': 7,
 '0_health': 13,
 '0_law_crime': 8,
 '0_recreation': 12,
 '0_religion': 4,
 '0_science_technology': 11,
 '0_sports': 9,
 '0_unknown': 5,
 '0_weather': 3,
 '11_0': 15,
 '11_1': 16,
 '14_1': 17,
 '14_?': 18,
 '15_0': 20,
 '15_1': 19,
 '17_0': 23,
 '17_1': 21,
 '17_?': 22,
 '20_0': 29,
 '20_1': 33,
 '20_10': 24,
 '20_11': 43,
 '20_12': 31,
 '20_13': 41,
 '20_14': 26,
 '20_15': 45,
 '20_16': 35,
 '20_17': 27,
 '20_18': 38,
 '20_19': 37,
 '20_2': 44,
 '20_20': 32,
 '20_21': 34,
 '20_22': 28,
 '20_3': 40,
 '20_4': 25,
 '20_5': 39,
 '20_6': 46,
 '20_7': 42,
 '20_8': 30,
 '20_9': 36,
 '9_0': 14}

In [17]:
# One-hot encoding helper for the categorical columns.
def extract_cat_features(data, _id_l, dict_,):
    """One-hot encode the categorical fields of one row.

    Args:
        data: the categorical fields of one row, in the same order as `_id_l`.
        _id_l: the column ids the fields came from; used as key prefixes.
        dict_: mapping from '<column id>_<value>' to its position in the
            output vector.

    Returns:
        A float np.ndarray of length len(dict_) with 1 at the position of
        every (column id, value) pair of the row and 0 elsewhere.

    Raises:
        KeyError: if a '<column id>_<value>' key is not present in `dict_`.
    """
    one_hot = np.zeros(len(dict_))
    for position, column_id in enumerate(_id_l):
        key = '{0}_{1}'.format(column_id, data[position])
        one_hot[dict_[key]] = 1
    return one_hot

In [18]:
# One-hot encode the categorical columns: select them from every row, then
# encode them against the shared key -> index dictionary.
cat_features = data_train.map(lambda x:x[cat_id])
cat_features = cat_features.map(lambda x:extract_cat_features(x,cat_id,catgories_dict))
cat_features.first()

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [19]:
# Scratch check: np.array applied to a tuple of two 3-element lists.
np.array(([6,4,7], [8,9,0]))

array([[6, 4, 7],
       [8, 9, 0]])

In [46]:
# Preview: pair each row's numeric features with its one-hot features and
# concatenate the two arrays into a single feature vector.
num_features.zip(cat_features).map(lambda x:np.concatenate((x[0],x[1]))).first()

array([  5.40075942,   0.35448133,   8.2530532 ,   8.21766191,
        10.90933855,  14.09262299,   0.25310378,   3.26216293,
        26.31818022,  23.73619231,   0.52304126,   1.22590479,
         0.61123782,   0.95282567,   6.2904057 ,  13.62006239,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   1.        ,
         0.        ,   1.        ,   0.        ,   1.        ,
         0.        ,   0.        ,   0.        ,   1.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

# 生成训练数据

In [47]:
# Assemble the training set: concatenate numeric and one-hot features per row,
# pair the result with the row's label and wrap both in a LabeledPoint.
train_data = (
    labels
    .zip(
        num_features
        .zip(cat_features)
        .map(lambda parts: np.concatenate((parts[0], parts[1])))
    )
    .map(lambda labelled: LabeledPoint(labelled[0], labelled[1]))
)
train_data.first()

LabeledPoint(0.0, [5.400759415,0.354481330538,8.25305319902,8.21766190613,10.9093385542,14.0926229922,0.253103777329,3.26216292622,26.318180219,23.7361923148,0.523041255079,1.22590479417,0.611237823404,0.952825667702,6.29040570308,13.6200623886,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

# 训练与预测与评价

In [48]:
# Mark the training RDD for caching; it is reused by the training and
# prediction cells below.
train_data.cache()

PythonRDD[852] at RDD at PythonRDD.scala:49

In [49]:
# Create and train the Naive Bayes model on the training set.
model = NaiveBayes.train(train_data,)

In [50]:
# Predict the class of a single hand-written feature vector.
# NOTE(review): several of the leading entries are negative; presumably this
# vector was produced with a different scaling than the features the model
# was trained on — confirm.
model.predict([1.1376473365,-0.9819355716929,0.02513981289,-0.0558635644254,-0.468893253129,-0.354305326308,-0.317535217236,0.33845079824,0.828822173315,0.229639823578,-0.141625969099,-0.297996816496,-0.0329672096969,-0.0487811297558,-0.108698488525,-0.278820782314,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

1.0

In [51]:
# Get the predictions for the feature vectors of the training set itself,
# converted to float.
pre = model.predict(train_data.map(lambda x:x.features)).map(float)

In [52]:
# Pair every prediction with the true label: (prediction, label).
scoreAndLabels = pre.zip(train_data.map(lambda p: p.label))

In [53]:
# Peek at the first (prediction, label) pair.
scoreAndLabels.first()

(1.0, 0.0)

In [54]:
# Build the binary-classification evaluator from the (prediction, label) pairs.
# NOTE(review): the predictions were made on the same data the model was
# trained on, so the metrics below describe training fit, not held-out
# performance. Presumably model.predict yields class labels rather than
# continuous scores — confirm that this is what the ROC/PR metrics should use.
metrics = BinaryClassificationMetrics(scoreAndLabels)

In [55]:
# Area under the ROC curve.
metrics.areaUnderROC

0.6524470706796848

In [56]:
# Area under the precision-recall curve.
metrics.areaUnderPR

0.6338904813957008

In [57]:
# Persist the trained model under the 'NaiveBayes' entry of the project directory.
model.save(sc, path+'NaiveBayes')

In [60]:
# Inspect the model's `theta` array.
model.theta

array([[ -3.41119855,  -5.70791131,  -2.89869515,  -2.7620917 ,
         -2.44265257,  -2.2000044 ,  -5.32322032,  -3.80580312,
         -1.61896142,  -1.70524643,  -5.24650372,  -4.33257734,
         -5.47307058,  -4.94865353,  -3.0102814 ,  -2.22862235,
         -6.66967418,  -7.64153477,  -5.99098919, -11.44374291,
         -9.2919807 , -11.44374291,  -7.51191727,  -9.16136052,
        -10.10874184,  -7.33944801,  -7.8220722 ,  -7.99058578,
         -7.08960147,  -7.67328346,  -4.86449169,  -4.8866805 ,
         -8.67115418,  -5.35696818,  -5.80823927,  -5.30385835,
         -5.89778452,  -7.99693501,  -6.607461  ,  -5.11046328,
         -8.25739027,  -7.0153099 ,  -9.9176866 , -11.26142135,
        -13.05318082,  -7.16153661,  -7.50410473,  -9.01012955,
        -12.36003364,  -7.44005271, -12.36003364, -11.10727067,
         -7.75987599, -11.95456853, -11.44374291,  -7.15602695,
         -7.16985843,  -9.16136052,  -7.19239459,  -8.55337115,
         -7.3427538 , -10.41412349,  -7.

In [59]:
# List the attributes and methods available on the model object.
dir(model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'labels',
 'load',
 'pi',
 'predict',
 'save',
 'theta']