In [52]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import numpy as np

"""import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics
"""
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row

from pyspark.sql import udf
from pyspark.sql.functions import  col
import pyspark.sql.types

from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

In [5]:
# Build the Spark configuration and obtain the driver-side SparkContext.
# getOrCreate() reuses the context that is already running when this cell is
# executed again, instead of failing because a second SparkContext cannot be
# started in the same kernel.
sparkConf = SparkConf().setAppName("NaiveBayes_Test")
sc = SparkContext.getOrCreate(conf=sparkConf)
sc.master

'local[*]'

In [6]:
# Despite its name, `sqlContext` holds a SparkSession; it is used below as the
# entry point for reading the TSV into a DataFrame.
sqlContext = SparkSession.builder.getOrCreate()

In [7]:
# Quieten the JVM-side log4j loggers: only ERROR and above is reported for the
# "org" and "akka" logger trees and for the root logger.
logger = sc._jvm.org.apache.log4j
for _logger_name in ("org", "akka"):
    logger.LogManager.getLogger(_logger_name).setLevel(logger.Level.ERROR)
logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

In [8]:
# Load the tab-separated training file into a DataFrame and cache it.
# `path` is an absolute local file URI; it is reused later when saving the model.
# No schema is supplied or inferred, so every column is read as a string
# (see the DataFrame repr below); numeric columns are cast in a later cell.
path = 'file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/'
data = sqlContext.read.csv(path+'data/train.tsv', header=True, sep='\t')
data.cache()

DataFrame[url: string, urlid: string, boilerplate: string, alchemy_category: string, alchemy_category_score: string, avglinksize: string, commonlinkratio_1: string, commonlinkratio_2: string, commonlinkratio_3: string, commonlinkratio_4: string, compression_ratio: string, embed_ratio: string, framebased: string, frameTagRatio: string, hasDomainLink: string, html_ratio: string, image_ratio: string, is_news: string, lengthyLinkDomain: string, linkwordscore: string, news_front_page: string, non_markup_alphanum_characters: string, numberOfLinks: string, numwords_in_url: string, parametrizedLinkRatio: string, spelling_errors_ratio: string, label: string]

In [9]:
# Show the column names and types of the raw frame (all strings at this point).
data.printSchema()

root
 |-- url: string (nullable = true)
 |-- urlid: string (nullable = true)
 |-- boilerplate: string (nullable = true)
 |-- alchemy_category: string (nullable = true)
 |-- alchemy_category_score: string (nullable = true)
 |-- avglinksize: string (nullable = true)
 |-- commonlinkratio_1: string (nullable = true)
 |-- commonlinkratio_2: string (nullable = true)
 |-- commonlinkratio_3: string (nullable = true)
 |-- commonlinkratio_4: string (nullable = true)
 |-- compression_ratio: string (nullable = true)
 |-- embed_ratio: string (nullable = true)
 |-- framebased: string (nullable = true)
 |-- frameTagRatio: string (nullable = true)
 |-- hasDomainLink: string (nullable = true)
 |-- html_ratio: string (nullable = true)
 |-- image_ratio: string (nullable = true)
 |-- is_news: string (nullable = true)
 |-- lengthyLinkDomain: string (nullable = true)
 |-- linkwordscore: string (nullable = true)
 |-- news_front_page: string (nullable = true)
 |-- non_markup_alphanum_characters: string (nulla

In [10]:
# Number of rows read from train.tsv.
data.count()

7395

In [11]:
# Peek at the first record to see what the raw string fields look like.
data.first()

Row(url='http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html', urlid='4042', boilerplate='"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five year

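Note: `train.tsv` is tab-separated (`\t`) with a header row (url, urlid, boilerplate, …, label). The `user id \t movie id \t rating \t time` layout belongs to a different dataset and does not describe this file.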

In [18]:
# Replace the '?' missing-value marker with the string '0', but only in the
# three listed columns; every other column keeps its '?' entries.
data = data.replace(
    '?', '0',
    subset=['alchemy_category_score', 'is_news', 'news_front_page'],
)

In [42]:
# Column names from the fourth column onward (url, urlid and boilerplate are
# skipped). The two index lists below are positions *within `header`*, not
# within the full frame: one for numeric columns, one for categorical columns.
header = np.array(data.columns[3:])
num_ids = [1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 13, 16, 18, 19, 21, 22]
cat_ids = [0, 9, 11, 14, 15, 17, 20]
num_list, cat_list = header[num_ids], header[cat_ids]

In [49]:
# Display the names selected as categorical columns.
cat_list

array(['alchemy_category', 'framebased', 'hasDomainLink', 'is_news',
       'lengthyLinkDomain', 'news_front_page', 'numwords_in_url'], 
      dtype='<U30')

In [51]:
# Build the working frame: the 16 numeric columns cast to double, followed by
# the 7 categorical columns kept as strings.
# NOTE(review): the `label` column is not selected here — confirm whether the
# DataFrame-based pipeline is meant to carry it along.
_double_cols = [
    'alchemy_category_score',
    'avglinksize',
    'commonlinkratio_1',
    'commonlinkratio_2',
    'commonlinkratio_3',
    'commonlinkratio_4',
    'compression_ratio',
    'embed_ratio',
    'frameTagRatio',
    'html_ratio',
    'image_ratio',
    'linkwordscore',
    'non_markup_alphanum_characters',
    'numberOfLinks',
    'parametrizedLinkRatio',
    'spelling_errors_ratio',
]
_string_cols = [
    'framebased',
    'hasDomainLink',
    'is_news',
    'lengthyLinkDomain',
    'news_front_page',
    'numwords_in_url',
    'alchemy_category',
]
train_data = data.select(
    *([data[_name].cast('Double') for _name in _double_cols] + _string_cols)
)
train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string')]

In [53]:
# Indexer that maps the string column `alchemy_category` to a numeric index
# column named `alchemy_category_indexer` (double-typed, per the dtypes output
# of the next cell).
cat_indexer = StringIndexer(inputCol='alchemy_category', 
                            outputCol='alchemy_category_indexer')

In [54]:
# Fit the indexer on the frame, then append the index column to it.
# `train_data_trafor` is the fitted indexer model, not a DataFrame.
# NOTE(review): `train_data` is overwritten with its own transformed version,
# so re-running this cell presumably fails because the output column already
# exists — confirm, and consider a new variable name for the result.
train_data_trafor = cat_indexer.fit(train_data)
train_data = train_data_trafor.transform(train_data)
train_data.dtypes

[('alchemy_category_score', 'double'),
 ('avglinksize', 'double'),
 ('commonlinkratio_1', 'double'),
 ('commonlinkratio_2', 'double'),
 ('commonlinkratio_3', 'double'),
 ('commonlinkratio_4', 'double'),
 ('compression_ratio', 'double'),
 ('embed_ratio', 'double'),
 ('frameTagRatio', 'double'),
 ('html_ratio', 'double'),
 ('image_ratio', 'double'),
 ('linkwordscore', 'double'),
 ('non_markup_alphanum_characters', 'double'),
 ('numberOfLinks', 'double'),
 ('parametrizedLinkRatio', 'double'),
 ('spelling_errors_ratio', 'double'),
 ('framebased', 'string'),
 ('hasDomainLink', 'string'),
 ('is_news', 'string'),
 ('lengthyLinkDomain', 'string'),
 ('news_front_page', 'string'),
 ('numwords_in_url', 'string'),
 ('alchemy_category', 'string'),
 ('alchemy_category_indexer', 'double')]

In [43]:
def extract_num_features(data, _id, _val,):
    """Convert one row of numeric fields to floats, shifted up by one.

    Args:
        data: Sequence or array of numeric fields (numbers or numeric strings).
        _id: Position of the field that may hold the missing-value marker.
        _val: Marker meaning "missing" (the notebook passes '?'); a field equal
            to it is treated as 0 before conversion.

    Returns:
        A new float64 NumPy array holding the parsed values plus 1. The input
        row is left unmodified.
    """
    # Work on a copy so the caller's row is never mutated; this also lets
    # plain lists and tuples be passed in.
    values = np.array(data)
    if values[_id] == _val:
        values[_id] = 0
    # np.float was only an alias of the builtin float and has been removed from
    # NumPy; np.float64 yields the same dtype.
    # presumably the +1 shift keeps every feature non-negative for the
    # Naive Bayes model trained later — TODO confirm
    return values.astype(np.float64) + 1

In [44]:
# Pick the numeric fields out of every row and convert them to shifted floats.
# `num_id` repeats the same positions as `num_ids` defined in an earlier cell.
# NOTE(review): `data_train` is not defined in any cell shown in this notebook;
# presumably it is an RDD of NumPy arrays holding the columns after the first
# three (indexing a row with a list requires array-style indexing) — confirm.
num_id = [1,2,3,4,5,6,7,8, 10, 12,13, 16, 18,19, 21,22]
num_features = data_train.map(lambda x:x[num_id])
# Position 0 (alchemy_category_score) is the field checked for the '?' marker.
num_features = num_features.map(lambda x:extract_num_features(x,0,'?'))
num_features.first()

array([  1.78913100e+00,   3.05555556e+00,   1.67647059e+00,
         1.20588235e+00,   1.04705882e+00,   1.02352941e+00,
         1.44378318e+00,   1.00000000e+00,   1.09077381e+00,
         1.24583118e+00,   1.00388349e+00,   2.50000000e+01,
         5.42500000e+03,   1.71000000e+02,   1.15294118e+00,
         1.07912958e+00])

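Standardize the numeric features with `withMean=False`: each feature is divided by its standard deviation only, without centering on the mean.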

In [45]:
# Fit a scaler on the numeric feature RDD and apply it, converting each scaled
# vector back to a NumPy array.
# NOTE(review): the only import of StandardScaler visible in this notebook sits
# inside the triple-quoted string in the first cell, so a fresh kernel would
# not have this name — confirm and restore the import.
# This cell overwrites `num_features` with its scaled version, so running it a
# second time scales already-scaled data.
std_scaler = StandardScaler(withMean=False, withStd=True
                           ).fit(num_features)
num_features = std_scaler.transform(num_features
                                   ).map(lambda x:np.array(x))
num_features.first()

array([  5.40075942,   0.35448133,   8.2530532 ,   8.21766191,
        10.90933855,  14.09262299,   0.25310378,   3.26216293,
        26.31818022,  23.73619231,   0.52304126,   1.22590479,
         0.61123782,   0.95282567,   6.2904057 ,  13.62006239])

In [16]:
# Build the one-hot vocabulary: for every categorical position, collect its
# distinct values and register each as the key '<position>_<value>'.
# The slot index of a key is simply its order of appearance in
# `catgories_list`, i.e. whatever order collect() returned the values in.
cat_id = [0,9,11,14,15,17,20]
catgories_list = []
for i in cat_id:
    _distinct_values = data_train.map(lambda row, pos=i: row[pos]).distinct().collect()
    catgories_list += ['{0}_{1}'.format(i, value) for value in _distinct_values]
catgories_dict = dict(zip(catgories_list, range(len(catgories_list))))
catgories_dict

{'0_?': 2,
 '0_arts_entertainment': 0,
 '0_business': 6,
 '0_computer_internet': 1,
 '0_culture_politics': 10,
 '0_gaming': 7,
 '0_health': 13,
 '0_law_crime': 8,
 '0_recreation': 12,
 '0_religion': 4,
 '0_science_technology': 11,
 '0_sports': 9,
 '0_unknown': 5,
 '0_weather': 3,
 '11_0': 15,
 '11_1': 16,
 '14_1': 17,
 '14_?': 18,
 '15_0': 20,
 '15_1': 19,
 '17_0': 23,
 '17_1': 21,
 '17_?': 22,
 '20_0': 29,
 '20_1': 33,
 '20_10': 24,
 '20_11': 43,
 '20_12': 31,
 '20_13': 41,
 '20_14': 26,
 '20_15': 45,
 '20_16': 35,
 '20_17': 27,
 '20_18': 38,
 '20_19': 37,
 '20_2': 44,
 '20_20': 32,
 '20_21': 34,
 '20_22': 28,
 '20_3': 40,
 '20_4': 25,
 '20_5': 39,
 '20_6': 46,
 '20_7': 42,
 '20_8': 30,
 '20_9': 36,
 '9_0': 14}

In [17]:
def extract_cat_features(data, _id_l, dict_,):
    """One-hot encode the categorical fields of a single row.

    Args:
        data: The row's categorical values, in the same order as `_id_l`.
        _id_l: Column ids used as the prefix of each lookup key.
        dict_: Mapping from '<column id>_<value>' to a slot index.

    Returns:
        A float array of length len(dict_) with 1 in the slot of every
        (column id, value) pair of the row and 0 elsewhere.

    Raises:
        KeyError: if a (column id, value) pair has no entry in `dict_`.
    """
    one_hot = np.zeros(len(dict_))
    for position, column_id in enumerate(_id_l):
        # Keys use the same '<column id>_<value>' format as the vocabulary.
        slot = dict_['{0}_{1}'.format(column_id, data[position])]
        one_hot[slot] = 1
    return one_hot

In [18]:
# One-hot encode the categorical fields of every row in a single pass:
# slice out the categorical positions, then look each value up in the vocabulary.
cat_features = data_train.map(
    lambda row: extract_cat_features(row[cat_id], cat_id, catgories_dict))
cat_features.first()

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [19]:
# Scratch check of how np.array treats a tuple of lists; the result is not
# assigned and is not used by any other cell.
np.array(([6,4,7], [8,9,0]))

array([[6, 4, 7],
       [8, 9, 0]])

In [46]:
# Preview only: pair each numeric vector with its one-hot vector and join them
# into one feature array. Nothing is assigned; the next cell repeats this step
# and keeps the result.
num_features.zip(cat_features).map(lambda x:np.concatenate((x[0],x[1]))).first()

array([  5.40075942,   0.35448133,   8.2530532 ,   8.21766191,
        10.90933855,  14.09262299,   0.25310378,   3.26216293,
        26.31818022,  23.73619231,   0.52304126,   1.22590479,
         0.61123782,   0.95282567,   6.2904057 ,  13.62006239,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   1.        ,
         0.        ,   1.        ,   0.        ,   1.        ,
         0.        ,   0.        ,   0.        ,   1.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   1.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ])

In [47]:
# Assemble the training set: join numeric and one-hot parts into one feature
# vector per row, then pair every vector with its label as a LabeledPoint.
# NOTE(review): `labels` is not defined in any cell shown in this notebook, and
# LabeledPoint is only imported inside the triple-quoted string in the first
# cell — confirm both before relying on a fresh-kernel run.
_feature_vectors = num_features.zip(cat_features).map(
    lambda pair: np.concatenate((pair[0], pair[1])))
train_data = labels.zip(_feature_vectors).map(
    lambda pair: LabeledPoint(pair[0], pair[1]))
train_data.first()

LabeledPoint(0.0, [5.400759415,0.354481330538,8.25305319902,8.21766190613,10.9093385542,14.0926229922,0.253103777329,3.26216292622,26.318180219,23.7361923148,0.523041255079,1.22590479417,0.611237823404,0.952825667702,6.29040570308,13.6200623886,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [48]:
# Cache the labeled points; they are reused for both training and scoring below.
train_data.cache()

PythonRDD[852] at RDD at PythonRDD.scala:49

In [49]:
# Train a Naive Bayes classifier on the full labeled set with default settings.
# NOTE(review): NaiveBayes is only imported inside the triple-quoted string in
# the first cell — confirm the import is restored for a fresh-kernel run.
model = NaiveBayes.train(train_data,)

In [50]:
# Spot-check: predict the class of one hand-written feature vector (numeric
# values first, one-hot slots after).
# NOTE(review): the numeric entries here include negative values, unlike the
# scaled training vector displayed earlier — confirm this vector was produced
# with the same preprocessing as the training data.
model.predict([1.1376473365,-0.9819355716929,0.02513981289,-0.0558635644254,-0.468893253129,-0.354305326308,-0.317535217236,0.33845079824,0.828822173315,0.229639823578,-0.141625969099,-0.297996816496,-0.0329672096969,-0.0487811297558,-0.108698488525,-0.278820782314,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

1.0

In [51]:
# Predict a class for every training feature vector and coerce each prediction
# to a Python float.
pre = model.predict(train_data.map(lambda x:x.features)).map(float)

In [52]:
# Pair each prediction with the true label of the same row.
# The "score" side holds predicted class values (see the sample output below),
# and the rows are the same ones the model was trained on, so the metrics
# computed from this RDD describe fit to the training data.
scoreAndLabels = pre.zip(train_data.map(lambda p: p.label))

In [53]:
# Inspect one (prediction, label) pair.
scoreAndLabels.first()

(1.0, 0.0)

In [54]:
# Build the binary-classification evaluator from the (prediction, label) pairs.
# NOTE(review): BinaryClassificationMetrics is only imported inside the
# triple-quoted string in the first cell — confirm the import is restored.
metrics = BinaryClassificationMetrics(scoreAndLabels)

In [55]:
# Area under the ROC curve for the pairs above.
metrics.areaUnderROC

0.6524470706796848

In [56]:
# Area under the precision-recall curve for the pairs above.
metrics.areaUnderPR

0.6338904813957008

In [57]:
# Persist the trained model under the 'NaiveBayes' directory next to the data,
# using the same absolute `path` prefix defined when the data was loaded.
model.save(sc, path+'NaiveBayes')

In [60]:
# Inspect the model's `theta` parameter array (one row of values per class,
# judging by the nested array shown below).
model.theta

array([[ -3.41119855,  -5.70791131,  -2.89869515,  -2.7620917 ,
         -2.44265257,  -2.2000044 ,  -5.32322032,  -3.80580312,
         -1.61896142,  -1.70524643,  -5.24650372,  -4.33257734,
         -5.47307058,  -4.94865353,  -3.0102814 ,  -2.22862235,
         -6.66967418,  -7.64153477,  -5.99098919, -11.44374291,
         -9.2919807 , -11.44374291,  -7.51191727,  -9.16136052,
        -10.10874184,  -7.33944801,  -7.8220722 ,  -7.99058578,
         -7.08960147,  -7.67328346,  -4.86449169,  -4.8866805 ,
         -8.67115418,  -5.35696818,  -5.80823927,  -5.30385835,
         -5.89778452,  -7.99693501,  -6.607461  ,  -5.11046328,
         -8.25739027,  -7.0153099 ,  -9.9176866 , -11.26142135,
        -13.05318082,  -7.16153661,  -7.50410473,  -9.01012955,
        -12.36003364,  -7.44005271, -12.36003364, -11.10727067,
         -7.75987599, -11.95456853, -11.44374291,  -7.15602695,
         -7.16985843,  -9.16136052,  -7.19239459,  -8.55337115,
         -7.3427538 , -10.41412349,  -7.

In [59]:
# Exploration aid: list the attributes and methods available on the model.
dir(model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'labels',
 'load',
 'pi',
 'predict',
 'save',
 'theta']