In [1]:
# -*- coding: UTF-8 -*-
import sys
from time import time
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from pyspark import SparkConf, SparkContext
from pyspark.mllib.classification import LogisticRegressionWithSGD
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler
from pyspark.mllib.evaluation import BinaryClassificationMetrics



In [2]:
sparkConf = SparkConf().setAppName("LogisticRegressionWithSGD_Test")
sc = SparkContext(conf = sparkConf)
sc.master

'local[*]'

In [3]:
logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org"). setLevel( logger.Level.ERROR )
logger.LogManager.getLogger("akka").setLevel( logger.Level.ERROR )
logger.LogManager.getRootLogger().setLevel(logger.Level.ERROR)

In [12]:
path = 'file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/'
data = sc.textFile(path+'data/train.tsv')
data.cache()

file:///mnt/f/AI/Book/other/spark_tmp/PythonProject/data/train.tsv MapPartitionsRDD[9] at textFile at NativeMethodAccessorImpl.java:0

user id \t movie id \t rating \t time

In [13]:
header = data.first()
header = header.replace('"','').split('\t')[3:]
header

['alchemy_category',
 'alchemy_category_score',
 'avglinksize',
 'commonlinkratio_1',
 'commonlinkratio_2',
 'commonlinkratio_3',
 'commonlinkratio_4',
 'compression_ratio',
 'embed_ratio',
 'framebased',
 'frameTagRatio',
 'hasDomainLink',
 'html_ratio',
 'image_ratio',
 'is_news',
 'lengthyLinkDomain',
 'linkwordscore',
 'news_front_page',
 'non_markup_alphanum_characters',
 'numberOfLinks',
 'numwords_in_url',
 'parametrizedLinkRatio',
 'spelling_errors_ratio',
 'label']

In [18]:
line_first = data.first()
rowdata = data.filter(lambda x:x!=line_first)
rowdata.first()

'"http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html"\t"4042"\t"{""title"":""IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries"",""body"":""A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest 

In [68]:
data_train = rowdata.map(lambda x:x.replace('"','')
                        )#.map(lambda x:x.replace('?','0'))
data_train.first()

'http://www.bloomberg.com/news/2010-12-23/ibm-predicts-holographic-calls-air-breathing-batteries-by-2015.html\t4042\t{title:IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries,body:A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest provider of compute

In [69]:
data_train = data_train.map(lambda x:x.split('\t')[3:]
                           ).map(lambda x:np.array(x))
data_train.first()

array(['business', '0.789131', '2.055555556', '0.676470588', '0.205882353',
       '0.047058824', '0.023529412', '0.443783175', '0', '0', '0.09077381',
       '0', '0.245831182', '0.003883495', '1', '1', '24', '0', '5424',
       '170', '8', '0.152941176', '0.079129575', '0'], 
      dtype='<U11')

In [70]:
data_train.count()

7395

In [71]:
labels = data_train.map(lambda x:int(x[-1]))
labels.first()

0

In [74]:
t_size = len(data_train.first())
for i in range(t_size):
    line_dis = set(data_train.map(lambda x:x[i]).distinct().collect())
    print('特征: {2}--{0} \t 有[{1}]个取值'.format(header[i],len(line_dis),i))
    if '?' in line_dis:
        print('特征: ---{0} \t 有 ? ---------'.format(header[i]))

特征: 0--alchemy_category 	 有[14]个取值
特征: ---alchemy_category 	 有 ? ---------
特征: 1--alchemy_category_score 	 有[4806]个取值
特征: ---alchemy_category_score 	 有 ? ---------
特征: 2--avglinksize 	 有[5710]个取值
特征: 3--commonlinkratio_1 	 有[4476]个取值
特征: 4--commonlinkratio_2 	 有[4038]个取值
特征: 5--commonlinkratio_3 	 有[3266]个取值
特征: 6--commonlinkratio_4 	 有[2695]个取值
特征: 7--compression_ratio 	 有[6453]个取值
特征: 8--embed_ratio 	 有[366]个取值
特征: 9--framebased 	 有[1]个取值
特征: 10--frameTagRatio 	 有[5911]个取值
特征: 11--hasDomainLink 	 有[2]个取值
特征: 12--html_ratio 	 有[7376]个取值
特征: 13--image_ratio 	 有[5418]个取值
特征: 14--is_news 	 有[2]个取值
特征: ---is_news 	 有 ? ---------
特征: 15--lengthyLinkDomain 	 有[2]个取值
特征: 16--linkwordscore 	 有[101]个取值
特征: 17--news_front_page 	 有[3]个取值
特征: ---news_front_page 	 有 ? ---------
特征: 18--non_markup_alphanum_characters 	 有[5301]个取值
特征: 19--numberOfLinks 	 有[702]个取值
特征: 20--numwords_in_url 	 有[23]个取值
特征: 21--parametrizedLinkRatio 	 有[3922]个取值
特征: 22--spelling_errors_ratio 	 有[4219]个取值
特征: 23--label 	 

In [75]:
_id = 1
line_dis = set(data_train.map(lambda x:x[_id]).distinct().collect())
if '?' in line_dis:
    line_dis.remove('?')
    data_tmp = np.array(list(line_dis)).astype(float)
    print('min =', data_tmp.min())
    print('max =', data_tmp.max())

min = 0.0708333
max = 0.999426


In [76]:
def extract_num_features(data, _id, _val,):
    if data[_id] == _val: data[_id]=0
    #data_tmp = np.array(data).astype(np.float)
    return data.astype(np.float)

In [87]:
num_id = [1,2,3,4,5,6,7,8, 10, 12,13, 16, 18,19, 21,22]
num_features = data_train.map(lambda x:x[num_id])
num_features = num_features.map(lambda x:extract_num_features(x,0,'?'))
num_features.first()

array([  7.89131000e-01,   2.05555556e+00,   6.76470588e-01,
         2.05882353e-01,   4.70588240e-02,   2.35294120e-02,
         4.43783175e-01,   0.00000000e+00,   9.07738100e-02,
         2.45831182e-01,   3.88349500e-03,   2.40000000e+01,
         5.42400000e+03,   1.70000000e+02,   1.52941176e-01,
         7.91295750e-02])

In [103]:
std_scaler = StandardScaler(withMean=True, withStd=True
                           ).fit(num_features)
num_features = std_scaler.transform(num_features
                                   ).map(lambda x:np.array(x))
num_features.first()

array([ 1.13764734, -0.08193557,  1.02513981, -0.05586356, -0.46889325,
       -0.35430533, -0.31753522,  0.3384508 ,  0.82882217,  0.22963982,
       -0.14162597, -0.29799682, -0.03296721, -0.04878113, -0.10869849,
       -0.27882078])

In [61]:
cat_id = [0,9,11,14,15,17,20]
catgories_list = []
for i in cat_id:
    value_list = data_train.map(lambda x:x[i]).distinct().collect()
    value_list = list(map(lambda x:'{0}_{1}'.format(i,x), value_list))
    catgories_list.extend(value_list)
catgories_dict = {k:i for i,k in enumerate(catgories_list)}
catgories_dict

{'0_?': 2,
 '0_arts_entertainment': 0,
 '0_business': 6,
 '0_computer_internet': 1,
 '0_culture_politics': 10,
 '0_gaming': 7,
 '0_health': 13,
 '0_law_crime': 8,
 '0_recreation': 12,
 '0_religion': 4,
 '0_science_technology': 11,
 '0_sports': 9,
 '0_unknown': 5,
 '0_weather': 3,
 '11_0': 15,
 '11_1': 16,
 '14_1': 17,
 '14_?': 18,
 '15_0': 20,
 '15_1': 19,
 '17_0': 23,
 '17_1': 21,
 '17_?': 22,
 '20_0': 29,
 '20_1': 33,
 '20_10': 24,
 '20_11': 43,
 '20_12': 31,
 '20_13': 41,
 '20_14': 26,
 '20_15': 45,
 '20_16': 35,
 '20_17': 27,
 '20_18': 38,
 '20_19': 37,
 '20_2': 44,
 '20_20': 32,
 '20_21': 34,
 '20_22': 28,
 '20_3': 40,
 '20_4': 25,
 '20_5': 39,
 '20_6': 46,
 '20_7': 42,
 '20_8': 30,
 '20_9': 36,
 '9_0': 14}

In [83]:
def extract_cat_features(data, _id_l, dict_,):
    data_tmp = np.zeros((len(dict_)))
    for i,l in enumerate(_id_l):
        v = '{0}_{1}'.format(l, data[i])
        data_tmp[dict_[v]] = 1
    return data_tmp

In [84]:
cat_features = data_train.map(lambda x:x[cat_id])
cat_features = cat_features.map(lambda x:extract_cat_features(x,cat_id,catgories_dict))
cat_features.first()

array([ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  1.,  1.,  0.,  1.,  0.,  1.,  0.,  0.,  0.,  1.,  0.,  0.,
        0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [92]:
np.array(([6,4,7], [8,9,0]))

array([[6, 4, 7],
       [8, 9, 0]])

In [109]:
num_features.zip(cat_features).map(lambda x:np.concatenate((x[0],x[1]))).first()

array([ 1.13764734, -0.08193557,  1.02513981, -0.05586356, -0.46889325,
       -0.35430533, -0.31753522,  0.3384508 ,  0.82882217,  0.22963982,
       -0.14162597, -0.29799682, -0.03296721, -0.04878113, -0.10869849,
       -0.27882078,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        1.        ,  1.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  1.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ])

In [110]:
train_data = num_features.zip(cat_features)
train_data = train_data.map(lambda x:np.concatenate((x[0],x[1])))
train_data = labels.zip(train_data)
train_data = train_data.map(lambda x:LabeledPoint(x[0],x[1]))
train_data.first()

LabeledPoint(0.0, [1.1376473365,-0.0819355716929,1.02513981289,-0.0558635644254,-0.468893253129,-0.354305326308,-0.317535217236,0.33845079824,0.828822173315,0.229639823578,-0.141625969099,-0.297996816496,-0.0329672096969,-0.0487811297558,-0.108698488525,-0.278820782314,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

In [111]:
train_data.cache()

PythonRDD[1316] at RDD at PythonRDD.scala:49

In [112]:
model = LogisticRegressionWithSGD.train(train_data,)

In [118]:
model.predict([1.1376473365,-0.9819355716929,0.02513981289,-0.0558635644254,-0.468893253129,-0.354305326308,-0.317535217236,0.33845079824,0.828822173315,0.229639823578,-0.141625969099,-0.297996816496,-0.0329672096969,-0.0487811297558,-0.108698488525,-0.278820782314,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0])

1

In [124]:
pre = model.predict(train_data.map(lambda x:x.features)).map(float)

In [125]:
scoreAndLabels = pre.zip(train_data.map(lambda p: p.label))

In [126]:
scoreAndLabels.first()

(1.0, 0.0)

In [127]:
metrics = BinaryClassificationMetrics(scoreAndLabels)

In [128]:
metrics.areaUnderROC

0.6630759744467127

In [129]:
metrics.areaUnderPR

0.6342929872966844

In [130]:
model.save(sc, path+'LogisticRegressionWithSGD')

In [131]:
model.numFeatures

63

In [132]:
dir(model)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slotnames__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_coeff',
 '_dataWithBiasSize',
 '_intercept',
 '_numClasses',
 '_numFeatures',
 '_threshold',
 '_weightsMatrix',
 'clearThreshold',
 'intercept',
 'load',
 'numClasses',
 'numFeatures',
 'predict',
 'save',
 'setThreshold',
 'threshold',
 'weights']