In [3]:
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel

from pyspark.mllib.util import MLUtils

%pylab inline

import numpy as np
from time import time

from DistributedBoosting import *

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [5]:
from pyspark import SparkContext

sc=SparkContext()

In [6]:
from numpy.random import rand
p=0.9
data=[]
for i in range(10):
    for j in range(10):
        if np.abs(i-4)<3 and np.abs(j-6)<3:
            y=2*(rand()<p)-1
        else:
            y=2*(rand()>p)-1
        print("%1.0f "%((1+y)/2), end=' ')
        data.append(LabeledPoint(y,[i,j]))
    print()

dataRDD=sc.parallelize(data,numSlices=2)
dataRDD.getNumPartitions()

0  0  0  0  0  0  0  0  0  1  
0  0  0  0  0  0  0  0  0  1  
0  1  0  0  0  1  1  0  1  0  
0  0  0  0  1  1  1  1  1  0  
0  0  0  0  1  1  1  1  1  1  
1  0  0  0  1  1  0  1  1  1  
0  0  0  0  1  1  1  1  1  1  
0  0  0  0  0  0  0  0  0  0  
0  0  0  0  1  0  0  0  0  0  
0  0  0  0  0  1  0  0  0  0  


2

In [8]:
# %load DistributedBoosting.py
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.tree import RandomForest, RandomForestModel

from pyspark.mllib.util import MLUtils

import numpy as np
from time import time

In [9]:
class Timer:
    """A simple service class to log run time and pretty-print it.
    """
    def __init__(self):
        self.T=[]
    def stamp(self,name):
        self.T.append((name,time()))
    def str(self):
        T=self.T
        return '\n'.join(['%6.2f : %s'%(T[i+1][1]-T[i][1],T[i+1][0]) for i in range(len(T)-1)])

In [10]:
###### Globals
global T,iteration,GR,proposals,Strong_Classifier, feature_no, partition_no, Splits_Table
global Strong_Classifier,global_best_splitter,PS

T=Timer()
feature_no=None                 # Tracks processing time
global_feature_no=None
partition_no=0
iteration=0                     # Boosting iteration
PS=[None]                       # RDD that hold state of boosting process for each partition.
proposals=[]                    # proposed splits for each feature
Strong_Classifier=[]            # Combined weak classifiers
#############################################################

In [11]:
##### Partition fundctions
def Prepare_partition_data_structure(A):

    rows=len(A[1])

    columns=np.empty([feature_no,rows])
    columns[:]=np.NaN
    print('Prepare_partition_data_structure',feature_no,np.shape(columns))
    
    labels=np.empty(rows)
    labels[:]=np.NaN

    for j in range(rows):
        LP=A[1][j]
        labels[j]=LP.label
        for i in range(feature_no):
            columns[i,j]=LP.features[i]
    return {'index':A[0],\
            'labels':labels,\
            'weights':np.ones(len(labels)),\
            'feature_values':columns}

def Add_weak_learner_matrix(A):
    """ This procedure adds to each partition the matrix that will be 
        used to efficiently find the best weak classifier """

    try:
        feature_no
    except:
        feature_no=global_feature_no.value

    index=A['index']%feature_no
    SP=Splits_Table.value[index]

    Col=A['feature_values'][index,:]

    ### The matrix M is organized as follows: 
    # * There are as many rows as there are thresholds in SP (last one is inf)
    # * There are as many columns as there are examples in this partition.
    # For threshold i, the i'th rw of M is +1 if Col is smaller than the trehold SP[i] and -1 otherwise

    M=np.empty([len(SP),len(Col)])
    M[:]=np.NaN

    for i in range(len(SP)):
        M[i,:]=2*(Col<SP[i])-1

    A['M']=M # add M matrix to the data structure.
    return A


def Find_weak(A):
    """Find the best split for a single feature on a single partition"""

    try:
        feature_no
    except:
        feature_no=global_feature_no.value

    index=A['index']%feature_no
    SP=Splits_Table.value[index]

    M=A['M']
    weights=A['weights']
    weighted_Labels=weights*A['labels']
    SS=np.dot(M,weighted_Labels)/np.sum(weights)
    i_max=np.argmax(np.abs(SS))
    answer={'Feature_index':A['index']%feature_no,\
            'Threshold_index':i_max,\
            'Threshold':SP[i_max],\
            'Correlation':SS[i_max],\
            'SS':SS}
    return answer

# update weights. New splitter is shipped to partition as one of the referenced
# Variables

def update_weights(A):
    """Update the weights of the exammples belonging to this 
    partition according to the new splitter"""
    best_splitter=global_best_splitter
    F_index=best_splitter['Feature_index']
    Thr=best_splitter['Threshold']
    alpha=best_splitter['alpha']
    y_hat=2*(A['feature_values'][F_index,:]<Thr)-1
    y=A['labels']
    weights=A['weights']*exp(-alpha*y_hat*y)
    weights /= sum(weights)
    A['weights']=weights
    return A

def calc_scores(Strong_Classifier,Columns,Lbl):
    
    Scores=np.zeros(len(Lbl))

    for h in Strong_Classifier:
        index=h['Feature_index']
        Thr=h['Threshold']
        alpha=h['alpha']
        y_hat=2*(Columns[index,:]<Thr)-1
        Scores += alpha*y_hat*Lbl
    return Scores

In [12]:
###### Head-Node functions
def init(sc,Data):
    """ Given an RDD with labeled Points, create the RDD of data structures used for boosting
    """

    global T,iteration,GR,proposals,Strong_Classifier, feature_no, partition_no, Splits_Table
    global Strong_Classifier,global_best_splitter

    T=Timer()
    T.stamp('Started')

    X=Data.first()
    feature_no=len(X.features)
#    print 'global_feature_no = sc.broadcast(feature_no)',feature_no
    partition_no=Data.getNumPartitions()
    if partition_no != feature_no:
        Data=Data.repartition(feature_no).cache()
    print('number of features=',feature_no,'number of partitions=',Data.getNumPartitions())

    # Split data into training and test
    (trainingData,testData)=Data.randomSplit([0.7,0.3])
    print('Sizes: Data1=%d, trainingData=%d, testData=%d'%      (Data.count(),trainingData.cache().count(),testData.cache().count()))
    T.stamp('Split into train and test')
    # Glom each partition into a local array
    G=trainingData.glom()
    GTest=testData.glom()  
    T.stamp('glom')

    # Add an index to each partition to identify it.
    def f(splitIndex, iterator): yield splitIndex,next(iterator)
    GI=G.mapPartitionsWithIndex(f)
    GTI=GTest.mapPartitionsWithIndex(f)
    T.stamp('add partition index')

    return GI

In [13]:
GI=init(sc,dataRDD)

number of features= 2 number of partitions= 2
Sizes: Data1=100, trainingData=69, testData=31


In [14]:
GI.collect()

[(0,
  [LabeledPoint(-1.0, [0.0,0.0]),
   LabeledPoint(-1.0, [0.0,2.0]),
   LabeledPoint(-1.0, [0.0,3.0]),
   LabeledPoint(-1.0, [0.0,4.0]),
   LabeledPoint(-1.0, [0.0,7.0]),
   LabeledPoint(-1.0, [0.0,8.0]),
   LabeledPoint(1.0, [0.0,9.0]),
   LabeledPoint(-1.0, [1.0,0.0]),
   LabeledPoint(-1.0, [1.0,2.0]),
   LabeledPoint(-1.0, [1.0,3.0]),
   LabeledPoint(-1.0, [1.0,4.0]),
   LabeledPoint(-1.0, [1.0,6.0]),
   LabeledPoint(-1.0, [1.0,7.0]),
   LabeledPoint(-1.0, [1.0,8.0]),
   LabeledPoint(1.0, [1.0,9.0]),
   LabeledPoint(-1.0, [2.0,0.0]),
   LabeledPoint(1.0, [2.0,1.0]),
   LabeledPoint(-1.0, [2.0,2.0]),
   LabeledPoint(-1.0, [2.0,3.0]),
   LabeledPoint(1.0, [2.0,5.0]),
   LabeledPoint(1.0, [2.0,6.0]),
   LabeledPoint(1.0, [2.0,8.0]),
   LabeledPoint(-1.0, [3.0,1.0]),
   LabeledPoint(-1.0, [3.0,2.0]),
   LabeledPoint(1.0, [3.0,4.0]),
   LabeledPoint(1.0, [3.0,6.0]),
   LabeledPoint(1.0, [3.0,7.0]),
   LabeledPoint(1.0, [3.0,8.0]),
   LabeledPoint(-1.0, [4.0,0.0]),
   LabeledPoint(-1.

In [1]:
def init2(GI):
    # Prepare the data structure for each partition.
    GR=GI.map(Prepare_partition_data_structure)
    print('number of elements in GR=', GR.cache().count())
    T.stamp('Prepare_partition_data_structure')
    return GR

#compute the split points for each feature
    Splits=find_splits(GR)
    print('Split points=',Splits
    T.stamp('Compute Split points')

    #broadcast split points
    global Splits_Table
    Splits_Table=sc.broadcast(Splits)
    T.stamp('Broadcast split points')

    # Create matrix for each partition to make finding the weak rules correlation a matter of taking a matrix product

    iteration=0
    global PS
    PS[0]=GR.map(Add_weak_learner_matrix)
    print 'number of partitions in PS=',PS[0].cache().count()
    T.stamp('Add_weak_learner_matrix')

    return PS

SyntaxError: invalid syntax (<ipython-input-1-90779a94a670>, line 11)

In [17]:
GR=init2(GI)
GR.collect()

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 5.0 failed 1 times, most recent failure: Lost task 0.0 in stage 5.0 (TID 9, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/srihariveeraraaghavan/CSE_255/edX-Micro-Master-in-Data-Science/big-data-analytics-using-spark/notebooks/Section4-Classification/DistributedBoosting/DistributedBoosting.py", line 42, in Prepare_partition_data_structure
    columns=np.empty([feature_no,rows])
TypeError: 'NoneType' object cannot be interpreted as an integer

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:378)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1109)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1083)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1018)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1083)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:809)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1599)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1587)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1586)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1586)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:831)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:831)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1820)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1769)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1758)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:642)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:153)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/anaconda3/lib/python3.6/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/Users/srihariveeraraaghavan/CSE_255/edX-Micro-Master-in-Data-Science/big-data-analytics-using-spark/notebooks/Section4-Classification/DistributedBoosting/DistributedBoosting.py", line 42, in Prepare_partition_data_structure
    columns=np.empty([feature_no,rows])
TypeError: 'NoneType' object cannot be interpreted as an integer

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsBytes(MemoryStore.scala:378)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1109)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1083)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:1018)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1083)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:809)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:335)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:286)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [None]:
def boosting_iteration(k=1):
    """ perform k boosting iterations """
    for i in range(iteration,iteration+k):
        T.stamp('Start main loop %d'%i)

        prop=PS[i].map(Find_weak).collect()
        proposals.append(prop)
        corrs=[p['Correlation'] for p in prop]
        best_splitter_index=np.argmax(np.abs(corrs))
        best_splitter = prop[best_splitter_index]
        Strong_Classifier.append(best_splitter)
        global global_Strong_Classifier
        global_Strong_Classifier=sc.broadcast(Strong_Classifier)
        T.stamp('found best splitter %d'%i)

        corr=best_splitter['Correlation']
        best_splitter['alpha']=0.5*np.log((1+corr)/(1-corr))
        global global_best_splitter
        global_best_splitter = sc.broadcast(best_splitter)
        PS.append(PS[i].map(update_weights))
        T.stamp('Updated Weights %d'%i)
    iteration+=k

In [None]:
def find_splits(GR,number_of_bins=10,debug=False):
    """Compute the split points for each feature to create number_of_bins bins"""
    def find_split_points(A):

        try:
            feature_no
        except:
            feature_no=global_feature_no.value

        j=A['index'] % feature_no
        S=np.sort(A['feature_values'][j,:       ])
        L=len(S) 
        step=L/number_of_bins+2*number_of_bins
        return (j,S[range(0,L,step)])

    global partition_no
    Splits=GR.map(find_split_points).collect()
    max_no=np.array([np.finfo(float).max])

    # Average the split points across the partitions corresponding to the same feature.
    Splits1=[]
    for i in range(feature_no):
        S=Splits[i][1]
        if debug:
            print 'no. ',i,' = ',Splits[i]
        n=1  # number of copies (for averaging)
        j=i+feature_no
        while j<partition_no:
            if debug:
                print 'j=',j
            S+=Splits[j][1]
            if debug:
                print 'no. ',j,' = ',Splits[j]
            n+=1.0
            j+=feature_no
        Splits1.append(np.concatenate([S/n,max_no]))
        if debug:
            print n
            print Splits1[i]
            print '='*60

    return Splits1