### Minimal example

In [None]:
import tuplex
import time

In [None]:
# Create the Tuplex context that the following cells use through `c`.
# NOTE(review): the executor count (63) and the 4G memory settings are
# hard-coded; presumably sized for one particular machine -- adjust as needed.
c = tuplex.Context(
    executorMemory='4G',
    executorCount=63,
    driverMemory='4G',
)

In [None]:
# Smallest possible pipeline: hand a Python list to the context, square each
# element with a lambda, and collect the result back into the notebook.
c.parallelize([1, 2, 3, 4]) \
    .map(lambda x: x * x) \
    .collect()

### Exception handling

In [None]:
# Divide the first element of each pair by the second.
# The pair (100, 0) has a zero divisor, so `a / b` cannot succeed for it;
# the cells after this one look at how that failure is reported and resolved.
ds = c.parallelize([(1, 10), (2, 20), (100, 0), (6, 60)]) \
    .map(lambda a, b: a / b)

ds.collect()

In [None]:
# Display the `exception_counts` attribute of the dataset defined earlier.
# NOTE(review): presumably a tally of exceptions per type recorded by the
# preceding collect() -- confirm against the Tuplex documentation.
ds.exception_counts

In [None]:
# Register a handler for ZeroDivisionError whose lambda yields 0, then
# collect again.
# NOTE(review): presumably the handler's return value stands in for the
# failed row's result -- confirm against the Tuplex documentation.
ds.resolve(ZeroDivisionError, lambda x: 0) \
    .collect()

### A more involved query

In [None]:
# Compute a price for one row `x` (indexable by column name).
#   offer == 'sold': parse the integer that follows 'Price/sqft:' and '$' in
#                    'facts and features' and multiply it by x['sqft'].
#   offer == 'rent': parse the 'price' text between its first character and
#                    its last '/', with thousands separators removed.
#   anything else:   parse the 'price' text after its first character, with
#                    thousands separators removed.
# int() raises ValueError when the extracted text is not a plain integer.
def extractPrice(x):
    listed = x['price']
    offer = x['offer']

    if offer == 'sold':
        marker = 'Price/sqft:'
        facts = x['facts and features']
        # skip the marker plus the single character that follows it
        tail = facts[facts.find(marker) + len(marker) + 1:]
        dollar_at = tail.find('$')
        comma_at = tail.find(', ')
        # the number sits between '$' and the character before ', '
        per_sqft = int(tail[dollar_at + 1:comma_at - 1])
        return per_sqft * x['sqft']

    if offer == 'rent':
        # drop the leading currency symbol and everything from the last '/'
        slash_at = listed.rfind('/')
        return int(listed[1:slash_at].replace(',', ''))

    # every other offer: the price column holds the amount directly
    return int(listed[1:].replace(',', ''))

# Classify a row by keywords in its lower-cased 'title':
# 'house' wins over everything, then 'condo'/'apartment' map to 'condo',
# and any other title yields 'unknown'.
def extractType(x):
    title = x['title'].lower()
    if 'house' in title:
        return 'house'
    if 'condo' in title or 'apartment' in title:
        return 'condo'
    return 'unknown'

# Parse the integer that precedes ' bd' in 'facts and features'.
# Only the text before the first ' bd' is considered (the whole string when
# ' bd' is absent); within it, the number starts two characters after the
# last comma, or at the beginning when there is no comma.
# int() raises ValueError when that text is not a plain integer.
def extractBd(x):
    facts = x['facts and features']

    bd_at = facts.find(' bd')
    if bd_at >= 0:
        head = facts[:bd_at]
    else:
        head = facts

    comma_at = head.rfind(',')
    if comma_at >= 0:
        # skip the comma and the character after it
        return int(head[comma_at + 2:])
    return int(head)

# Parse the integer that precedes ' sqft' in 'facts and features'.
# Only the text before the first ' sqft' is considered (the whole string when
# ' sqft' is absent); within it, the number starts five characters after the
# last 'ba ,', or at the beginning when 'ba ,' is absent. Commas are removed
# before conversion; int() raises ValueError on anything non-numeric.
def extractSqft(x):
    facts = x['facts and features']

    sqft_at = facts.find(' sqft')
    if sqft_at >= 0:
        head = facts[:sqft_at]
    else:
        head = facts

    ba_at = head.rfind('ba ,')
    if ba_at >= 0:
        # step over 'ba ,' and the character that follows it
        head = head[ba_at + 5:]

    return int(head.replace(',', ''))

# Map a row's lower-cased 'title' to an offer label. The first keyword found,
# checked in the order 'sale', 'rent', 'sold', 'foreclose', decides the label;
# when none is present the lower-cased title itself is returned.
def extractOffer(x):
    title = x['title'].lower()

    label = title
    if 'sale' in title:
        label = 'sale'
    elif 'rent' in title:
        label = 'rent'
    elif 'sold' in title:
        label = 'sold'
    elif 'foreclose' in title:
        label = 'foreclosed'
    return label


# Start timing before the input is declared; the elapsed time printed at the
# end therefore covers everything from c.csv(...) until tocsv(...) returns.
tstart = time.time()
ds = c.csv('/hot/data/zillow/large10GB.csv')

# Destination handed to tocsv() below.
output_path = '/hot/scratch/out.csv'

# Pipeline steps, in order:
#   1. add "bedrooms" via extractBd and keep rows with fewer than 10
#   2. add "type" via extractType and keep only 'condo' rows
#   3. add "zipcode": postal_code converted to int, zero-padded to 5 digits
#   4. add "sqft" via extractSqft, followed by ignore(ValueError)
#   5. add "offer" via extractOffer, then "price" via extractPrice (which
#      reads the 'offer' and 'sqft' columns), followed by ignore(ValueError)
#   6. keep five columns and write them to output_path as CSV
# NOTE(review): ignore(ValueError) presumably discards rows whose preceding
# UDF raised ValueError -- confirm against the Tuplex documentation.
# NOTE(review): extractPrice reads x['price'], so the input presumably already
# has a "price" column that step 5 replaces -- confirm.
ds.withColumn("bedrooms", extractBd) \
    .filter(lambda x: x['bedrooms'] < 10) \
    .withColumn("type", extractType) \
    .filter(lambda x: x['type'] == 'condo') \
    .withColumn("zipcode", lambda x: '%05d' % int(x['postal_code'])) \
    .withColumn("sqft", extractSqft) \
    .ignore(ValueError) \
    .withColumn("offer", extractOffer) \
    .withColumn("price", extractPrice) \
    .ignore(ValueError) \
    .selectColumns(["url", "zipcode", 
                    "bedrooms", "type", "price"]) \
    .tocsv(output_path)


# Report wall-clock seconds since tstart, formatted with two decimals.
print('processing 10GB of input data took: {:.2f}s'.format(time.time() - tstart))

In [None]:
# Shell escape: show the first lines of /hot/scratch/out.part0.csv.
# NOTE(review): the pipeline was given '/hot/scratch/out.csv' as its output
# path; presumably Tuplex writes numbered part files next to it -- confirm.
!head /hot/scratch/out.part0.csv