# Basics

In [1]:
from pyspark.sql.functions import col
from pyspark.sql.functions import sum as _sum
from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType

### RDD Sample

In [2]:
# Sample transaction records, one inner list per transaction.
# Field order: id, type, vendor, amount, description -- the same order as the
# schema that the DataFrame cell later applies to this RDD.
# Record 3 deliberately has no description (None).
values = [
    [1, 'grocery', 'Trader Joes', 53.46, 'weekly groceries'],
    [2, 'dining', 'Chiptole', 7.87, 'burrito for lunch'],
    [3, 'upkeep', 'CVS', 6.59, None],
    [4, 'upkeep', 'Chevron', 43.13, 'car gas fillup'],
    [5, 'grocery', 'Trader Joes', 5.42, 'dessert :)'],
]

# Distribute the local list as an RDD.
# NOTE(review): `sc` is not defined in the visible cells -- presumably the
# SparkContext injected by the PySpark kernel/shell; confirm before running
# on a fresh kernel.
rdd = sc.parallelize(values)

print(
    'Count: {count}'.format(count=rdd.count())
)

# Last expression of the cell: the notebook displays the collected records.
rdd.collect()

Count: 5


[[1, 'grocery', 'Trader Joes', 53.46, 'weekly groceries'],
 [2, 'dining', 'Chiptole', 7.87, 'burrito for lunch'],
 [3, 'upkeep', 'CVS', 6.59, None],
 [4, 'upkeep', 'Chevron', 43.13, 'car gas fillup'],
 [5, 'grocery', 'Trader Joes', 5.42, 'dessert :)']]

### DataFrame Sample

In [3]:
# Explicit schema for the transaction records held in `rdd`.
# StructField arguments are (name, dataType, nullable); only `description`
# accepts nulls.
# NOTE(review): FloatType is single precision, which is why the collected rows
# show 53.46 as 53.459999084472656. DoubleType would keep the Python float
# values as entered -- consider it if this sample is reused for real amounts.
schema = StructType([
    StructField('transaction_id', IntegerType(), False),
    StructField('type', StringType(), False),
    StructField('vendor', StringType(), False),
    StructField('amount', FloatType(), False),
    StructField('description', StringType(), True),
])

# Apply the schema to the RDD to obtain a DataFrame with named, typed columns.
# NOTE(review): `spark` is not defined in the visible cells -- presumably the
# SparkSession provided by the PySpark kernel/shell; confirm.
df = spark.createDataFrame(rdd, schema=schema)

print(
    'Count: {count}'.format(count=df.count())
)

# Last expression of the cell: the notebook displays the collected Row objects.
df.collect()

Count: 5


[Row(transaction_id=1, type='grocery', vendor='Trader Joes', amount=53.459999084472656, description='weekly groceries'),
 Row(transaction_id=2, type='dining', vendor='Chiptole', amount=7.869999885559082, description='burrito for lunch'),
 Row(transaction_id=3, type='upkeep', vendor='CVS', amount=6.590000152587891, description=None),
 Row(transaction_id=4, type='upkeep', vendor='Chevron', amount=43.130001068115234, description='car gas fillup'),
 Row(transaction_id=5, type='grocery', vendor='Trader Joes', amount=5.420000076293945, description='dessert :)')]

### Pivoting Example

In [4]:
# Total amount per category:
#   1. copy `type` into a new `category` column,
#   2. group the rows on that column,
#   3. sum `amount` within each group as `total`
#      (`_sum` is pyspark.sql.functions.sum, aliased at import time),
#   4. project the two result columns in a fixed order.
category_agg_df = (
    df
    .withColumn('category', col('type'))
    .groupBy('category')
    .agg(_sum('amount').alias('total'))
    .select('category', 'total')
)

print(
    'Count: {count}'.format(count=category_agg_df.count())
)

# Last expression of the cell: the notebook displays one Row per category.
category_agg_df.collect()

Count: 3


[Row(category='grocery', total=58.8799991607666),
 Row(category='dining', total=7.869999885559082),
 Row(category='upkeep', total=49.720001220703125)]