**File ingestion**

In [None]:
# Load users.csv as an RDD of raw text lines (one element per line, not yet parsed).
rdd = sc.textFile("users.csv")

In [None]:
# Action: count the lines read from users.csv.
rdd.count()

In [None]:
# Peek at the first five raw lines.
rdd.take(5)

In [None]:
# Small RDD of payment amounts built from a local Python list.
# NOTE: the name `payments` is rebound to the lines of payments.csv in the KeyRDDs section.
payments = sc.parallelize([50,50,100,100,100,100])

In [None]:
# Bring every element back to the driver as a Python list.
payments.collect()

**Partitioning, caching and immutability**

In [None]:
# repartition() returns a NEW RDD with 4 partitions; `rdd` itself is not modified.
users = rdd.repartition(4)

In [None]:
# Partition count of the original RDD, to compare against `users`.
rdd.getNumPartitions()

In [None]:
# Partition count of the repartitioned RDD (4 was requested in repartition()).
users.getNumPartitions()

In [None]:
# StorageLevel is not defined anywhere in the visible notebook; import it
# explicitly instead of relying on the launcher to have pre-imported it.
from pyspark import StorageLevel

# Mark the repartitioned RDD for in-memory caching so that later cells can
# reuse it once it has been computed.
users.persist(StorageLevel.MEMORY_ONLY)

**Filtering**

In [None]:
# Keep only the lines that contain the substring 'Budapest'.
# ("budapestiek" is Hungarian for "people from Budapest".)
budapestiek = users.filter(lambda x: 'Budapest' in x)

In [None]:
# Action: number of lines that matched the 'Budapest' filter.
budapestiek.count()

In [None]:
# Chain two substring filters on the raw lines ('Budapest', then ',22') and show five matches.
# NOTE(review): these are plain substring tests, so ',22' also matches e.g. ',220';
# filtering on parsed columns would be stricter — confirm this is acceptable for the demo.
users.filter(lambda x: 'Budapest' in x).filter(lambda x: ',22' in x).take(5)

**Mapping**

In [None]:
# Split each line on commas and preview two parsed rows (lists of strings).
users.map(lambda x: x.split(",")).take(2)

In [None]:
# Parsed users: each element is the list of string fields of one line.
records = users.map(lambda x: x.split(","))

In [None]:
# Convert column 2 (named 'age' when the DataFrame is built later) to int; preview two values.
records.map(lambda x: int(x[2])).take(2)

In [None]:
# Count the rows whose column-2 value (age) is strictly greater than 30.
records.map(lambda x: int(x[2])).filter(lambda x: x > 30).count()

In [None]:
# Distinct values of column 1 (named 'city' when the DataFrame is built later).
records.map(lambda x: x[1]).distinct().collect()

**Sum and Average**

In [None]:
# Built-in numeric action: total of all payment amounts.
payments.sum()

In [None]:
# The same total written as an explicit reduce with an addition function.
payments.reduce(lambda x, y: x+y)

In [None]:
# Compute (count, sum) of column 2 (age) in a single pass with aggregate():
#   zero value -> (0, 0)
#   seqOp      -> fold one age into a partition-local (count, sum)
#   combOp     -> merge the (count, sum) pairs of two partitions
age_count, age_sum = users.map(lambda line: int(line.split(",")[2])).aggregate(
    (0, 0),
    lambda acc, age: (acc[0] + 1, acc[1] + age),
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
# Finish the average that the section title promises. float() forces true
# division on Python 2 as well; the guard avoids ZeroDivisionError when the
# RDD is empty.
average_age = age_sum / float(age_count) if age_count else None
age_count, age_sum, average_age

In [None]:
# Cross-check: should equal the count component produced by the aggregate cell.
users.count()

**Key-value (pair) RDDs**

In [None]:
# Load payments.csv as raw text lines.
# NOTE: this rebinds `payments`, which earlier held the parallelized list of amounts;
# re-running the sum/reduce cells after this one operates on text lines instead.
payments = sc.textFile("payments.csv")

In [None]:
# Pair RDD of users: (int(column 0), full parsed row), keyed for the join with payments.
u = records.map(lambda x: (int(x[0]), x))

In [None]:
# Parsed payments: each element is the list of string fields of one line.
pr = payments.map(lambda x: x.split(","))
# Pair RDD of payments keyed by int(column 1), the key joined against `u`.
# Built from `pr` so each line is split once instead of twice more.
p = pr.map(lambda row: (int(row[1]), row))

In [None]:
# Preview five (key, payment row) pairs.
p.take(5)

In [None]:
# Group the int-converted column-2 values by column 0 and materialise each
# group as a list so it prints readably; show two groups.
# NOTE(review): this keys on column 0, whereas the join cells key on column 1 —
# confirm which column is intended.
(
    pr.map(lambda row: (row[0], int(row[2])))
      .groupByKey()
      .mapValues(list)
      .take(2)
)

In [None]:
# Sum the int-converted column-2 values per column-0 key.
# NOTE(review): keyed on column 0, whereas the join cells key on column 1 — confirm intent.
pr.map(lambda x: (x[0],int(x[2]))).reduceByKey(lambda x, y: x + y).collect()

In [None]:
# Inner join on the integer key; each result is (key, (user row, payment row)).
u.join(p).take(5)

**Broadcast variables**

In [None]:
# City -> county lookup table ("megyek" is Hungarian for "counties").
d = {
    'Budapest': 'Budapest',
    'Debrecen': 'Hajdu-Bihar',
    'Gyor': 'Gyor-Moson-Sopron',
    'Sopron': 'Gyor-Moson-Sopron'
}
# Wrap the dict in a broadcast variable; it is read through megyek.value inside tasks.
megyek = sc.broadcast(d)

In [None]:
# Total MasterCard amount per county.
# NOTE(review): a city missing from the broadcast dict raises KeyError inside
# the task — confirm users.csv only contains the mapped cities.
(
    pr.filter(lambda pay: pay[3] == 'MasterCard')
      # -> (user key, amount)
      .map(lambda pay: (int(pay[1]), int(pay[2])))
      # -> (user key, (amount, user row))
      .join(u)
      # -> (county of the user's city, amount)
      .map(lambda kv: (megyek.value[kv[1][1][1]], int(kv[1][0])))
      .reduceByKey(lambda a, b: a + b)
      .collect()
)

In [None]:
# Typed user rows: columns 0 and 2 converted to int, column 1 kept as a string.
ucleansed = records.map(lambda x: [int(x[0]), x[1], int(x[2])])

**DataFrames**

In [None]:
# Build a DataFrame from the typed rows, naming the three columns.
df = ucleansed.toDF(['id','city','age'])

In [None]:
# Schema object of the DataFrame.
df.schema

In [None]:
# Same schema, printed in a human-readable tree form.
df.printSchema()

In [None]:
# First row as a single Row object.
df.first()

In [None]:
# First row again, this time wrapped in a list (take() always returns a list).
df.take(1)

In [None]:
# DataFrame restricted to at most 10 rows, used as a small sample in the next cells.
l = df.limit(10)

In [None]:
# Print the sample as a text table.
l.show()

In [None]:
# Convert the sample to a pandas DataFrame on the driver.
l.toPandas()

**Grouping**

In [None]:
# Named `sample_pdf` rather than `p`: `p` is the keyed payments RDD used by
# `u.join(p)`, and reusing that name would break the join cell on re-run.
sample_pdf = l.toPandas()
# pandas-side grouping: mean of the remaining columns per city, on the sample.
sample_pdf.groupby("city").mean()

In [None]:
# Action: number of rows in the DataFrame.
df.count()

In [None]:
# Count the rows with age >= 30 (inclusive, unlike the strict `> 30` used in the RDD cell).
df.filter(df["age"] >= 30).count()

In [None]:
# Project two columns. No action is called, so the cell shows the DataFrame object, not rows.
df.select(df['city'], df['age'])

In [None]:
# Distinct cities, collected into pandas for display.
df.select(df['city']).distinct().toPandas()

In [None]:
# Grouped view of the DataFrame, keyed by city.
g = df.groupBy(df["city"])
# avg() without arguments averages every numeric column per group.
# NOTE(review): that includes `id`, whose average is probably not meaningful —
# consider g.avg("age") if only the age is wanted.
g.avg().toPandas()

**SQL**

In [None]:
# Expose the DataFrame to SQL under the table name "users".
# NOTE(review): registerTempTable is deprecated since Spark 2.0 in favour of
# createOrReplaceTempView — switch once the cluster's Spark version is confirmed.
df.registerTempTable("users")

In [None]:
# Number of users per city, largest group first, against the "users" temp table.
result = sqlContext.sql("SELECT city, count(*) as cnt FROM users GROUP BY city ORDER BY cnt DESC")

In [None]:
# Collect the query result into pandas for display.
result.toPandas()

In [None]:
# DataFrameWriter.json() requires an output path; calling it with no argument
# raises TypeError. Spark writes a directory of part files at this path.
# mode("overwrite") lets the cell be re-run without failing on an existing
# output directory.
result.write.mode("overwrite").json("users_by_city.json")