## Pandas Series and DataFrame

In [None]:
%pyspark 

import pandas as pd

# Small hand-written roster: one row per person, with the column labels
# supplied at construction time.
df = pd.DataFrame(
    [
        ['frank', 'M', 29],
        ['mary', 'F', 23],
        ['tom', 'M', 35],
        ['ted', 'M', 33],
        ['jean', 'F', 21],
        ['lisa', 'F', 20],
    ],
    columns=['name', 'gender', 'age'],
)
df


# A Series is a one-dimensional labelled array; with no index given,
# pandas labels the values 0..4.
s = pd.Series([11, 22, 33, 44, 55])
s

# Basic aggregates, followed by the combined summary.
s.max()
s.min()
s.mean()
s.describe()

# With the default integer index, label 2 is also position 2.
s.loc[2]
# Positional slice: the elements at positions 2 and 3.
s.iloc[2:4]

# Swap the integer labels for string labels.
s.index = list('abcde')
s

# Look a value up by its new label.
s.loc['c']

# Two equal-length columns, each held as its own Series.
name = pd.Series(['mary', 'toby', 'sherry'])
age = pd.Series([22, 34, 42])

# A list of Series becomes one *row* per Series, so transpose to get one
# column per Series (labelled 0 and 1, because the Series are unnamed).
pd.DataFrame([name, age]).transpose()


# Sample roster used by the DataFrame examples below: one row per person.
df = pd.DataFrame([['frank', 'M', 29], ['mary', 'F', 23], ['tom', 'M', 35], ['ted', 'M', 33], ['jean', 'F', 21], ['lisa', 'F', 20]])

df.columns = ['name', 'gender', 'age']
df

# Summary statistics of the numeric column(s).
df.describe()

# Label-based row access. `.ix` was removed in pandas 1.0; on the default
# integer index `.loc` selects the same rows. Note that a `.loc` slice
# includes its end label, so 1:4 returns rows 1, 2, 3 and 4.
df.loc[1]

df.loc[1:4]

# Column subset: a list of labels returns a DataFrame.
df[['name', 'age']]


# Boolean mask, then the rows for which the mask is True.
df['gender'] == 'M'
df[df['gender'] == 'M']

# numeric_only=True keeps the mean restricted to numeric columns. Without
# it, pandas 2.x raises TypeError on the string columns (older versions
# silently dropped them, giving the same result as shown here).
df[df['gender'] == 'M'].mean(numeric_only=True)
df[df['gender'] == 'F'].mean(numeric_only=True)

# The same per-gender average computed in a single pass.
df.groupby('gender')['age'].mean()

## SparkSQL

In [None]:
%pyspark
from pyspark.sql import SQLContext 
# Build the SQL entry point on top of `sc`. `sc` is not defined in this
# notebook -- presumably it is supplied by the %pyspark interpreter.
sqlContext = SQLContext(sc)

# Read the ratings file as an RDD of text lines and peek at the first few.
data_file = 'file:///tmp/ratings.txt'
raw_data = sc.textFile(data_file)
raw_data.take(3)

# The first line is treated as the header row.
header = raw_data.first()
header

# Keep every line that is not equal to the header line.
skip_data = raw_data.filter(lambda text: text != header)
skip_data.take(3)

# Split each remaining line into its '::'-separated fields.
csv_data = skip_data.map(lambda text: text.split('::'))
csv_data.take(3)

from pyspark.sql import Row
# Convert each list of split fields into a Row: the first two fields are
# kept as strings, the third is parsed as an integer rating.
row_data = csv_data.map(
    lambda fields: Row(userid=fields[0], itemid=fields[1], rating=int(fields[2]))
)
row_data.take(3)

### Spark DataFrame 操作

In [None]:
%pyspark
# Turn the RDD of Row objects into a Spark DataFrame.
df = sqlContext.createDataFrame(row_data)
# Handy while exploring: df.show(5) or df.take(5)

# DataFrame form of:
#   select itemid, rating from df where rating >= 4 limit 5
(df.where('rating >= 4')
   .select('itemid', 'rating')
   .show(5))

# Average rating per user.
(df.select('userid', 'rating')
   .groupBy('userid')
   .avg('rating')
   .show())

### SparkSQL

In [None]:
%pyspark
# Expose the DataFrame to SQL under the table name "ratings".
df.registerTempTable("ratings")
# Handy while exploring: df.printSchema()

# Five items with the highest average rating.
top_items_sql = """
SELECT itemid,avg(rating) as avg_rating from ratings group by itemid order by avg(rating) desc limit 5
"""
ratings_data = sqlContext.sql(top_items_sql)
ratings_data.show()



### 將 Spark DataFrame 轉換為 rdd

In [None]:
# Drop down to the DataFrame's underlying RDD and render every row as a
# human-readable string.
rating_out = ratings_data.rdd.map(
    lambda row: 'itemid: {}, rating: {}'.format(row.itemid, row.avg_rating)
)
rating_out.take(3)


### 將 Spark DataFrame 轉換為 Pandas DataFrame 

In [None]:
# Convert the Spark DataFrame returned by the SQL query into a pandas
# DataFrame via toPandas(), then show it as the cell's output.
pandas_df = ratings_data.toPandas()
pandas_df