## NumPy

In [None]:
%pyspark


# Element-wise product of two equal-length lists, computed four ways.
a = [1, 3, 5, 7, 9]
b = [2, 4, 6, 8, 10]

# 1) Index-based loop.
res = []
for i in range(len(a)):
    res.append(a[i] * b[i])
print(res)

# 2) Pairwise iteration with zip.
res = []
for i, j in zip(a, b):
    res.append(i * j)
print(res)

# 3) List comprehension. Printed explicitly: a bare expression in the middle
#    of a cell would otherwise be evaluated and thrown away.
print([i * j for i, j in zip(a, b)])

# 4) NumPy arrays multiply element-wise with the plain `*` operator.
import numpy
na = numpy.array(a)
nb = numpy.array(b)

na * nb

## Series

In [None]:
%pyspark

import pandas

# A Series pairs each value with an index label; fetch one value by label.
labels = list('abcde')
sa = pandas.Series(data=[1, 2, 3, 4, 5], index=labels)
sa['a']

## DataFrame

In [None]:
%pyspark
import pandas as pd

# Six sample people as [name, gender, age] records.
people = [
    ['frank', 'M', 29],
    ['mary', 'F', 23],
    ['tom', 'M', 35],
    ['ted', 'M', 33],
    ['jean', 'F', 21],
    ['lisa', 'F', 20],
]
df = pd.DataFrame(people, columns=['name', 'gender', 'age'])

# Summary statistics of a single column.
age = df['age']
age.mean()
age.max()
age.min()
age.describe()

# One element of the column, then its first three elements.
age[0]
age[0:3]

# Whole-frame summary and positional row access.
df.describe()
df.iloc[0]
df.iloc[0:3]

# Column selection: a single column, then a list of columns.
df['name']
df[['name', 'age']]

# Row filtering with boolean masks, optionally restricted to some columns.
is_male = df['gender'] == 'M'
is_female = df['gender'] == 'F'
df[is_male]
df.loc[is_male, ['name', 'age']]

# Mean age for each gender.
df.loc[is_female, ['age']].mean()
df.loc[is_male, ['age']].mean()

## PySpark DataFrame

### Create the SQLContext and Register a Temp Table

In [None]:
%pyspark
from pyspark.sql import Row, SQLContext

# `sc` is the SparkContext; presumably provided by the %pyspark interpreter -- confirm.
sqlContext = SQLContext(sc)

# NOTE(review): `row_data` was not defined anywhere in this notebook, so this
# cell failed on a fresh interpreter. It is rebuilt here from the same
# /tmp/u.data file and split()-based parsing that the "Table Join By PySpark
# SQL" section uses, with the column names (userid, itemid, rating) that the
# following cells query. Confirm this matches the intended data source.
ratings_lines = sc.textFile('file:/tmp/u.data', 4)
row_data = ratings_lines.map(lambda l: l.split()).map(
    lambda p: Row(userid=p[0], itemid=p[1], rating=int(p[2]))
)

# Wrap the Row RDD in a DataFrame and expose it to SQL as the "ratings" table.
df = sqlContext.createDataFrame(row_data)
df.registerTempTable("ratings")

### PySpark DataFrame Operation

In [None]:
%pyspark
# Average of the numeric columns per userid (after the select, only `rating`
# can be averaged); show the first five groups.
user_ratings = df.select('userid', 'rating')
avg_by_user = user_ratings.groupBy('userid').avg()
avg_by_user.show(5)

### Print Data Schema

In [None]:
%pyspark
# Print the schema of the Spark DataFrame `df`.
df.printSchema()

### PySpark SQL

In [None]:
%pyspark
# (Re-)register df as the "ratings" table so the query below can refer to it.
df.registerTempTable("ratings")

# Average rating per item, highest average first.
avg_rating_sql = """
     SELECT itemid,avg(rating) as avg_rating  from ratings group by itemid order by avg_rating desc 
"""
ratings_data = sqlContext.sql(avg_rating_sql)
ratings_data.show(5)

### Use toPandas to Convert Spark DataFrame Back To Pandas DataFrame

In [None]:
%pyspark
# Convert the Spark query result into a pandas DataFrame.
pandas_df = ratings_data.toPandas()
# The second column is avg(rating) (aliased avg_rating in the query), so it is
# labelled as an average rather than a sum.
pandas_df.columns = ['itemid', 'avg_rating']
pandas_df.head(5)

### Use rdd to Transform Spark DataFrame Back to RDD

In [None]:
%pyspark
# Drop from the DataFrame down to its underlying RDD of rows and render each
# row as a text line; print the first three.
line_template = 'itemid:{} - average rating: {}'
ratings_rows = ratings_data.rdd
ratings_out = ratings_rows.map(lambda row: line_template.format(row.itemid, row.avg_rating))
for line in ratings_out.take(3):
    print(line)

### Count DataFrame Rows

In [None]:
%pyspark
# Number of rows in the per-item average-rating result.
ratings_data.count()

### Join

In [None]:
%pyspark
# Inner join of two (key, value) RDDs on their keys.
left_pairs = [("a", 1), ("b", 4)]
right_pairs = [("a", 2), ("a", 3)]
x = sc.parallelize(left_pairs)
y = sc.parallelize(right_pairs)

z = x.join(y)
res = z.collect()
# Sort on the driver so the displayed order is stable.
sorted(res)

In [None]:
%pyspark
# Left outer join: x is the left side, y the right side.
left_pairs = [("a", 1), ("b", 4)]
right_pairs = [("a", 2)]
x = sc.parallelize(left_pairs)
y = sc.parallelize(right_pairs)

left_joined = x.leftOuterJoin(y)
sorted(left_joined.collect())

In [None]:
%pyspark
# Right outer join called on y, so x is the right-hand side here.
x_pairs = [("a", 1), ("b", 4)]
y_pairs = [("a", 2)]
x = sc.parallelize(x_pairs)
y = sc.parallelize(y_pairs)

right_joined = y.rightOuterJoin(x)
sorted(right_joined.collect())

In [None]:
%pyspark
# Full outer join: each side has one key ("b" / "c") missing from the other.
left_pairs = [("a", 1), ("b", 4)]
right_pairs = [("a", 2), ("c", 8)]
x = sc.parallelize(left_pairs)
y = sc.parallelize(right_pairs)

full_joined = x.fullOuterJoin(y)
sorted(full_joined.collect())

### Table Join By PySpark SQL

In [None]:
%pyspark
from pyspark.sql import SQLContext
from pyspark.sql import Row


sqlContext = SQLContext(sc)

# Ratings: whitespace-separated fields; the first three are used as
# userid, movieid and rating (rating parsed as int).
ratings = sc.textFile('file:/tmp/u.data', 4)
ratings_data = ratings.map(lambda l: l.split())
ratings_row_data = ratings_data.map(lambda p:
    Row(userid=p[0], movieid=p[1], rating=int(p[2]))
)
# Printed so the sample is actually shown instead of being discarded mid-cell.
print(ratings_row_data.take(4))

# Movies: '|'-separated fields; the first two are used as movieid and moviename.
movies = sc.textFile('file:/tmp/u.item', 4)
movies_data = movies.map(lambda l: l.split('|'))
movies_row_data = movies_data.map(lambda p:
    Row(movieid=p[0], moviename=p[1])
)
print(movies_row_data.take(4))

# Build each DataFrame once and register it under the name the SQL uses.
ratings_df = sqlContext.createDataFrame(ratings_row_data)
ratings_df.registerTempTable("ratings")
# `df` is kept as an alias of the ratings DataFrame so the name stays bound to
# the ratings data, without creating and registering a second identical frame.
df = ratings_df

movies_df = sqlContext.createDataFrame(movies_row_data)
movies_df.registerTempTable("movies")

# Ten movie names with the highest average rating, with their rating counts.
best_movies = sqlContext.sql("""
     SELECT moviename,avg(rating) as avg_rating, count(1) as cnt  from movies inner join ratings on ratings.movieid = movies.movieid group by moviename order by  avg(rating) desc limit 10
""")
best_movies.show(5)


### DataFrame Join

In [None]:
%pyspark
# Same join as the SQL version, expressed with the DataFrame API:
# match ratings to movies on movieid, then average the rating per movie name.
same_movie = movies_df.movieid == ratings_df.movieid
rated_movies = ratings_df.join(movies_df, same_movie)
m = rated_movies.groupBy(movies_df.moviename).agg({"rating": "avg"})
m.show(5)

## Decision Tree

### 資料預處理

In [None]:
%pyspark
# Load the churn CSV as an RDD of text lines.
raw_data = sc.textFile('file:/tmp/customer_churn.csv')
# Samples are printed explicitly: a bare take() in the middle of a cell runs a
# Spark job and then throws the result away.
print(raw_data.take(3))

# Treat the first line as the header and drop every line equal to it.
header = raw_data.first()
skip_data = raw_data.filter(lambda line: line != header)
print(skip_data.take(3))

# Split each remaining line on commas into a list of string fields.
splitlines = skip_data.map(lambda l: l.split(","))
print(splitlines.take(3))

def parseLine(col):
    """Convert one split CSV row into a LabeledPoint.

    col -- list of string fields for one row. Fields 4 and 5 and the last
           field are compared against the literal '"no"' (quotes included),
           so the values are assumed to still carry their CSV quotes.

    The label is 0 when the last field is '"no"', otherwise 1. The feature
    vector is the two 0/1 flags from fields 4 and 5 followed by fields 6 up
    to (not including) the last one, passed through unchanged.
    """
    def flag(value):
        # Anything other than the quoted literal "no" counts as 1.
        return 0 if value == '"no"' else 1

    features = [flag(col[4]), flag(col[5])]
    # NOTE(review): these are still strings from split(); presumably
    # Vectors.dense coerces them to floats -- confirm.
    features.extend(col[6:-1])
    return LabeledPoint(flag(col[-1]), Vectors.dense(features))
    
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

# Turn every split CSV row into a LabeledPoint via parseLine.
trainData = splitlines.map(parseLine)
#trainData.take(100)

### 建立模型

In [None]:
%pyspark
from pyspark.mllib.tree import DecisionTree

# Binary classifier, Gini impurity, depth capped at 5; no feature is declared
# categorical.
tree_params = dict(
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5
)
model = DecisionTree.trainClassifier(trainData, **tree_params)
         

### 繪製決策樹

In [None]:
%pyspark
# Print a heading followed by the model's debug-string representation.
print("Learned classification tree model:") 
print(model.toDebugString())

### 單筆資料預測

In [None]:
%pyspark
# Take the first labeled point and predict from its feature vector alone.
head = trainData.first()
head
first_features = head.features
model.predict(first_features)

### 產生批次預測結果

In [None]:
%pyspark
# Predict for the whole data set: strip the labels, then score the features.
feature_vectors = trainData.map(lambda p: p.features)
predictions = model.predict(feature_vectors)
predictions.take(5)

### 評估準確度

In [None]:
%pyspark
# Pair each true label with the model's prediction for the same record.
labels_and_preds = trainData.map(lambda p: p.label).zip(predictions)

# Accuracy = share of pairs whose label and prediction agree.
# NOTE(review): both sides come from trainData, so despite the variable name
# this is accuracy on the training data, not on a held-out test set.
filtered_labels_and_preds = labels_and_preds.filter(lambda pair: pair[0] == pair[1])
n_correct = filtered_labels_and_preds.count()
n_total = trainData.count()
test_accuracy = n_correct / float(n_total)
test_accuracy

### 產生Confusion Matrix

In [None]:
%pyspark
from collections import Counter

# Bring every (label, prediction) pair to the driver and tally each distinct
# combination; the four counts form the confusion matrix.
label_pred_pairs = labels_and_preds.collect()
c = Counter(label_pred_pairs)
c

### 計算 AUC

In [None]:
%pyspark
from pyspark.mllib.evaluation import BinaryClassificationMetrics

# BinaryClassificationMetrics expects (score, label) pairs, whereas
# labels_and_preds holds (label, prediction). Swap each pair before scoring;
# otherwise the label is treated as the score and the curves are wrong.
scores_and_labels = labels_and_preds.map(lambda lp: (float(lp[1]), float(lp[0])))
metrics = BinaryClassificationMetrics(scores_and_labels)

print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)

### 拿randomforest 進行評估

In [None]:
%pyspark
from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.util import MLUtils

# A regression forest is trained on the 0/1 labels, so its predictions are
# continuous values that serve as scores for the PR / ROC curves.
# The seed is fixed so the forest, and the metrics below, are repeatable.
model = RandomForest.trainRegressor(trainData, categoricalFeaturesInfo={},
                                    numTrees=100, featureSubsetStrategy="auto",
                                    impurity='variance', maxDepth=5, seed=42)
predictions = model.predict(trainData.map(lambda p: p.features))
labels_and_preds = trainData.map(lambda p: p.label).zip(predictions)

# BinaryClassificationMetrics expects (score, label) pairs; labels_and_preds
# is (label, prediction), so swap each pair before computing the metrics.
scores_and_labels = labels_and_preds.map(lambda lp: (float(lp[1]), float(lp[0])))
metrics = BinaryClassificationMetrics(scores_and_labels)
print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)

### 產生訓練與測試資料集

In [None]:
# 70/30 random split. The seed is fixed so the same records land in the same
# partition on every run, keeping the downstream metrics reproducible.
train_test_dataset = trainData.randomSplit([0.7, 0.3], seed=42)
trainset, testset = train_test_dataset

### 根據訓練資料集建立模型

In [None]:
# Fit the decision tree on the training partition only, with the same
# settings as the earlier full-data model (binary, Gini, depth <= 5).
model = DecisionTree.trainClassifier(
    trainset,
    numClasses=2,
    categoricalFeaturesInfo={},
    impurity='gini',
    maxDepth=5
)

### 利用測試資料集驗證模型

In [None]:
# Score the held-out partition and pair each true label with its prediction.
predictions = model.predict(testset.map(lambda p: p.features))
labels_and_preds = testset.map(lambda p: p.label).zip(predictions)

# BinaryClassificationMetrics expects (score, label) pairs; labels_and_preds
# is (label, prediction), so swap each pair before computing the metrics.
scores_and_labels = labels_and_preds.map(lambda lp: (float(lp[1]), float(lp[0])))
metrics = BinaryClassificationMetrics(scores_and_labels)

print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)