## 安裝 Spark

In [0]:
! apt-get install openjdk-8-jdk-headless -qq > /dev/null

In [0]:
! wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz

In [0]:
! tar xf spark-2.4.5-bin-hadoop2.7.tgz

In [0]:
! pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-1.8.0-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

In [0]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

In [0]:
from pyspark import SparkContext
sc = SparkContext.getOrCreate()

In [8]:
sc

##  建立客戶流失預測模型

In [9]:
! wget "https://raw.githubusercontent.com/ywchiu/pyspark/master/data/customer_churn.csv"


--2020-05-13 11:42:58--  https://raw.githubusercontent.com/ywchiu/pyspark/master/data/customer_churn.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 356734 (348K) [text/plain]
Saving to: ‘customer_churn.csv’


2020-05-13 11:42:59 (11.0 MB/s) - ‘customer_churn.csv’ saved [356734/356734]



In [10]:
! head customer_churn.csv

"","state","account_length","area_code","international_plan","voice_mail_plan","number_vmail_messages","total_day_minutes","total_day_calls","total_day_charge","total_eve_minutes","total_eve_calls","total_eve_charge","total_night_minutes","total_night_calls","total_night_charge","total_intl_minutes","total_intl_calls","total_intl_charge","number_customer_service_calls","churn"
"1","KS",128,"area_code_415","no","yes",25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10,3,2.7,1,"no"
"2","OH",107,"area_code_415","no","yes",26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,"no"
"3","NJ",137,"area_code_415","no","no",0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,"no"
"4","OH",84,"area_code_408","yes","no",0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,"no"
"5","OK",75,"area_code_415","yes","no",0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,"no"
"6","AL",118,"area_code_510","yes","no",0,223.4,98,37.98,220.6,101,18.75,203.9,118,9.18,6.3,6

In [11]:
raw_data = sc.textFile("customer_churn.csv")
raw_data

customer_churn.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [12]:
raw_data.take(3)

['"","state","account_length","area_code","international_plan","voice_mail_plan","number_vmail_messages","total_day_minutes","total_day_calls","total_day_charge","total_eve_minutes","total_eve_calls","total_eve_charge","total_night_minutes","total_night_calls","total_night_charge","total_intl_minutes","total_intl_calls","total_intl_charge","number_customer_service_calls","churn"',
 '"1","KS",128,"area_code_415","no","yes",25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10,3,2.7,1,"no"',
 '"2","OH",107,"area_code_415","no","yes",26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,"no"']

In [13]:
header = raw_data.first()
header

'"","state","account_length","area_code","international_plan","voice_mail_plan","number_vmail_messages","total_day_minutes","total_day_calls","total_day_charge","total_eve_minutes","total_eve_calls","total_eve_charge","total_night_minutes","total_night_calls","total_night_charge","total_intl_minutes","total_intl_calls","total_intl_charge","number_customer_service_calls","churn"'

In [14]:
skip_data = raw_data.filter(lambda line: line != header)
skip_data.take(3)

['"1","KS",128,"area_code_415","no","yes",25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10,3,2.7,1,"no"',
 '"2","OH",107,"area_code_415","no","yes",26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,"no"',
 '"3","NJ",137,"area_code_415","no","no",0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,"no"']

- 結構化資料
- 沒有缺失值
- 所有資料都必須是數值資料

In [15]:
splitlines = skip_data.map(lambda l: l.split(","))
splitlines.take(3)

[['"1"',
  '"KS"',
  '128',
  '"area_code_415"',
  '"no"',
  '"yes"',
  '25',
  '265.1',
  '110',
  '45.07',
  '197.4',
  '99',
  '16.78',
  '244.7',
  '91',
  '11.01',
  '10',
  '3',
  '2.7',
  '1',
  '"no"'],
 ['"2"',
  '"OH"',
  '107',
  '"area_code_415"',
  '"no"',
  '"yes"',
  '26',
  '161.6',
  '123',
  '27.47',
  '195.5',
  '103',
  '16.62',
  '254.4',
  '103',
  '11.45',
  '13.7',
  '3',
  '3.7',
  '1',
  '"no"'],
 ['"3"',
  '"NJ"',
  '137',
  '"area_code_415"',
  '"no"',
  '"no"',
  '0',
  '243.4',
  '114',
  '41.38',
  '121.2',
  '110',
  '10.3',
  '162.6',
  '104',
  '7.32',
  '12.2',
  '5',
  '3.29',
  '0',
  '"no"']]

In [0]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vectors, Matrices

def parseLine(col):
    features = []
    churn = col[-1]
    international = 0 if col[4] == '"no"' else 1
    voice = 0 if col[5] == '"no"'  else 1
    label = 0 if churn == '"no"' else 1
    features.append(international)
    features.append(voice)
    features += col[6:-1]
    return LabeledPoint(label, Vectors.dense(features))

In [0]:
trainData = splitlines.map(parseLine)

In [20]:
trainData.take(3)

[LabeledPoint(0.0, [0.0,1.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1.0]),
 LabeledPoint(0.0, [0.0,1.0,26.0,161.6,123.0,27.47,195.5,103.0,16.62,254.4,103.0,11.45,13.7,3.0,3.7,1.0]),
 LabeledPoint(0.0, [0.0,0.0,0.0,243.4,114.0,41.38,121.2,110.0,10.3,162.6,104.0,7.32,12.2,5.0,3.29,0.0])]

In [0]:
from pyspark.mllib.tree import DecisionTree

model = DecisionTree.trainClassifier(trainData, numClasses=2, 
                    categoricalFeaturesInfo={},impurity='gini', maxDepth=5)


In [22]:
print("Learned classification tree model:")
print(model.toDebugString())


Learned classification tree model:
DecisionTreeModel classifier of depth 5 with 47 nodes
  If (feature 3 <= 264.75)
   If (feature 15 <= 3.5)
    If (feature 0 <= 0.5)
     If (feature 3 <= 221.5)
      Predict: 0.0
     Else (feature 3 > 221.5)
      If (feature 6 <= 259.85)
       Predict: 0.0
      Else (feature 6 > 259.85)
       Predict: 1.0
    Else (feature 0 > 0.5)
     If (feature 13 <= 2.5)
      Predict: 1.0
     Else (feature 13 > 2.5)
      If (feature 12 <= 12.95)
       Predict: 0.0
      Else (feature 12 > 12.95)
       Predict: 1.0
   Else (feature 15 > 3.5)
    If (feature 3 <= 161.35000000000002)
     If (feature 6 <= 230.25)
      Predict: 1.0
     Else (feature 6 > 230.25)
      If (feature 3 <= 138.55)
       Predict: 1.0
      Else (feature 3 > 138.55)
       Predict: 0.0
    Else (feature 3 > 161.35000000000002)
     If (feature 6 <= 156.05)
      If (feature 3 <= 193.35000000000002)
       Predict: 1.0
      Else (feature 3 > 193.35000000000002)
       Predict:

In [23]:
head = trainData.first()
head

LabeledPoint(0.0, [0.0,1.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1.0])

In [24]:
head.features

DenseVector([0.0, 1.0, 25.0, 265.1, 110.0, 45.07, 197.4, 99.0, 16.78, 244.7, 91.0, 11.01, 10.0, 3.0, 2.7, 1.0])

In [25]:
print(model.predict(head.features))

0.0


In [0]:
predictions = model.predict(trainData.map(lambda p: p.features))

In [28]:
predictions.take(3)

[0.0, 0.0, 0.0]

In [0]:
labels_and_preds = trainData.map(lambda p: p.label).zip(predictions)

In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics
metrics = MulticlassMetrics(labels_and_preds)

In [31]:
precision = metrics.precision()
recall = metrics.recall()
f1Score = metrics.fMeasure()

print("Summary Stats")
print("Precision = %s" % precision)
print("Recall = %s" % recall)
print("F1 Score = %s" % f1Score)


Summary Stats
Precision = 0.9492949294929492
Recall = 0.9492949294929492
F1 Score = 0.9492949294929492


In [33]:
dir(metrics.confusionMatrix())

['__UDT__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_convert_to_array',
 'asML',
 'isTransposed',
 'numCols',
 'numRows',
 'toArray',
 'toSparse',
 'values']

In [34]:
metrics.confusionMatrix().toArray()

array([[2817.,  136.],
       [  33.,  347.]])

In [35]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

metrics = BinaryClassificationMetrics(labels_and_preds)

print("Area under PR = %s" % metrics.areaUnderPR)
print("Area under ROC = %s" % metrics.areaUnderROC)


Area under PR = 0.6921821611713312
Area under ROC = 0.9335515176359455


## 降低維度
- 欄位篩選 (Feature Selection)
- 欄位萃取 (Feature Extraction)

### 欄位篩選

In [0]:
import pandas
df = pandas.read_csv('customer_churn.csv', index_col = 0)

In [0]:
df = df.iloc[:,3:]

In [41]:
df.head()

Unnamed: 0,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,no
2,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,no
3,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,no
4,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,no
5,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,no


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3333 entries, 1 to 3333
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   international_plan             3333 non-null   object 
 1   voice_mail_plan                3333 non-null   object 
 2   number_vmail_messages          3333 non-null   int64  
 3   total_day_minutes              3333 non-null   float64
 4   total_day_calls                3333 non-null   int64  
 5   total_day_charge               3333 non-null   float64
 6   total_eve_minutes              3333 non-null   float64
 7   total_eve_calls                3333 non-null   int64  
 8   total_eve_charge               3333 non-null   float64
 9   total_night_minutes            3333 non-null   float64
 10  total_night_calls              3333 non-null   int64  
 11  total_night_charge             3333 non-null   float64
 12  total_intl_minutes             3333 non-null   f

In [0]:
df['international_plan'] = df['international_plan'].replace({'no': 0, 'yes': 1})
df['voice_mail_plan'] = df['voice_mail_plan'].replace({'no': 0, 'yes': 1})
df['churn'] = df['churn'].replace({'no': 0, 'yes': 1})

In [44]:
df.head()

Unnamed: 0,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
1,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
2,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
3,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
4,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
5,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [0]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1].values

In [0]:
from itertools import combinations
total_features = len(X.columns)
ary = []
for i in range(1, total_features + 1):
  for ele in combinations(X.columns, i):
    ary.append(ele)

In [51]:
len(ary)

65535

In [0]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# create the RFE model and select 3 attributes
clf_LR = LogisticRegression(C=1e30)
clf_LR.fit(X,y)

rfe = RFE(clf_LR, 10)
rfe = rfe.fit(X, y)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

In [0]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
# create the RFE model and select 3 attributes
clf_LR = LogisticRegression(C=1e30)
clf_LR.fit(X,y)

rfe = RFECV(clf_LR,  step=1, cv=5)
rfe = rfe.fit(X, y)

# summarize the selection of the attributes
print(rfe.support_)
print(rfe.ranking_)

In [56]:
from sklearn.decomposition import PCA 
pca = PCA(n_components=10, whiten=True)
X_train_pca = pca.fit_transform(X)
explained_variance = pca.explained_variance_
print('Expected Variance is '+ str(explained_variance))

Expected Variance is [3.05388403e+03 2.61220489e+03 2.54077186e+03 4.05279651e+02
 3.97223653e+02 3.79196617e+02 1.87423545e+02 8.37522809e+00
 6.03140001e+00 1.72770261e+00]


## Spark Streaming
- https://github.com/ywchiu/chtspark/blob/master/Streaming/Streaming.py
- https://github.com/ywchiu/chtspark/blob/master/Streaming/StreamingWindow.py