In [1]:
import os
import sys

# Make the Spark installation named by SPARK_HOME importable: append its
# launcher, Python package and bundled archive locations to sys.path, in order.
# NOTE(review): the py4j archive name pins version 0.10.9 — confirm it matches
# the archive actually shipped in this Spark installation.
spark_path = os.environ['SPARK_HOME']
sys.path.extend(
    spark_path + suffix
    for suffix in (
        "/bin",
        "/python",
        "/python/pyspark/",
        "/python/lib",
        "/python/lib/pyspark.zip",
        "/python/lib/py4j-0.10.9-src.zip",
    )
)

import findspark
findspark.init()

import pyspark

In [2]:
# Tunables that feed the master URL ('local[N]') and the driver memory setting.
number_cores = 8
memory_gb = 16

# Build the configuration one setting at a time, then start the context from it.
conf = pyspark.SparkConf()
conf.setMaster('local[{}]'.format(number_cores))
conf.set('spark.driver.memory', '{}g'.format(memory_gb))
sc = pyspark.SparkContext(conf=conf)

In [3]:
# List the data directory in pure Python. The earlier `!dir .\data` shell call lost
# its backslash and looked for `.data`, failing even though ./data/bank.csv is read below.
sorted(os.listdir('./data'))


In [4]:
# Wrap the existing SparkContext in a SparkSession; `spark.read` is used to load the CSV below.
spark = pyspark.sql.SparkSession(sc)

In [5]:
# Load the bank CSV (first line is the header) and let Spark infer column types,
# then print the resulting schema.
df = spark.read.csv('./data/bank.csv', header = True, inferSchema = True)
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [6]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import SVMWithSGD

# Toy one-feature training set, written as (label, feature value) pairs.
toy_pairs = [(0.0, 0.0), (1.0, 1.0), (1.0, 2.0), (1.0, 3.0)]
data = [LabeledPoint(label, [value]) for label, value in toy_pairs]

# Fit a linear SVM on the toy RDD, then classify a single dense point.
toy_rdd = sc.parallelize(data)
svm = SVMWithSGD.train(toy_rdd, iterations=10)
svm.predict([1.0])

1

In [7]:
# predict() is called here with an RDD of feature vectors; collect() returns the predictions as a list.
svm.predict(sc.parallelize([[1.0]])).collect()

[1]

In [8]:
import numpy 

# After clearing the threshold, predict() yields a raw score rather than a 0/1 class
# (compare the recorded output of this cell with the previous ones).
svm.clearThreshold()
svm.predict(numpy.array([1.0]))

1.4407421280634385

In [9]:
from pyspark.mllib.linalg import Matrix, Matrices, Vectors, DenseMatrix, SparseVector

# Two-dimensional sparse training points, written as (label, {index: value}) pairs.
sparse_spec = [
    (0.0, {0: -1.0}),
    (1.0, {1: 1.0}),
    (0.0, {0: 0.0}),
    (1.0, {1: 2.0}),
]
sparse_data = [LabeledPoint(label, SparseVector(2, entries)) for label, entries in sparse_spec]

# Retrain on the sparse RDD (this rebinds `svm`) and classify one sparse vector.
sparse_rdd = sc.parallelize(sparse_data)
svm = SVMWithSGD.train(sparse_rdd, iterations=10)
svm.predict(SparseVector(2, {1: 1.0}))

1

In [10]:
# Disabled example: save the model to a temporary directory and load it back.
# Left commented out, so the prediction below uses the in-memory model.
#from pyspark.mllib.classification import SVMModel
#import os, tempfile
#path = tempfile.mkdtemp()
#svm.save(sc, path)
#sameModel = SVMModel.load(sc, path)
svm.predict(SparseVector(2, {1: 1.0}))

1

In [11]:
# Classify the same vector as the first sparse training point, which carries label 0.0.
svm.predict(SparseVector(2, {0: -1.0}))

0

In [12]:
# Re-display the bank DataFrame schema before planning the feature encoding.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



### Question: Can you predict whether a client will subscribe to a term deposit (the `deposit` column, used as the label)?

### Problems:
- What data structure should the bank data be converted to?
- How to handle categorical data?

- Dense or sparse? Dense

### Data organization

LabeledPoint(deposit, [age, job, marital, education, ....])


In [13]:
# Peek at the first five rows as Row objects.
df.take(5)

[Row(age=59, job='admin.', marital='married', education='secondary', default='no', balance=2343, housing='yes', loan='no', contact='unknown', day=5, month='may', duration=1042, campaign=1, pdays=-1, previous=0, poutcome='unknown', deposit='yes'),
 Row(age=56, job='admin.', marital='married', education='secondary', default='no', balance=45, housing='no', loan='no', contact='unknown', day=5, month='may', duration=1467, campaign=1, pdays=-1, previous=0, poutcome='unknown', deposit='yes'),
 Row(age=41, job='technician', marital='married', education='secondary', default='no', balance=1270, housing='yes', loan='no', contact='unknown', day=5, month='may', duration=1389, campaign=1, pdays=-1, previous=0, poutcome='unknown', deposit='yes'),
 Row(age=55, job='services', marital='married', education='secondary', default='no', balance=2476, housing='yes', loan='no', contact='unknown', day=5, month='may', duration=579, campaign=1, pdays=-1, previous=0, poutcome='unknown', deposit='yes'),
 Row(age=5

In [14]:
# Collect the distinct values of the target column to see which labels occur.
unique_deposit = df.select('deposit').distinct().collect()
print(unique_deposit)

[Row(deposit='no'), Row(deposit='yes')]


In [15]:
# Keep the DataFrame's column order; dataPrep walks this list to build each feature vector.
columns = df.columns
print(columns)

['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'deposit']


In [16]:
# String-typed columns (including the `deposit` target) whose values get integer codes.
cat_columns = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'deposit']

In [17]:
# Build a {column: {value: integer code}} lookup for every categorical column.
# Codes are assigned in the order collect() returns the distinct rows.
# NOTE(review): that order is presumably not guaranteed by Spark, so codes may
# differ between runs — confirm before relying on hand-encoded feature vectors.
cat_dictionary = {}

for c in cat_columns:
    distinct_rows = df.select(c).distinct().collect()
    cat_dictionary[c] = {row[c]: code for code, row in enumerate(distinct_rows)}

In [18]:
# Show the value -> integer code lookup for each categorical column.
print(cat_dictionary)

{'job': {'management': 0, 'retired': 1, 'unknown': 2, 'self-employed': 3, 'student': 4, 'blue-collar': 5, 'entrepreneur': 6, 'admin.': 7, 'technician': 8, 'services': 9, 'housemaid': 10, 'unemployed': 11}, 'marital': {'divorced': 0, 'married': 1, 'single': 2}, 'education': {'unknown': 0, 'tertiary': 1, 'secondary': 2, 'primary': 3}, 'default': {'no': 0, 'yes': 1}, 'housing': {'no': 0, 'yes': 1}, 'loan': {'no': 0, 'yes': 1}, 'contact': {'unknown': 0, 'cellular': 1, 'telephone': 2}, 'month': {'jun': 0, 'aug': 1, 'may': 2, 'feb': 3, 'sep': 4, 'mar': 5, 'oct': 6, 'jul': 7, 'nov': 8, 'apr': 9, 'dec': 10, 'jan': 11}, 'poutcome': {'success': 0, 'unknown': 1, 'other': 2, 'failure': 3}, 'deposit': {'no': 0, 'yes': 1}}


In [19]:
def dataPrep(r):
    """Convert one row of the bank DataFrame into a LabeledPoint.

    Args:
        r: A row indexable by position, with values ordered like ``columns``.

    Returns:
        A LabeledPoint whose label is the integer code of the row's
        ``deposit`` value and whose features are the remaining columns in
        ``columns`` order. Categorical values are replaced by their
        ``cat_dictionary`` code; all other values are passed through as-is.

    Raises:
        KeyError: If a categorical value has no entry in ``cat_dictionary``.
    """
    key = 0
    value = []
    # enumerate() yields each column's position directly, avoiding a linear
    # columns.index(c) search for every column of every row.
    for position, c in enumerate(columns):
        cell = r[position]
        if c == 'deposit':
            # The target column becomes the label, not a feature.
            key = cat_dictionary[c][cell]
        elif c in cat_columns:
            value.append(cat_dictionary[c][cell])
        else:
            value.append(cell)
    return LabeledPoint(key, value)

In [20]:
# Sanity check: the integer code assigned to the 'no' deposit label.
cat_dictionary['deposit']['no']

0

In [21]:
# Encode every row as a LabeledPoint (label = deposit code, features = the other
# columns) and preview the first five.
df_clean = df.rdd.map(dataPrep)
df_clean.take(5)

[LabeledPoint(1.0, [59.0,7.0,1.0,2.0,0.0,2343.0,1.0,0.0,0.0,5.0,2.0,1042.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [56.0,7.0,1.0,2.0,0.0,45.0,0.0,0.0,0.0,5.0,2.0,1467.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [41.0,8.0,1.0,2.0,0.0,1270.0,1.0,0.0,0.0,5.0,2.0,1389.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [55.0,9.0,1.0,2.0,0.0,2476.0,1.0,0.0,0.0,5.0,2.0,579.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [54.0,7.0,1.0,1.0,0.0,184.0,0.0,0.0,0.0,5.0,2.0,673.0,2.0,-1.0,0.0,1.0])]

In [22]:
# Position of the target column within `columns`.
columns.index("deposit")

16

In [35]:
# Randomly split the encoded data with weights 0.8 / 0.2 (training / testing) and seed 1234.
# A full workflow would use three sets: training, testing and validation.
df_svm = df_clean.randomSplit([0.8, 0.2], 1234)

In [24]:
# Report row counts: the full encoded data set, then each split in order.
for subset in (df_clean, df_svm[0], df_svm[1]):
    print(subset.count())

11162
8925
2237


In [25]:
# Train the SVM on the first split (df_svm[0]) with 200 iterations.
svm_bank = SVMWithSGD.train(df_svm[0], iterations=200)

In [26]:
# Spot-check with a hand-written encoded vector; its values match the third row
# previewed from df_clean, whose label is 1.0.
svm_bank.predict([41.0,8.0,1.0,2.0,0.0,1270.0,1.0,0.0,0.0,5.0,2.0,1389.0,1.0,-1.0,0.0,1.0])

1

In [27]:
def testPrediction(p):
    """Score one labeled point against the trained ``svm_bank`` model.

    Returns ``("correct", 1)`` when the model's prediction for ``p.features``
    equals ``p.label`` and ``("incorrect", 1)`` otherwise, so the pairs can be
    summed per key.
    """
    outcome = "correct" if svm_bank.predict(p.features) == p.label else "incorrect"
    return (outcome, 1)

In [28]:
# Tally correct vs. incorrect predictions over the second split (df_svm[1]).
df_results = df_svm[1].map(testPrediction).reduceByKey(lambda x, y: x + y)
df_results.collect()

[('correct', 1272), ('incorrect', 965)]

In [29]:
# Accuracy on the test split, derived from the tallied results rather than from
# numbers copied by hand out of the previous cell's output.
result_counts = dict(df_results.collect())
total_scored = sum(result_counts.values())
# Guard against an empty test split so the cell shows NaN instead of raising.
result_counts.get('correct', 0) / float(total_scored) if total_scored else float('nan')

0.5686186857398301

In [30]:
# Sample the second split without replacement at a 0.01 fraction for manual inspection.
df_svm[1].sample(False, 0.01).collect()

[LabeledPoint(1.0, [41.0,5.0,1.0,2.0,0.0,1384.0,1.0,0.0,0.0,15.0,2.0,1162.0,4.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [45.0,5.0,0.0,3.0,0.0,594.0,1.0,0.0,0.0,29.0,2.0,833.0,2.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [28.0,3.0,1.0,2.0,0.0,123.0,0.0,1.0,2.0,22.0,6.0,313.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [37.0,9.0,1.0,2.0,0.0,1970.0,1.0,0.0,2.0,5.0,8.0,253.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [59.0,1.0,0.0,2.0,0.0,514.0,0.0,0.0,1.0,4.0,2.0,673.0,6.0,90.0,1.0,0.0]),
 LabeledPoint(1.0, [36.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,8.0,2.0,308.0,3.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [54.0,11.0,1.0,2.0,0.0,582.0,1.0,0.0,1.0,15.0,2.0,693.0,5.0,372.0,2.0,3.0]),
 LabeledPoint(1.0, [26.0,0.0,2.0,1.0,0.0,1623.0,0.0,0.0,1.0,2.0,0.0,479.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [72.0,1.0,1.0,0.0,0.0,1940.0,0.0,0.0,2.0,4.0,10.0,705.0,2.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [29.0,11.0,2.0,0.0,0.0,1584.0,0.0,0.0,1.0,6.0,4.0,245.0,1.0,-1.0,0.0,1.0]),
 LabeledPoint(1.0, [38.0,6.0,2.0,2.0,0.0,2543.0,0.0,0.0,1.

In [31]:
# Hand-built point with label 1.0 and an already-encoded feature vector.
x = LabeledPoint(1.0, [34.0,9.0,1.0,2.0,0.0,-538.0,1.0,0.0,0.0,26.0,2.0,682.0,1.0,-1.0,0.0,1.0])

In [32]:
# Display the point to check how it was stored.
x

LabeledPoint(1.0, [34.0,9.0,1.0,2.0,0.0,-538.0,1.0,0.0,0.0,26.0,2.0,682.0,1.0,-1.0,0.0,1.0])

In [33]:
# Classify the hand-built point's features; `x` carries label 1.0, so compare
# the prediction against that.
svm_bank.predict(x.features)

0

In [34]:
# Stop the SparkContext now that the analysis is finished.
sc.stop()