In [None]:
!pip install pyspark

In [7]:
from pyspark.sql import SparkSession,SQLContext
import pandas as pd

In [8]:
# Create the Spark session for this notebook, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("Banking Marketing")
    .getOrCreate()
)

In [None]:
from google.colab import drive

# Mount Google Drive so the dataset stored under MyDrive is reachable from this runtime
drive.mount('/content/drive')

# Read the CSV using its first row as the header and letting Spark infer column types
df = spark.read.csv(
    "/content/drive/MyDrive/bank.csv",
    inferSchema=True,
    header=True,
)

In [10]:
# Print the schema (column names, inferred types, nullability) to check how the CSV was read
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [11]:
# String-typed columns that must be encoded numerically before modelling;
# 'deposit' is the column later used (after indexing) as the label.
string_features = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome', 'deposit',
]

In [12]:
# Encode every string column as a numeric index column
from pyspark.ml.feature import StringIndexer

for column in string_features:
    # Fit an indexer on the current frame and append "<column>_indexer" next to the original column
    indexer = StringIndexer(inputCol=column, outputCol=column + "_indexer")
    df = indexer.fit(df).transform(df)

In [13]:
# Preview the frame, which now holds both the original string columns and their "_indexer" counterparts
df.show()

+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
|age|        job| marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|job_indexer|marital_indexer|education_indexer|default_indexer|housing_indexer|loan_indexer|contact_indexer|month_indexer|poutcome_indexer|deposit_indexer|
+---+-----------+--------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
| 59|     admin.| married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|   

In [14]:
# Drop the original string columns now that each one has a numeric "_indexer" counterpart
df = df.drop(*string_features)

In [None]:
# Print the schema again to confirm the string columns were removed
df.printSchema()

In [16]:
# Preview the frame after the string columns were dropped
df.show()

+---+-------+---+--------+--------+-----+--------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
|age|balance|day|duration|campaign|pdays|previous|job_indexer|marital_indexer|education_indexer|default_indexer|housing_indexer|loan_indexer|contact_indexer|month_indexer|poutcome_indexer|deposit_indexer|
+---+-------+---+--------+--------+-----+--------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+
| 59|   2343|  5|    1042|       1|   -1|       0|        3.0|            0.0|              0.0|            0.0|            1.0|         0.0|            1.0|          0.0|             0.0|            1.0|
| 56|     45|  5|    1467|       1|   -1|       0|        3.0|            0.0|              0.0|            0.0|            0.0|         0.0|            1.0|          0.0|         

In [17]:
# Use every column except the label as a model feature.
# The label is excluded by name rather than by position, so the selection stays correct
# even if the column order changes (e.g. `string_features` is reordered or a column is added).
feature_names = [name for name in df.columns if name != "deposit_indexer"]

In [None]:
# Display the list of column names selected as model features
feature_names

In [19]:
from pyspark.ml.feature import VectorAssembler

# Combine the individual feature columns into a single vector column named "features"
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_names,
)
# Append the assembled vector column to the frame
transformed_data = assembler.transform(df)

In [20]:
# Preview the frame with the assembled "features" vector column appended
transformed_data.show()

+---+-------+---+--------+--------+-----+--------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+--------------------+
|age|balance|day|duration|campaign|pdays|previous|job_indexer|marital_indexer|education_indexer|default_indexer|housing_indexer|loan_indexer|contact_indexer|month_indexer|poutcome_indexer|deposit_indexer|            features|
+---+-------+---+--------+--------+-----+--------+-----------+---------------+-----------------+---------------+---------------+------------+---------------+-------------+----------------+---------------+--------------------+
| 59|   2343|  5|    1042|       1|   -1|       0|        3.0|            0.0|              0.0|            0.0|            1.0|         0.0|            1.0|          0.0|             0.0|            1.0|(16,[0,1,2,3,4,5,...|
| 56|     45|  5|    1467|       1|   -1|       0|        3.0|            0.0|              0.0|

In [21]:
# Split into training (weight 0.8) and test (weight 0.2) sets.
# A fixed seed keeps the split — and therefore the reported metric — reproducible across runs.
(training_data, test_data) = transformed_data.randomSplit([0.8, 0.2], seed=42)

In [22]:
from pyspark.ml.classification import LogisticRegression

# Unfitted logistic regression estimator: reads the "features" vector column,
# uses the indexed deposit column as the label, and runs at most 30 iterations.
model = LogisticRegression(
    maxIter=30,
    labelCol='deposit_indexer',
    featuresCol='features',
)

In [23]:
# Fit the logistic regression estimator on the training split
fit_model = model.fit(training_data)

In [24]:
# Predict on the test split with the fitted model
y_pred = fit_model.transform(test_data)

In [25]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Compute accuracy of the predictions against the indexed deposit label
multi_evaluator = MulticlassClassificationEvaluator(
    metricName='accuracy',
    labelCol='deposit_indexer',
)
accuracy = multi_evaluator.evaluate(y_pred)
print('Logistic Regression Accuracy:', accuracy)


Logistic Regression Accuracy: 0.7877295118674429
