In [1]:
#Import Libraries

from pyspark.sql import SparkSession

# Create Spark Context
from pyspark.sql import functions as func # To use 'sum', 'count', and other functions
from pyspark.sql.types import IntegerType, DoubleType

### Pandas vs PySpark
The complexity of Data Processing Tasks: PySpark is more suitable for complex data processing tasks that involve multiple stages of data transformation and analysis. Pandas is more suitable for simple data analysis tasks that involve filtering, selecting, and aggregating data.

In [3]:
import pandas as pd # Python library to manage dataframes, similar as PySpark

In [4]:
spark = SparkSession.builder.getOrCreate()

In [6]:
bank_data = spark.read.option('header','true').options(delimiter=";").csv('bank.csv', inferSchema=True)
bank_data.show()
# NOTE:
# What is inferSchema=True???
# Infer schema will automatically guess the data types for each field. If we set this option to TRUE, the API will read some sample records from the file to infer the schema. If we want to set this value to false, we must specify a schema explicitly

+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|          job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+-------------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30|   unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|     services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35|   management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
| 30|   management|married| tertiary|     no|   1476|    yes| yes| unknown|  3|  jun|     199|       4|   -1|       0| unknown| no|
| 59|  blue-collar|married|secondary|     no|      0|    yes|  no| unknown| 

In [7]:
# Showing the type of each column

bank_data.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- y: string (nullable = true)



In [8]:
# Showing the first 5 rows of data

bank_data.head(5)

[Row(age=30, job='unemployed', marital='married', education='primary', default='no', balance=1787, housing='no', loan='no', contact='cellular', day=19, month='oct', duration=79, campaign=1, pdays=-1, previous=0, poutcome='unknown', y='no'),
 Row(age=33, job='services', marital='married', education='secondary', default='no', balance=4789, housing='yes', loan='yes', contact='cellular', day=11, month='may', duration=220, campaign=1, pdays=339, previous=4, poutcome='failure', y='no'),
 Row(age=35, job='management', marital='single', education='tertiary', default='no', balance=1350, housing='yes', loan='no', contact='cellular', day=16, month='apr', duration=185, campaign=1, pdays=330, previous=1, poutcome='failure', y='no'),
 Row(age=30, job='management', marital='married', education='tertiary', default='no', balance=1476, housing='yes', loan='yes', contact='unknown', day=3, month='jun', duration=199, campaign=4, pdays=-1, previous=0, poutcome='unknown', y='no'),
 Row(age=59, job='blue-coll

## TASK 1

### Calculate the Mean, Median, and Standard Deviation of all the variables/attributes of numeric type:

**Getting a Database:** Once you have a connected aninstance of MngoClient, you can access any database managed by the specified MngoDB server. To define which database you want to use, you can use the dot notation.

### Method 1

### Using 'describe' function to provide the basic statisticsa

In [11]:
bank_data.select('age', 'balance', 'day', 'duration', 'pdays', 'previous').describe().show()

+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|summary|               age|           balance|               day|          duration|             pdays|          previous|
+-------+------------------+------------------+------------------+------------------+------------------+------------------+
|  count|              4521|              4521|              4521|              4521|              4521|              4521|
|   mean| 41.17009511170095|1422.6578190665782|15.915284229152842|263.96129174961294|39.766644547666445|0.5425790754257908|
| stddev|10.576210958711263|3009.6381424673395| 8.247667327229934|259.85663262468216|100.12112444301656|1.6935623506071211|
|    min|                19|             -3313|                 1|                 4|                -1|                 0|
|    max|                87|             71188|                31|              3025|               871|                25|
+-------

### Method 2

In [12]:
bank_data.select(func.stddev('age')).show()

+------------------+
|       stddev(age)|
+------------------+
|10.576210958711263|
+------------------+



In [13]:
bank_data.select(func.mean('age')).show()

+-----------------+
|         avg(age)|
+-----------------+
|41.17009511170095|
+-----------------+



In [14]:
bank_data.select(func.max('age')).show()

+--------+
|max(age)|
+--------+
|      87|
+--------+



In [24]:
bank_data.select(func.min('age')).show()

+--------+
|min(age)|
+--------+
|      19|
+--------+



In [33]:
#Use describe() for summary statistics
bank_data.describe().show()
#Mean of all columns
#bank_data.select([func.mean(c) for c in bank_data.columns]).first()

+-------+------------------+-------+--------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+----+
|summary|               age|    job| marital|education|default|           balance|housing|loan| contact|               day|month|          duration|          campaign|             pdays|          previous|poutcome|   y|
+-------+------------------+-------+--------+---------+-------+------------------+-------+----+--------+------------------+-----+------------------+------------------+------------------+------------------+--------+----+
|  count|              4521|   4521|    4521|     4521|   4521|              4521|   4521|4521|    4521|              4521| 4521|              4521|              4521|              4521|              4521|    4521|4521|
|   mean| 41.17009511170095|   NULL|    NULL|     NULL|   NULL|1422.6578190665782|   NULL|NULL|    NULL|15.9152842291528