In [1]:
import pandas as pd
import numpy as np

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, isnan, when, count, isnull, sum, median, mean, first, max, min, mode
from pyspark.sql.types import StructType, StructField, StringType, LongType, DoubleType

In [2]:
# Start (or reuse) the Spark session that every later cell reads through.
spark = (
    SparkSession.builder
    .appName("test")
    .getOrCreate()
)

25/01/30 16:20:05 WARN Utils: Your hostname, Vasileioss-Laptop.local resolves to a loopback address: 127.0.0.1; using 192.168.0.67 instead (on interface en0)
25/01/30 16:20:05 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/30 16:20:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Load data

In [3]:
# Read the customer CSV: the first line is the header and column types are inferred.
df = spark.read.csv(
    '../data/cust_data.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Preview the loaded data; show() prints the first 20 rows by default.
df.show()

+---+------+---+-------------------+---------+------------------------------+---------+--------+------------------+-------------------+-------+-----+
| id|Gender|Age|Has_Mobile_Contract|Area_Code|Currently_Holds_Second_Product|   Tenure|App_User|Num_website_visits|Acquisition_Channel|Revenue|Label|
+---+------+---+-------------------+---------+------------------------------+---------+--------+------------------+-------------------+-------+-----+
|  1|  Male| 44|                  1|       28|                             0|> 2 Years|     Yes|              NULL|                 26|  40.45|    1|
|  2|  Male| 76|                  1|        3|                             0| 1-2 Year|      No|              NULL|                 26|  33.54|    0|
|  3|  Male| 47|                  1|       28|                             0|> 2 Years|     Yes|              NULL|                 26|  38.29|    1|
|  4|  Male| 21|                  1|       11|                             1| < 1 Year|      No|    

In [5]:
# Report the dataset size: df.count() runs a Spark job, len(df.columns) is metadata only.
print(f"Number of rows: {df.count()} and columns: {len(df.columns)}")

Number of rows: 381109 and columns: 12


In [6]:
# Inspect the column types that schema inference assigned when the CSV was read.
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Has_Mobile_Contract: integer (nullable = true)
 |-- Area_Code: integer (nullable = true)
 |-- Currently_Holds_Second_Product: integer (nullable = true)
 |-- Tenure: string (nullable = true)
 |-- App_User: string (nullable = true)
 |-- Num_website_visits: string (nullable = true)
 |-- Acquisition_Channel: integer (nullable = true)
 |-- Revenue: double (nullable = true)
 |-- Label: integer (nullable = true)



# Handle missing values

In [7]:
# Drop the row identifier and Num_website_visits, which was inferred as a
# string column and is NULL in the rows previewed earlier.
df = df.drop('id', 'Num_website_visits')

In [8]:
# Summary statistics for the numeric and string columns.
# NOTE(review): the recorded run of this cell crashed the JVM (SIGSEGV inside
# libjvm), so no statistics were produced. This is presumably an environment
# issue rather than a code issue; confirm by re-running on a fresh kernel.
df.describe().show()

25/01/30 16:20:10 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/Users/vasileiosvyzas/miniconda3/envs/pyspark_ml_env/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/vasileiosvyzas/miniconda3/envs/pyspark_ml_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/vasileiosvyzas/miniconda3/envs/pyspark_ml_env/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.prot

#
# A fatal error has been detected by the Java Runtime Environment:
#
#  SIGSEGV (0xb) at pc=0x000000010a2a5150, pid=41405, tid=71683
#
# JRE version: OpenJDK Runtime Environment Homebrew (11.0.26) (build 11.0.26+0)
# Java VM: OpenJDK 64-Bit Server VM Homebrew (11.0.26+0, mixed mode, tiered, compressed oops, g1 gc, bsd-aarch64)
# Problematic frame:
# V  [libjvm.dylib+0x695150]  ObjectSynchronizer::inflate(Thread*, oopDesc*, ObjectSynchronizer::InflateCause)+0x18c
#
# No core dump will be written. Core dumps have been disabled. To enable core dumping, try "ulimit -c unlimited" before starting Java again
#
# An error report file with more information is saved as:
# /Users/vasileiosvyzas/workspace/side-projects/pyspark-machine-learning/notebooks/hs_err_pid41405.log
#
# If you would like to submit a bug report, please visit:
#   https://github.com/Homebrew/homebrew-core/issues
#


Py4JError: An error occurred while calling o38.showString

In [None]:
# Count rows with (true) and without (false) a null Has_Mobile_Contract before imputing.
df.groupby(isnull('Has_Mobile_Contract')).count().show()

In [15]:
def get_categorical_mode(df, column_name):
    """Return the most frequent value of a column.

    Args:
        df: Spark DataFrame to aggregate.
        column_name: Name of the column whose mode is computed.

    Returns:
        The first field of the single aggregated row, i.e. the value that
        ``mode`` reports for ``column_name``.
    """
    # A global aggregate yields exactly one row with one field.
    mode_frame = df.select(mode(column_name))
    mode_row = mode_frame.first()
    return mode_row[0]

In [12]:
# Impute missing Has_Mobile_Contract values with the column's most frequent value.
has_mobile_contract_mode = get_categorical_mode(df, 'Has_Mobile_Contract')
df = df.fillna(value=has_mobile_contract_mode, subset='Has_Mobile_Contract')

In [None]:
# Re-run the null count on Has_Mobile_Contract to verify the imputation.
df.groupby(isnull('Has_Mobile_Contract')).count().show()

In [None]:
# Count rows with (true) and without (false) a null Tenure before imputing.
df.groupby(isnull('Tenure')).count().show()

In [19]:
# Impute missing Tenure values with the column's most frequent category.
tenure_mode = get_categorical_mode(df, 'Tenure')
df = df.fillna(value=tenure_mode, subset='Tenure')

In [None]:
# Re-run the null count on Tenure to verify the imputation.
df.groupby(isnull('Tenure')).count().show()

In [None]:
# Count rows with (true) and without (false) a null Area_Code before imputing.
df.groupby(isnull('Area_Code')).count().show()

In [26]:
# Median of Area_Code, later used as the fill value for missing entries.
# NOTE(review): Area_Code looks like a categorical code; confirm that the
# median (rather than the mode) is the intended imputation value.
area_code_median = df.select(median('Area_Code')).first()[0]

In [None]:
# Check the Python type returned for the median before casting it.
type(area_code_median)

In [28]:
# Cast to int (dropping any fractional part) so the fill value is a whole
# number, like the integer Area_Code column.
area_code_median = int(area_code_median)

In [None]:
# Confirm the cast produced an int.
type(area_code_median)

In [None]:
# Replace null Area_Code values with the integer median.
df = df.fillna(value=area_code_median, subset='Area_Code')

# Handle out of range values on the Age column

In [34]:
# 190 and -1 are out-of-range placeholder ages; turn them into real nulls.
# Use None rather than np.nan: in Spark, NaN is an ordinary double value, not a
# null. Writing NaN would promote the integer Age column to double, and the NaN
# rows would still take part in aggregates such as median (NaN sorts as the
# largest value), which biases the imputed value. Nulls are skipped by
# aggregates and are what fillna targets.
df = df.withColumn(
    'Age',
    when(df['Age'].isin(190, -1), None).otherwise(df['Age']),
)

In [None]:
# Impute missing Age values with the column median, truncated to a whole number.
age_median = int(df.select(median('Age')).first()[0])
df = df.fillna(value=age_median, subset='Age')

In [None]:
# Recompute summary statistics to review the columns after cleaning.
df.describe().show()