In [1]:
from os.path import abspath

from pyspark.sql import SparkSession

# Default location for managed databases and tables.
warehouse_location = abspath('spark-warehouse')

# Build (or reuse, via getOrCreate) a local Hive-enabled Spark session.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ISM6562 PySpark Tutorials")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the Spark API; it is created
# together with the SparkSession.
sc = spark.sparkContext

# Note: if several Spark sessions are running (e.g. from a previously run
# notebook), this session's web UI will be on a different port than the
# default (4040). The line below identifies that port: 4040 means this is the
# only running session, a higher number means other sessions are still running.
spark_session_port = sc.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

23/10/26 15:23:28 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:23:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:23:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


The Incident Management dataset has about 141,712 records covering 24,918 incidents. Each state an incident passes through is captured as an individual record, with a few exceptions where the closed state of an incident is recorded more than once. The code below loads and cleans the Incident Management data so that only one record, representing the truly closed state, is kept per incident.

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Read the incident event log: the first line supplies the column names and
# the column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('data/incident_event_log_reduced.csv')
)

# Preview the first 5 rows of the DataFrame.
df.show(5)

23/10/26 15:23:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+--------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+---+------+---------+-----------+---------------+---------------+--------------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by|sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| assigned_to|knowledge|u_priority_confirmation|       notify|   problem_id|rfc|vendor|caused_by|closed_code|    resolved_by|    resolved_at|     closed_at|
+----------+--------------+------+----------------

The dataset contains multiple states (New, Active, Awaiting user info, Resolved, Closed, etc.) for each incident. The command below keeps just one record per incident: the one that represents the truly closed state of the incident.

In [3]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Keep exactly one 'Closed' record per incident: the one with the highest
# sys_mod_count.
#
# sort(...).dropDuplicates([...]) does NOT guarantee that the first row in sort
# order is the one retained, so the surviving record could be any of the
# duplicates. Ranking rows inside a per-incident window makes the choice
# explicit.
# NOTE(review): rows that tie on sys_mod_count within one incident are still
# picked arbitrarily; add a tie-breaker column to orderBy if that matters.
_latest_first = Window.partitionBy("number").orderBy(F.col("sys_mod_count").desc())

df_unique_incidents = (
    df.filter(F.col("incident_state") == "Closed")
    .withColumn("_closed_rank", F.row_number().over(_latest_first))
    .filter(F.col("_closed_rank") == 1)
    .drop("_closed_rank")  # helper column only; schema matches df again
)

Select the dependent variable (`priority`) and the independent variables identified as the most useful attributes for making predictions, then drop any rows with missing values.

In [4]:
# Keep the seven predictor columns plus the target ('priority'), then discard
# every row that has a null in any of them.
data = (
    df_unique_incidents
    .select(
        'caller_id',
        'opened_by',
        'location',
        'category',
        'subcategory',
        'u_symptom',
        'assignment_group',
        'priority',
    )
    .dropna()
)

Create a 70-30 train test split

In [5]:
# 70/30 train/test split. A fixed seed makes the split (and therefore the
# accuracy reported further down) repeatable across kernel restarts; without
# it every run draws a different partition of the rows.
train_data, test_data = data.randomSplit([0.7, 0.3], seed=42)

## Building the Decision Tree Classifier

In [6]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler,StringIndexer
from pyspark.ml import Pipeline

Use StringIndexer to convert the categorical columns to hold numerical data

In [7]:
def _keep_indexer(inputCol, outputCol):
    """Return a StringIndexer mapping `inputCol` to `outputCol`.

    handleInvalid='keep' is applied to every indexer so that values not seen
    during fitting are assigned an extra index instead of raising an error.
    """
    return StringIndexer(inputCol=inputCol, outputCol=outputCol, handleInvalid='keep')


# One indexer per categorical predictor ...
caller_id_indexer = _keep_indexer('caller_id', 'caller_id_index')
opened_by_indexer = _keep_indexer('opened_by', 'opened_by_index')
location_indexer = _keep_indexer('location', 'location_index')
category_indexer = _keep_indexer('category', 'category_index')
subcategory_indexer = _keep_indexer('subcategory', 'subcategory_index')
u_symptom_indexer = _keep_indexer('u_symptom', 'u_symptom_index')
assignment_group_indexer = _keep_indexer('assignment_group', 'assignment_group_index')

# ... and one for the target column.
priority_indexer = _keep_indexer('priority', 'priority_index')

Vector assembler is used to create a vector of input features

In [8]:
# Indexed predictor columns, in the order they appear in the feature vector.
# 'priority_index' is deliberately absent: it is the label, not a feature.
feature_index_cols = [
    'caller_id_index',
    'opened_by_index',
    'location_index',
    'category_index',
    'subcategory_index',
    'u_symptom_index',
    'assignment_group_index',
]

# Combine the indexed columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_index_cols, outputCol="features")

Create an object for the Decision Tree classifier. Set the `maxBins` parameter to a value that is equal to or greater than the number of categories in any single feature.

In [9]:
# Decision tree that predicts the indexed priority from the assembled vector.
# featuresCol is spelled out for readability ('features' is also the default).
# maxBins must be at least the number of categories in any single feature.
dt_model = DecisionTreeClassifier(
    featuresCol='features',
    labelCol='priority_index',
    maxBins=5000,
)

A Pipeline is used to pass the data through the indexers, the assembler, and the model in sequence. It also ensures that the test data is preprocessed in exactly the same way as the training data.

In [10]:
# Stages run in list order: index every categorical column, assemble the
# indexed predictors into one vector, then fit the decision tree.
pipeline_stages = [
    caller_id_indexer,
    opened_by_indexer,
    location_indexer,
    category_indexer,
    subcategory_indexer,
    u_symptom_indexer,
    assignment_group_indexer,
    priority_indexer,
    assembler,
    dt_model,
]

pipe = Pipeline(stages=pipeline_stages)

In [11]:
# Fit every pipeline stage (indexers, assembler, tree) on the training split only.
fit_model = pipe.fit(dataset=train_data)

                                                                                

Store the results in a dataframe

In [12]:
# Run the held-out split through the fitted pipeline; the output includes the
# 'priority_index' label column and the model's 'prediction' column.
results = fit_model.transform(dataset=test_data)

In [13]:
# Compare the true label index with the predicted one (top 20 rows by default).
results.select('priority_index', 'prediction').show()

+--------------+----------+
|priority_index|prediction|
+--------------+----------+
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
|           1.0|       0.0|
|           0.0|       0.0|
|           0.0|       0.0|
+--------------+----------+
only showing top 20 rows



Evaluating the model

In [14]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [15]:
# Score the test-set predictions against the indexed priority labels using
# the "accuracy" metric.
ACC_evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="priority_index",
    predictionCol="prediction",
)
accuracy = ACC_evaluator.evaluate(results)
print(f"The accuracy of the decision tree classifier is {accuracy}")

The accuracy of the decision tree classifier is 0.9429530201342282


NOTE: If you wish to look at other model evaluation metrics, see previous notebooks in this series for examples of f1_score, precision, recall, areaUnderROC, and areaUnderPR.

In [16]:
# Shut down the Spark session created at the top of the notebook. Stopping it
# here avoids leaving it running after the notebook finishes (a leftover
# session makes the next notebook's web UI start on a port other than 4040).
spark.stop()