In [1]:
from pyspark.sql import SparkSession;

# warehouse_location points to the default location for managed databases and tables
from os.path import abspath
warehouse_location = abspath('spark-warehouse')

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("ISM6562 PySpark Tutorials") \
    .config("spark.sql.warehouse.dir", warehouse_location) \
    .enableHiveSupport() \
    .getOrCreate()


# Let's get the SparkContext object. It's the entry point to the Spark API. It's created when you create a sparksession
sc = spark.sparkContext

# note: If you have multiple spark sessions running (like from a previous notebook you've run), 
# this spark session webUI will be on a different port than the default (4040). One way to 
# identify this part is with the following line. If there was only one spark session running, 
# this will be 4040. If it's higher, it means there are still other spark sesssions still running.
spark_session_port = spark.sparkContext.uiWebUrl.split(":")[-1]
print("Spark Session WebUI Port: " + spark_session_port)

23/10/26 15:20:07 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:20:07 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:20:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


## Logistic Regression to predict whether an incident met SLA

The Incident Management dataset has about 141712 records of 24918 incidents. Each state of the incident is being captured as an individual record with few exceptions where the closed state of an incident is recorded more than once. With the help of the below segment of the code, we load and clean the Incident Management data so that only one record representing the truly closed state per incident is obtained.

In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

df = spark.read.csv('data/incident_event_log_reduced.csv', header=True, inferSchema=True)

# display the first 5 rows of the dataframe
df.show(5)

23/10/26 15:20:14 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+--------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+---+------+---------+-----------+---------------+---------------+--------------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by|sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| assigned_to|knowledge|u_priority_confirmation|       notify|   problem_id|rfc|vendor|caused_by|closed_code|    resolved_by|    resolved_at|     closed_at|
+----------+--------------+------+----------------

In [3]:
from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

import pyspark.sql.functions as f

df=df.withColumn('resolved_ts',to_timestamp(df.resolved_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('opened_ts',to_timestamp(df.opened_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('sys_created_ts',to_timestamp(df.sys_created_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('sys_updated_ts',to_timestamp(df.sys_updated_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('closed_ts',to_timestamp(df.closed_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('resolved',to_date(df.resolved_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('opened',to_date(df.opened_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('sys_created',to_date(df.sys_created_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('sys_updated',to_date(df.sys_updated_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('closed',to_date(df.closed_at, 'dd/MM/yyyy HH:mm')).\
        withColumn('knowledge', f.col('knowledge').cast('string')).\
        replace(['TRUE',], 'True', subset='knowledge').\
        replace(['FALSE'], 'False', subset='knowledge').\
        withColumn('resolved_duration',datediff(to_date(df.resolved_at, 'dd/MM/yyyy HH:mm'),\
                                                to_date(df.opened_at, 'dd/MM/yyyy HH:mm'))).\
        withColumn('closed_duration',datediff(to_date(df.closed_at, 'dd/MM/yyyy HH:mm'),\
                                                to_date(df.opened_at, 'dd/MM/yyyy HH:mm'))).\
        withColumn('made_sla_int',df.made_sla.cast('integer'))

In [4]:
# The data set has multiple states(New, Active, Awaiting user info, Resolved, Closed etc. ) of an incident. With the help 
# of the below command, we are just filtering one record per incident, that has the truly closed state of the incident. 

spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

df_unique_incidents=df.filter("incident_state=='Closed'").\
    sort("sys_mod_count",ascending=False).\
    dropDuplicates(["number"])


df_unique_incidents.show(5)

[Stage 6:>                                                          (0 + 4) / 4]

+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+--------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+---+------+---------+-----------+---------------+---------------+--------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------+----------+-----------+-----------+----------+-----------------+---------------+------------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by|sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| a

                                                                                

Select the dependent and the independent variables that are identified as most useful attributes to make predictions.

In [5]:
data=df_unique_incidents.select([
    'sys_mod_count',
    'opened_by',
    'location',
    'category',
    'priority',
    'assignment_group',
    'knowledge',
    'resolved_duration',
    'closed_duration',
    'made_sla_int'
    ]
)

data=data.dropna()

Create a 70-30 train test split

In [6]:
train_data,test_data=data.randomSplit([0.7,0.3])

## Building Logistic Model

In [7]:
# Import the required libraries
 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler,StringIndexer ,OneHotEncoder
from pyspark.ml import Pipeline

Use StringIndexer to convert the categorical columns to hold numerical data

In [8]:
opened_by_indexer = StringIndexer(inputCol='opened_by',outputCol='opened_by_index',handleInvalid='keep')
location_indexer = StringIndexer(inputCol='location',outputCol='location_index',handleInvalid='keep')
category_indexer = StringIndexer(inputCol='category',outputCol='category_index',handleInvalid='keep')
priority_indexer = StringIndexer(inputCol='priority',outputCol='priority_index',handleInvalid='keep')
assignment_group_indexer = StringIndexer(inputCol='assignment_group',outputCol='assignment_group_index',handleInvalid='keep')
knowledge_indexer = StringIndexer(inputCol='knowledge',outputCol='knowledge_index',handleInvalid='keep')

OneHotEncoderEstimator converts the indexed data into a vector which will be effectively handled by Logistic Regression model.

In [9]:
data_encoder = OneHotEncoder(
    inputCols=[
        'opened_by_index',
        'location_index',
        'category_index',
        'priority_index',
        'assignment_group_index',
        'knowledge_index'
    ], 
    outputCols= [
        'opened_by_vec',
        'location_vec',
        'category_vec',
        'priority_vec',
        'assignment_group_vec',
        'knowledge_vec'],
    handleInvalid='keep'
)

Vector assembler is used to create a vector of input features

In [10]:
assembler = VectorAssembler(
    inputCols=[
        "opened_by_vec",
        'location_vec',
        'category_vec',
        'priority_vec',
        'assignment_group_vec',
        'knowledge_vec'
        ],
    outputCol="features"
)

Create an object for the Logistic Regression model

In [11]:
lr_model = LogisticRegression(labelCol='made_sla_int')

Pipeline is used to pass the data through indexer and assembler simultaneously. Also, it helps to pre-rocess the test data in the same way as that of the train data. It also 

In [12]:
pipe = Pipeline(
    stages=[
        opened_by_indexer,
        location_indexer,
        category_indexer,
        priority_indexer,
        assignment_group_indexer,
        knowledge_indexer,
        data_encoder,
        assembler,
        lr_model
    ]
)
  

In [13]:
# run the pipeline
fit_model=pipe.fit(train_data)

# Store the results in a dataframe
results = fit_model.transform(test_data)

                                                                                

In [14]:
results.select(['made_sla_int','prediction']).show()

+------------+----------+
|made_sla_int|prediction|
+------------+----------+
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           0|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           1|       1.0|
|           0|       0.0|
+------------+----------+
only showing top 20 rows



## Model Evaluation

### Area under the ROC

In [15]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

AUC_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='made_sla_int',metricName='areaUnderROC')

AUC = AUC_evaluator.evaluate(results)

In [16]:
print("The area under the curve is {}".format(AUC))

The area under the curve is 0.7281694544532962


A roughly 73% area under ROC denotes the model has performed reasonably well in predicting whether an incident has met the sla.

### Area under the PR

In [17]:
PR_evaluator = BinaryClassificationEvaluator(rawPredictionCol='prediction',labelCol='made_sla_int',metricName='areaUnderPR')
PR = PR_evaluator.evaluate(results)

In [18]:
print("The area under the PR curve is {}".format(PR))

The area under the PR curve is 0.7610424679464421


### Accuracy

In [19]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

ACC_evaluator = MulticlassClassificationEvaluator(  #  Multiclass or Binary, the accuracy is calculated in the same way.
    labelCol="made_sla_int", predictionCol="prediction", metricName="accuracy")

accuracy = ACC_evaluator.evaluate(results)

In [20]:
print("The accuracy of the model is {}".format(accuracy))

The accuracy of the model is 0.7578571428571429


### Confusion Matrix

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
y_true = results.select("made_sla_int")
y_true = y_true.toPandas()
 
y_pred = results.select("prediction")
y_pred = y_pred.toPandas()
 
cnf_matrix = confusion_matrix(y_true, y_pred)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

In [23]:
print("Below is the confusion matrix \n {}".format(cnf_matrix))

Below is the confusion matrix 
 [[1616 1088]
 [ 607 3689]]


In [24]:
tn = cnf_matrix[0][0]
fp = cnf_matrix[0][1]
fn = cnf_matrix[1][0]
tp = cnf_matrix[1][1]

accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1_score = 2*(precision*recall)/(precision+recall)

In [25]:
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1_score:.2f}")


Accuracy: 0.76
Precision: 0.77
Recall: 0.86
F1 Score: 0.81


In [26]:
spark.stop()