In [1]:
from os.path import abspath

from pyspark.sql import SparkSession

# Directory used as the default location for managed databases and tables.
warehouse_location = abspath('spark-warehouse')

# Build (or reuse) a local, Hive-enabled session that uses every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("ISM6562 PySpark Tutorials")
    .config("spark.sql.warehouse.dir", warehouse_location)
    .enableHiveSupport()
    .getOrCreate()
)

# The SparkContext is the entry point to the Spark API; it is created together
# with the SparkSession, so we simply grab the existing one.
sc = spark.sparkContext

# Note: when several Spark sessions are running (for example one left over from
# a previously executed notebook), this session's web UI is served on a port
# other than the default (4040). The line below extracts the port actually in
# use: 4040 means this is the only running session, a higher number means other
# Spark sessions are still running.
spark_session_port = sc.uiWebUrl.rsplit(":", 1)[-1]
print("Spark Session WebUI Port: " + spark_session_port)

23/10/26 15:17:35 WARN Utils: Your hostname, localhost.localdomain resolves to a loopback address: 127.0.0.1; using 10.21.5.100 instead (on interface eth0)
23/10/26 15:17:35 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/26 15:17:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session WebUI Port: 4040


In [2]:
# List the tables visible to this session before any new data is loaded.
spark.sql("show tables").show()

23/10/26 15:17:39 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/10/26 15:17:39 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/10/26 15:17:43 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/10/26 15:17:43 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore student@127.0.0.1
23/10/26 15:17:43 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException


+---------+------------+-----------+
|namespace|   tableName|isTemporary|
+---------+------------+-----------+
|  default|fake_friends|      false|
|  default|   incidents|      false|
|  default|movieratings|      false|
|  default|      movies|      false|
+---------+------------+-----------+



In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Load the incident event log from CSV. The first line of the file is used as
# the header and column types are inferred from the data (inferSchema=True),
# so no explicit schema is passed to the reader. The previously defined
# movie-style `schema` was never used by this read and has been dropped.
df = spark.read.csv('data/incident_event_log_reduced.csv', header=True, inferSchema=True)

# display the first 5 rows of the dataframe
df.show(5)

23/10/26 15:17:46 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+--------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+---+------+---------+-----------+---------------+---------------+--------------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by|sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| assigned_to|knowledge|u_priority_confirmation|       notify|   problem_id|rfc|vendor|caused_by|closed_code|    resolved_by|    resolved_at|     closed_at|
+----------+--------------+------+----------------

In [4]:
from pyspark.sql.functions import datediff,date_format,to_date,to_timestamp

# The *_at columns are strings in 'd/M/yyyy H:m' form. For each of them add:
#   <event>_ts : the value parsed as a timestamp
#   <event>    : the value parsed as a date (time of day dropped)
# and finally `duration`, the number of days between the opened date and the
# resolved date.
df = (
    df
    .withColumn('resolved_ts', to_timestamp(df['resolved_at'], 'd/M/yyyy H:m'))
    .withColumn('opened_ts', to_timestamp(df['opened_at'], 'd/M/yyyy H:m'))
    .withColumn('sys_created_ts', to_timestamp(df['sys_created_at'], 'd/M/yyyy H:m'))
    .withColumn('sys_updated_ts', to_timestamp(df['sys_updated_at'], 'd/M/yyyy H:m'))
    .withColumn('closed_ts', to_timestamp(df['closed_at'], 'd/M/yyyy H:m'))
    .withColumn('resolved', to_date(df['resolved_at'], 'd/M/yyyy H:m'))
    .withColumn('opened', to_date(df['opened_at'], 'd/M/yyyy H:m'))
    .withColumn('sys_created', to_date(df['sys_created_at'], 'd/M/yyyy H:m'))
    .withColumn('sys_updated', to_date(df['sys_updated_at'], 'd/M/yyyy H:m'))
    .withColumn('closed', to_date(df['closed_at'], 'd/M/yyyy H:m'))
    .withColumn(
        'duration',
        datediff(
            to_date(df['resolved_at'], 'd/M/yyyy H:m'),
            to_date(df['opened_at'], 'd/M/yyyy H:m'),
        ),
    )
)
df.show()

+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+---------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+----------+------+---------+-----------+---------------+---------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------+----------+-----------+-----------+----------+--------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by| sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| assigned_to|knowledge|u_prior

In [5]:
# Display the DataFrame again, now including the derived timestamp, date and
# duration columns (same call as the last line of the previous cell).
df.show()

+----------+--------------+------+------------------+------------+-------------+--------+-----------+--------------+---------------+--------------+---------------+--------------+---------------+------------+------------+-----------+---------------+-----------+-------+----------+----------+------------+----------------+------------+---------+-----------------------+-------------+-------------+----------+------+---------+-----------+---------------+---------------+---------------+-------------------+-------------------+-------------------+-------------------+-------------------+----------+----------+-----------+-----------+----------+--------+
|    number|incident_state|active|reassignment_count|reopen_count|sys_mod_count|made_sla|  caller_id|     opened_by|      opened_at|sys_created_by| sys_created_at|sys_updated_by| sys_updated_at|contact_type|    location|   category|    subcategory|  u_symptom|cmdb_ci|    impact|   urgency|    priority|assignment_group| assigned_to|knowledge|u_prior

In [6]:
# Persist the enriched incidents as the managed table "incidents",
# replacing any table of that name left over from an earlier run.
df.write.mode('overwrite').saveAsTable("incidents")

23/10/26 15:17:50 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/10/26 15:17:50 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/10/26 15:17:50 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/10/26 15:17:50 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


### Create a temp view called IM for all closed incidents.

This temp view will be used to query the data in the following sections.

In [7]:
from pyspark.sql import Window
from pyspark.sql.functions import col, row_number

# Keep exactly one row per closed incident: the row with the highest
# sys_mod_count. sort() followed by dropDuplicates() does not guarantee which
# of the duplicate rows survives, so the rows are ranked explicitly within each
# incident number and only rank 1 is kept.
# NOTE(review): rows tied on sys_mod_count for the same number are still picked
# arbitrarily; add a tie-breaker column to the ordering if that matters.
latest_update_first = Window.partitionBy("number").orderBy(col("sys_mod_count").desc())

df_unique_incidents = (
    df.filter("incident_state=='Closed'")
    .withColumn("_update_rank", row_number().over(latest_update_first))
    .filter(col("_update_rank") == 1)
    .drop("_update_rank")  # helper column only; keep the original schema
)

In [8]:
# Register the de-duplicated closed incidents as the temp view "IM" so the
# SQL queries below can refer to it by name.
df_unique_incidents.createOrReplaceTempView("IM")

In [9]:
# Inspect the column names and data types of the IM view (up to 50 rows shown).
spark.sql("describe IM").show(50)

+--------------------+---------+-------+
|            col_name|data_type|comment|
+--------------------+---------+-------+
|              number|   string|   NULL|
|      incident_state|   string|   NULL|
|              active|  boolean|   NULL|
|  reassignment_count|      int|   NULL|
|        reopen_count|      int|   NULL|
|       sys_mod_count|      int|   NULL|
|            made_sla|  boolean|   NULL|
|           caller_id|   string|   NULL|
|           opened_by|   string|   NULL|
|           opened_at|   string|   NULL|
|      sys_created_by|   string|   NULL|
|      sys_created_at|   string|   NULL|
|      sys_updated_by|   string|   NULL|
|      sys_updated_at|   string|   NULL|
|        contact_type|   string|   NULL|
|            location|   string|   NULL|
|            category|   string|   NULL|
|         subcategory|   string|   NULL|
|           u_symptom|   string|   NULL|
|             cmdb_ci|   string|   NULL|
|              impact|   string|   NULL|
|             ur

### 1. Top 5 People with the most resolved incidents

In [10]:
# Top 5 resolvers by number of closed incidents.
# The result is stored under its own name instead of `df`, so the incident
# DataFrame held in `df` is not replaced by this 5-row aggregate (re-running
# the earlier transform/save cells would otherwise operate on the wrong data).
top_resolvers = spark.sql(" select resolved_by,count(number) as Incidents_Resolved from IM \
            group by resolved_by order by Incidents_Resolved desc limit 5")
top_resolvers.show()

+---------------+------------------+
|    resolved_by|Incidents_Resolved|
+---------------+------------------+
| Resolved by 11|              3071|
| Resolved by 15|              2415|
|Resolved by 103|               689|
|Resolved by 177|               686|
| Resolved by 32|               597|
+---------------+------------------+



### 2. Ranked by lowest average duration, find the top 5 people with the maximum number of incidents resolved

In [11]:
# Per resolver: incident count and mean duration (days). Rows are ordered by
# shortest average duration first, ties broken by the larger incident count,
# and only the first 5 are shown.
spark.sql(
    " select resolved_by,count(number) as Incidents_Resolved,mean(duration) as Average_Duration from IM "
    "            group by resolved_by order by Average_Duration asc,Incidents_Resolved desc limit 5"
).show()

+---------------+------------------+----------------+
|    resolved_by|Incidents_Resolved|Average_Duration|
+---------------+------------------+----------------+
| Resolved by 10|                 4|             0.0|
| Resolved by 94|                 4|             0.0|
| Resolved by 26|                 2|             0.0|
|Resolved by 145|                 2|             0.0|
|Resolved by 219|                 1|             0.0|
+---------------+------------------+----------------+



### 3. Top 5 people with the most high-impact incidents resolved

In [12]:
# Count incidents per resolver, restricted to impact '1 - High', and show the
# 5 resolvers with the highest counts.
spark.sql(
    " select resolved_by,count(number) as Incidents_Resolved from IM "
    "            where impact = '1 - High' group by resolved_by order by Incidents_Resolved desc limit 5"
).show()

+---------------+------------------+
|    resolved_by|Incidents_Resolved|
+---------------+------------------+
| Resolved by 98|                20|
|Resolved by 137|                17|
| Resolved by 11|                15|
|Resolved by 165|                13|
|Resolved by 111|                12|
+---------------+------------------+



### 4a. For each impact level, find the person with the most incidents resolved

In [13]:
# Inner query: incident count per (impact, resolver), ranked within each impact
# level by that count (highest first). Outer query: keep only the rank-1
# resolver of every impact level.
spark.sql(
    " select impact,resolved_by,Incidents_Resolved from ("
    "            select impact,resolved_by,count(number) as Incidents_Resolved,"
    "            row_number() over (partition by impact order by count(number) desc) as row_number from IM "
    "            group by impact,resolved_by order by impact asc, Incidents_Resolved desc) as rows "
    "            where row_number = 1"
).show()

+----------+--------------+------------------+
|    impact|   resolved_by|Incidents_Resolved|
+----------+--------------+------------------+
|  1 - High|Resolved by 98|                20|
|2 - Medium|Resolved by 11|              3045|
|   3 - Low|Resolved by 66|               194|
+----------+--------------+------------------+



### 4b. For each urgency level, find the person with the most incidents resolved

In [14]:
# Inner query: incident count per (urgency, resolver), ranked within each
# urgency level by that count (highest first). Outer query: keep only the
# rank-1 resolver of every urgency level.
spark.sql(
    " select urgency,resolved_by,Incidents_Resolved from ("
    "            select urgency,resolved_by,count(number) as Incidents_Resolved,"
    "            row_number() over (partition by urgency order by count(number) desc) as row_number from IM "
    "            group by urgency,resolved_by order by urgency asc, Incidents_Resolved desc) as rows "
    "            where row_number = 1"
).show()

+----------+---------------+------------------+
|   urgency|    resolved_by|Incidents_Resolved|
+----------+---------------+------------------+
|  1 - High|Resolved by 166|                38|
|2 - Medium| Resolved by 11|              3047|
|   3 - Low| Resolved by 66|               195|
+----------+---------------+------------------+



### 4c. For each priority level, find the person with the most incidents resolved

In [15]:
# Inner query: incident count per (priority, resolver), ranked within each
# priority level by that count (highest first). Outer query: keep only the
# rank-1 resolver of every priority level.
spark.sql(
    " select priority,resolved_by,Incidents_Resolved from ("
    "            select priority,resolved_by,count(number) as Incidents_Resolved,"
    "            row_number() over (partition by priority order by count(number) desc) as row_number from IM "
    "            group by priority,resolved_by order by priority asc, Incidents_Resolved desc) as rows "
    "            where row_number = 1"
).show()

+------------+---------------+------------------+
|    priority|    resolved_by|Incidents_Resolved|
+------------+---------------+------------------+
|1 - Critical| Resolved by 98|                16|
|    2 - High|Resolved by 166|                40|
|3 - Moderate| Resolved by 11|              3040|
|     4 - Low| Resolved by 66|               195|
+------------+---------------+------------------+



### 5. Find each contact type as a percentage of total incidents

In [16]:
# Incident count per contact type and its share of all incidents.
# sum(count(number)) over() is the grand total across every group.
# The percentage is cast to decimal(5,2): decimal(4,2) can hold at most 99.99,
# so a contact type accounting for 100% of incidents would overflow the cast.
spark.sql(" select contact_type,count(number) as Incidents_Reported,\
            cast(count(number)*100/sum(count(number)) over() as decimal(5,2)) as Percentage \
            from IM group by contact_type").show()

23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 1

+--------------+------------------+----------+
|  contact_type|Incidents_Reported|Percentage|
+--------------+------------------+----------+
|         Phone|             24688|     99.08|
|         Email|                59|      0.24|
|  Self service|               158|      0.63|
|           IVR|                 9|      0.04|
|Direct opening|                 4|      0.02|
+--------------+------------------+----------+



23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/10/26 15:18:00 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


### 6. For each priority level, find the percentage of incidents that made the SLA and the percentage that did not.

In [17]:
# For every priority level: number of incidents that made / missed the SLA and
# their share within that priority (window partitioned by priority).
# The percentage is cast to decimal(5,2): decimal(4,2) can hold at most 99.99,
# so a priority whose incidents all share one SLA outcome (exactly 100%) would
# overflow the cast.
spark.sql(" select priority,case when made_sla = 'false' then 'NO' else 'YES' end as made_sla,\
            count(number) as No_of_Incidents,\
            cast(count(number)*100/sum(count(number)) over(partition by priority) as decimal(5,2)) as Percentage \
            from IM group by priority,made_sla order by priority asc, made_sla desc").show()

+------------+--------+---------------+----------+
|    priority|made_sla|No_of_Incidents|Percentage|
+------------+--------+---------------+----------+
|1 - Critical|     YES|              5|      1.85|
|1 - Critical|      NO|            265|     98.15|
|    2 - High|     YES|              2|      0.49|
|    2 - High|      NO|            406|     99.51|
|3 - Moderate|     YES|          15145|     64.54|
|3 - Moderate|      NO|           8321|     35.46|
|     4 - Low|     YES|            651|     84.11|
|     4 - Low|      NO|            123|     15.89|
+------------+--------+---------------+----------+



### 7. Top 5 locations with the maximum number of incidents reported

In [18]:
# Incident count per location, showing the 5 locations with the most incidents.
# The ORDER BY now spells the alias exactly as declared (Incidents_Reported),
# so the query does not depend on case-insensitive identifier resolution.
spark.sql(" select location,count(number) as Incidents_Reported from IM \
            group by location order by Incidents_Reported desc limit 5").show()

+------------+------------------+
|    location|Incidents_Reported|
+------------+------------------+
|Location 204|              5554|
|Location 161|              4002|
|Location 143|              3276|
|Location 108|              2140|
| Location 93|              1934|
+------------+------------------+



### 8. Which category of issues missed meeting the SLA the most?

In [19]:
# Count the incidents that missed the SLA (made_sla = false) per category and
# show the 5 categories with the highest counts.
spark.sql(
    " select category,count(number) as No_Of_Incidents_missing_SLA from IM "
    "            where made_sla=false group by category order by No_Of_Incidents_missing_SLA desc limit 5"
).show()

+-----------+---------------------------+
|   category|No_Of_Incidents_missing_SLA|
+-----------+---------------------------+
|Category 46|                       1254|
|Category 26|                       1017|
|Category 53|                       1009|
|Category 42|                        689|
|Category 23|                        505|
+-----------+---------------------------+



In [20]:
# Stop the Spark session now that the analysis is finished.
spark.stop()