# EDA for Higgs Boson Dataset

## Import packages

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
import os

## Create SparkContext and SparkSession

In [2]:
# Reuse the already-running SparkContext if there is one, otherwise start it.
sc = SparkContext.getOrCreate()
# Obtain the session through the builder, which is the documented entry point,
# instead of calling the SparkSession constructor directly. getOrCreate()
# returns the existing session when there is one, so re-running this cell is
# safe, and it attaches to the context obtained above.
spark = SparkSession.builder.getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/09 05:37:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Read the CSV, transform it and load it into a DataFrame

In [3]:
RAW_DATA_FOLDER = "raw_data"


def _parse_higgs_row(row):
    """Turn one comma-separated text line into a list of numbers.

    The first field is passed through float() and then int(); every
    remaining field is converted with float().
    """
    fields = row.split(",")
    return [int(float(fields[0]))] + [float(value) for value in fields[1:]]


# Read the file as plain text lines and parse each line into numeric fields.
rdd = sc.textFile(os.path.join(RAW_DATA_FOLDER, "HIGGS.csv")).map(_parse_higgs_row)

# Column names, in file order: the label column followed by 28 feature columns.
# "pT" is capitalised identically in every column name so that the names match
# the spelling used by the analysis cells exactly, rather than relying on
# Spark's default case-insensitive column resolution.
df = rdd.toDF(
    schema=[
        "signal",
        "lepton pT",
        "lepton eta",
        "lepton phi",
        "missing energy magnitude",
        "missing energy phi",
        "jet 1 pT",
        "jet 1 eta",
        "jet 1 phi",
        "jet 1 b-tag",
        "jet 2 pT",
        "jet 2 eta",
        "jet 2 phi",
        "jet 2 b-tag",
        "jet 3 pT",
        "jet 3 eta",
        "jet 3 phi",
        "jet 3 b-tag",
        "jet 4 pT",
        "jet 4 eta",
        "jet 4 phi",
        "jet 4 b-tag",
        "m_jj",
        "m_jjj",
        "m_lv",
        "m_jlv",
        "m_bb",
        "m_wbb",
        "m_wwbb",
    ]
)

                                                                                

## Feature Analysis

In [4]:
# Count the rows once and keep the result; the percentage calculations in the
# following cells divide by this value.
total_rows = df.count()
summary_line = f"Total instances - {total_rows}"
print(summary_line)

24/04/09 05:37:57 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
24/04/09 05:38:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors

Total instances - 11000000


                                                                                

In [5]:
col_name = "lepton pT"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (lepton pT > 4.0) - 17333
Percentage of instances with (lepton pT > 4.0) - 0.1576%


                                                                                

In [6]:
col_name = "missing energy magnitude"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (missing energy magnitude > 4.0) - 20013
Percentage of instances with (missing energy magnitude > 4.0) - 0.1819%


                                                                                

In [7]:
col_name = "jet 1 pT"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (jet 1 pT > 4.0) - 5647
Percentage of instances with (jet 1 pT > 4.0) - 0.0513%


                                                                                

In [8]:
# List the distinct values found in this column.
# It is a categorical variable: replace the values with 0, 1, 2 respectively.
jet_1_b_tag_values = df.select("jet 1 b-tag").distinct()
jet_1_b_tag_values.show()

                                                                                

+------------------+
|       jet 1 b-tag|
+------------------+
|               0.0|
|1.0865380764007568|
|2.1730761528015137|
+------------------+



In [9]:
col_name = "jet 2 pT"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (jet 2 pT > 4.0) - 13698
Percentage of instances with (jet 2 pT > 4.0) - 0.1245%


                                                                                

In [10]:
# List the distinct values found in this column.
# It is a categorical variable: replace the values with 0, 1, 2 respectively.
jet_2_b_tag_values = df.select("jet 2 b-tag").distinct()
jet_2_b_tag_values.show()



+------------------+
|       jet 2 b-tag|
+------------------+
|1.1074360609054565|
|               0.0|
| 2.214872121810913|
+------------------+



                                                                                

In [11]:
col_name = "jet 3 pT"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (jet 3 pT > 4.0) - 9826
Percentage of instances with (jet 3 pT > 4.0) - 0.0893%


                                                                                

In [12]:
# List the distinct values found in this column.
# It is a categorical variable: replace the values with 0, 1, 2 respectively.
jet_3_b_tag_values = df.select("jet 3 b-tag").distinct()
jet_3_b_tag_values.show()



+------------------+
|       jet 3 b-tag|
+------------------+
| 2.548224449157715|
|               0.0|
|1.2741122245788574|
+------------------+



                                                                                

In [13]:
col_name = "jet 4 pT"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (jet 4 pT > 4.0) - 9646
Percentage of instances with (jet 4 pT > 4.0) - 0.0877%


                                                                                

In [14]:
# List the distinct values found in this column.
# It is a categorical variable: replace the values with 0, 1, 2 respectively.
jet_4_b_tag_values = df.select("jet 4 b-tag").distinct()
jet_4_b_tag_values.show()



+------------------+
|       jet 4 b-tag|
+------------------+
|               0.0|
| 3.101961374282837|
|1.5509806871414185|
+------------------+



                                                                                

In [15]:
col_name = "m_jj"
limit = 7.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 7.0 to 7.0



Count of instances with (m_jj > 7.0) - 17754
Percentage of instances with (m_jj > 7.0) - 0.1614%


                                                                                

In [16]:
col_name = "m_jjj"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (m_jjj > 4.0) - 19962
Percentage of instances with (m_jjj > 4.0) - 0.1815%


                                                                                

In [23]:
col_name = "m_lv"
limit = 2.5
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 2.5 to 2.5



Count of instances with (m_lv > 2.5) - 10620
Percentage of instances with (m_lv > 2.5) - 0.0965%


                                                                                

In [18]:
col_name = "m_jlv"
limit = 4.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.0 to 4.0



Count of instances with (m_jlv > 4.0) - 10752
Percentage of instances with (m_jlv > 4.0) - 0.0977%


                                                                                

In [22]:
col_name = "m_bb"
limit = 4.5
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 4.5 to 4.5



Count of instances with (m_bb > 4.5) - 13386
Percentage of instances with (m_bb > 4.5) - 0.1217%


                                                                                

In [20]:
col_name = "m_wbb"
limit = 3.5
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 3.5 to 3.5



Count of instances with (m_wbb > 3.5) - 14441
Percentage of instances with (m_wbb > 3.5) - 0.1313%


                                                                                

In [21]:
col_name = "m_wwbb"
limit = 3.0
above = True  # True: count values strictly above `limit`; False: strictly below
# Pick the comparison for the chosen direction, then count the matching rows.
column = df[col_name]
outlier_condition = (column > limit) if above else (column < limit)
count = df.select(col_name).where(outlier_condition).count()
print(
    f"Count of instances with ({col_name} {'>' if above else '<'} {limit}) - {count}"
)
print(
    f"Percentage of instances with ({col_name} {'>' if above else '<'} {limit}) - {(count * 100.0/ total_rows):.4f}%"
)
# Preprocessing note: clip values above 3.0 to 3.0



Count of instances with (m_wwbb > 3.0) - 14854
Percentage of instances with (m_wwbb > 3.0) - 0.1350%


                                                                                