In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# Dataset Information:

This dataset consists of Monte Carlo (MC) generated data simulating high-energy gamma particle registration in a Cherenkov gamma telescope using imaging techniques. The telescope detects gamma rays by capturing the Cherenkov radiation emitted by charged particles formed in electromagnetic showers initiated by gamma interactions in the atmosphere.

The recorded data include pulses from Cherenkov photons impacting the photomultiplier tubes arranged in a plane (the camera). Depending on the gamma energy, anywhere from a few hundred to 10,000 photons are collected, forming a shower image that helps distinguish between gamma-initiated showers (signal) and hadronic showers caused by cosmic rays (background).

After pre-processing, the shower image generally appears as an elongated cluster, with its long axis pointing toward the camera center if the telescope is aligned with a point source. A principal component analysis (PCA) is performed to determine correlation axes and define an ellipse, aiding in classification. Features such as Hillas parameters, asymmetry along the major axis, and cluster extent further assist in discrimination.

The data was produced by the Monte Carlo simulation program Corsika, detailed in:

D. Heck et al., CORSIKA: A Monte Carlo Code to Simulate Extensive Air Showers, Forschungszentrum Karlsruhe FZKA 6019 (1998).
http://rexa.info/paper?id=ac6e674e9af20979b23d3ed4521f1570765e8d68
Simulation parameters enabled the detection of events with **energies below 50 GeV**.

Source:

<https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope>

In [None]:
# Names applied to the loaded columns: ten shower-image features, then the label.
cols = [
    "fLength",
    "fWidth",
    "fSize",
    "fConc",
    "fConc1",
    "fAsym",
    "fM3Long",
    "fM3Trans",
    "fAlpha",
    "fDist",
    "class",
]

# NOTE(review): the data location is empty here; set it to the dataset file
# (path or URL) before running, otherwise the read below has nothing to load.
url = ""

# names=cols assigns the column labels explicitly instead of reading them from the file.
df = pd.read_csv(url, names=cols)
df.head()

In [None]:
# Show the distinct raw label values present in the "class" column.
pd.unique(df["class"])

In [None]:
# Encode the label as an integer: 1 where the raw value is "g", 0 otherwise.
# The guard keeps this cell idempotent: without it, re-running the cell would
# compare the already-encoded integers against "g" and silently set every
# label to 0.
if not pd.api.types.is_integer_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)
df.head()

Class Values
The only class values are either "g" for gamma particles or "h" for hadron particles. In order to assist in computation of our data, these will be converted to binary values, 0 for hadrons and 1 for gamma.

We will use the features of the DataFrame to determine whether a recorded particle is a gamma or a hadron particle.

In [None]:
# Split the rows by label once, then overlay both normalised histograms for
# every feature column (the trailing label column is skipped).
gamma_events = df[df["class"] == 1]
hadron_events = df[df["class"] == 0]
hist_style = {"alpha": 0.7, "density": True}

for feature in cols[:-1]:
    plt.hist(gamma_events[feature], color="blue", label="Gamma-Ray Particles", **hist_style)
    plt.hist(hadron_events[feature], color="red", label="Hadronic Particles", **hist_style)
    plt.title(feature)
    plt.ylabel("Probability")
    plt.xlabel(feature)
    plt.legend()
    plt.show()

Train, Validation and Test datasets

In [None]:
# Shuffle the rows once with a fixed seed so the split is reproducible on
# Restart & Run All, then cut at 60% / 80% to get train / validation / test.
# Plain iloc slicing is used instead of np.split, which on a DataFrame goes
# through the deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
val_end = int(0.8 * len(shuffled))

train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [None]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Standardise the feature columns and optionally oversample the classes.

    Every column except the last is treated as a feature; the last column is
    treated as the label.

    Args:
        dataframe: DataFrame whose last column holds the labels.
        oversample: When True, the scaled features and labels are resampled
            with ``RandomOverSampler`` before being returned.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, it is applied as-is, so validation/test data can be put on
            the same scale as the training data. When None (the default), a
            new ``StandardScaler`` is fitted on ``dataframe`` itself, which is
            the original behaviour.
        random_state: Forwarded to ``RandomOverSampler`` so the resampling can
            be made reproducible. None (the default) keeps it unseeded.

    Returns:
        A ``(data, X, y)`` tuple: ``X`` is the scaled feature matrix, ``y`` the
        label vector, and ``data`` is ``X`` with ``y`` appended as a final
        column.
    """
    X = dataframe.iloc[:, :-1].to_numpy()
    y = dataframe.iloc[:, -1].to_numpy()

    if scaler is None:
        # No scaler supplied: fit a fresh one on this data.
        X = StandardScaler().fit_transform(X)
    else:
        # Reuse the caller's statistics instead of re-fitting on this split.
        X = scaler.transform(X)

    if oversample:
        X, y = RandomOverSampler(random_state=random_state).fit_resample(X, y)

    # Labels become a column vector so they can sit next to the features.
    data = np.hstack((X, np.reshape(y, (-1, 1))))
    return data, X, y

In [None]:
# Class counts in the training split (labels: 1 = gamma, 0 = hadron).
# The "Gamma:" label now carries the same trailing colon as "Hadron:".
print("Gamma:", int((train["class"] == 1).sum()))
print("Hadron:", int((train["class"] == 0).sum()))

In [None]:
# Scale each split; only the training split is oversampled.
# NOTE(review): train/val/test are rebound from DataFrames to the arrays that
# scale_dataset returns, so this cell cannot be re-run without re-creating
# the splits first.
train, X_train, y_train = scale_dataset(dataframe=train, oversample=True)
val, X_val, y_val = scale_dataset(dataframe=val)
test, X_test, y_test = scale_dataset(dataframe=test)

In [None]:
# Size and class counts of the training labels (1 = gamma, 0 = hadron).
n_gamma = np.count_nonzero(y_train == 1)
n_hadron = np.count_nonzero(y_train == 0)

print("Total:", len(y_train))
print("Gamma:", n_gamma)
print("Hadron:", n_hadron)

K-Nearest Neighbors

In [None]:
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# Build a 5-neighbour classifier and fit it on the training features/labels.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict on the test features and print the per-class report against y_test.
y_predicts = knn_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_predicts))