In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

The data set was generated by a Monte Carlo program, Corsika, described in:

    D. Heck et al., CORSIKA, A Monte Carlo code to simulate extensive air showers,

    Forschungszentrum Karlsruhe FZKA 6019 (1998).

http://rexa.info/paper?id=ac6e674e9af20979b23d3ed4521f1570765e8d68


```
    1.  fLength:  continuous  # major axis of ellipse [mm]

    2.  fWidth:   continuous  # minor axis of ellipse [mm]

    3.  fSize:    continuous  # 10-log of sum of content of all pixels [in #phot]

    4.  fConc:    continuous  # ratio of sum of two highest pixels over fSize  [ratio]

    5.  fConc1:   continuous  # ratio of highest pixel over fSize  [ratio]

    6.  fAsym:    continuous  # distance from highest pixel to center, projected onto major axis [mm]

    7.  fM3Long:  continuous  # 3rd root of third moment along major axis  [mm]

    8.  fM3Trans: continuous  # 3rd root of third moment along minor axis  [mm]

    9.  fAlpha:   continuous  # angle of major axis with vector to origin [deg]
    
   10.  fDist:    continuous  # distance from origin to center of ellipse [mm]

   11.  class:    g,h         # gamma (signal), hadron (background)
```

In [3]:
cols = ["fLength", "fWidth", "fSize", "fConc", "fConc1", "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist", "class"]
df = pd.read_csv("./magic-dataset/magic04.data", names=cols)
df.head(10)


Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g
5,51.624,21.1502,2.9085,0.242,0.134,50.8761,43.1887,9.8145,3.613,238.098,g
6,48.2468,17.3565,3.0332,0.2529,0.1515,8.573,38.0957,10.5868,4.792,219.087,g
7,26.7897,13.7595,2.5521,0.4236,0.2174,29.6339,20.456,-2.9292,0.812,237.134,g
8,96.2327,46.5165,4.154,0.0779,0.039,110.355,85.0486,43.1844,4.854,248.226,g
9,46.7619,15.1993,2.5786,0.3377,0.1913,24.7548,43.8771,-6.6812,7.875,102.251,g


In [5]:
df["class"].unique()

array(['g', 'h'], dtype=object)

In [7]:
# creates a boolean mask where it checks if each element in the "class" column
# is equal to "g" and then converts the resulting boolean values to integers
# (0 for False, 1 for True).
df["class"] == (df["class"] == "g").astype(int)

0        False
1        False
2        False
3        False
4        False
         ...  
19015    False
19016    False
19017    False
19018    False
19019    False
Name: class, Length: 19020, dtype: bool

In [8]:
#  DataFrame based on the "class" column being equal to "g" or "h"
filtered_df = df[df["class"].isin(["g", "h"])]
print(filtered_df)

        fLength    fWidth   fSize   fConc  fConc1     fAsym   fM3Long  \
0       28.7967   16.0021  2.6449  0.3918  0.1982   27.7004   22.0110   
1       31.6036   11.7235  2.5185  0.5303  0.3773   26.2722   23.8238   
2      162.0520  136.0310  4.0612  0.0374  0.0187  116.7410  -64.8580   
3       23.8172    9.5728  2.3385  0.6147  0.3922   27.2107   -6.4633   
4       75.1362   30.9205  3.1611  0.3168  0.1832   -5.5277   28.5525   
...         ...       ...     ...     ...     ...       ...       ...   
19015   21.3846   10.9170  2.6161  0.5857  0.3934   15.2618   11.5245   
19016   28.9452    6.7020  2.2672  0.5351  0.2784   37.0816   13.1853   
19017   75.4455   47.5305  3.4483  0.1417  0.0549   -9.3561   41.0562   
19018  120.5135   76.9018  3.9939  0.0944  0.0683    5.8043  -93.5224   
19019  187.1814   53.0014  3.2093  0.2876  0.1539 -167.3125 -168.4558   

       fM3Trans   fAlpha     fDist class  
0       -8.2027  40.0920   81.8828     g  
1       -9.9574   6.3609  205.2610   

In [9]:
# Get the unique values in the "class" column after this filtering
unique_classes = filtered_df["class"].unique()
print(unique_classes)

['g' 'h']
