In [22]:
pip install -r requirements.txt

Collecting imblearn (from -r requirements.txt (line 7))
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn->-r requirements.txt (line 7))
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
   ---------------------------------------- 0.0/240.0 kB ? eta -:--:--
   ---------------------------------------- 0.0/240.0 kB ? eta -:--:--
   - -------------------------------------- 10.2/240.0 kB ? eta -:--:--
   ------------- ------------------------- 81.9/240.0 kB 919.0 kB/s eta 0:00:01
   ---------------------------------------  235.5/240.0 kB 2.1 MB/s eta 0:00:01
   ---------------------------------------- 240.0/240.0 kB 1.6 MB/s eta 0:00:00
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.14.0 imblearn-0.0
Note: you may need to restart the kerne


[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\VeeneetKumar\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

Dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

Donated by: P. Savicky, Institute of Computer Science, AS of CR, Czech Republic (savicky '@' cs.cas.cz)

In [4]:
from pathlib import Path
from zipfile import ZipFile

# Unpack the downloaded dataset archive into a "magic_extracted" folder that
# sits next to the zip file.
zip_path = Path(r"Magic-dataset-24-sep-25/magic+gamma+telescope.zip")
out_dir = zip_path.with_name("magic_extracted")
out_dir.mkdir(parents=True, exist_ok=True)  # safe to re-run: existing folder is fine

with ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=out_dir)

print("Extracted to:", out_dir)

Extracted to: Magic-dataset-24-sep-25\magic_extracted


In [5]:
# The data file has no header row, so the column names are supplied here.
# The ten feature columns come first; the label column ("class") is last.
feature_names = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
]
cols = feature_names + ["class"]

data_path = "Magic-dataset-24-sep-25/magic_extracted/magic04.data"
df = pd.read_csv(data_path, names=cols, header=None)

# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [6]:
# Encode the label as an integer: 'g' -> 1, anything else -> 0.
# The dtype guard makes this cell safe to re-run: without it, a second run
# would compare the already-encoded integers to 'g' and turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['class']):
    df['class'] = (df['class'] == 'g').astype(int)

In [7]:
# Preview the first five rows again to inspect the label column after encoding.
df.head(n=5)

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [None]:
# One figure per feature column, overlaying the distribution of the two classes
# (class == 1 is plotted as "gamma", class == 0 as "hadron").
for feature in cols[:-1]:
    gamma_values = df.loc[df['class'] == 1, feature]
    hadron_values = df.loc[df['class'] == 0, feature]

    plt.figure(figsize=(10, 6))
    # density=True normalises each histogram to unit area. The classes have
    # different sizes, so raw counts would not be comparable and would not
    # match a probability-style y-axis.
    plt.hist(gamma_values, bins=20, alpha=0.5, density=True, label='gamma', color='blue')
    plt.hist(hadron_values, bins=20, alpha=0.5, density=True, label='hadron', color='red')
    plt.title(f'Histogram of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Probability density')
    plt.legend()
    plt.show()

Train, validation, test datasets

In [10]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every metric
# computed from it) is reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled = df.sample(frac=1, random_state=SPLIT_SEED)

train_end = int(0.6 * len(shuffled))
valid_end = int(0.8 * len(shuffled))

# Plain positional slicing avoids np.split on a DataFrame, which relies on the
# deprecated DataFrame.swapaxes.
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

# The three parts must cover every row exactly once.
assert len(train) + len(valid) + len(test) == len(df)

In [11]:
def _is_fitted(transformer):
    """Return True if ``transformer`` already carries learned attributes.

    Uses the scikit-learn naming convention: attributes set during ``fit``
    end with a single trailing underscore (e.g. ``mean_``).
    """
    return any(
        name.endswith("_") and not name.startswith("__")
        for name in vars(transformer)
    )


def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Standardise the feature columns of ``dataframe`` and optionally oversample.

    The last column is treated as the label; all preceding columns are features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column holds the label.
    oversample : bool, default False
        If True, resample with ``RandomOverSampler`` after scaling.
    scaler : object with ``fit_transform`` / ``transform``, optional
        Scaler to share between calls. If it has not been fitted yet it is
        fitted on this frame; if it is already fitted it is only applied.
        Pass the same instance for train, validation and test so that all
        splits are scaled with the training statistics. When omitted, a fresh
        ``StandardScaler`` is fitted on ``dataframe`` (the original behaviour).
    random_state : int or None, default None
        Seed forwarded to ``RandomOverSampler`` for reproducible resampling.

    Returns
    -------
    data : numpy.ndarray
        Scaled features with the label appended as the last column.
    x : numpy.ndarray
        Scaled (and possibly resampled) feature matrix.
    y : numpy.ndarray
        Label vector aligned with ``x``.
    """
    x = dataframe.iloc[:, :-1].values
    y = dataframe.iloc[:, -1].values

    if scaler is None:
        # Backward-compatible default: every call fits its own scaler.
        x = StandardScaler().fit_transform(x)
    elif _is_fitted(scaler):
        # Reuse statistics learned earlier (typically on the training split).
        x = scaler.transform(x)
    else:
        x = scaler.fit_transform(x)

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        x, y = ros.fit_resample(x, y)

    # Re-attach the label as a trailing column so callers get one 2-D array too.
    data = np.hstack((x, np.reshape(y, (-1, 1))))
    return data, x, y

In [12]:
# Class balance of the training split before any oversampling.
n_gamma = (train['class'] == 1).sum()
n_hadron = (train['class'] == 0).sum()

print(n_gamma)   # gamma
print(n_hadron)  # hadron

7378
4034


In [13]:
# Scale every split; only the training split is oversampled, so validation and
# test keep their original class proportions.
# NOTE: scale_dataset returns NumPy arrays, so `train`, `valid` and `test` stop
# being DataFrames after this cell. Re-run the split cell before re-running it.
train, x_train, y_train = scale_dataset(train, oversample=True)
valid, x_valid, y_valid = scale_dataset(valid)
test, x_test, y_test = scale_dataset(test)

In [14]:
# Number of training labels after oversampling.
y_train.shape[0]

14756

In [15]:
# Number of training samples labelled 1 after oversampling.
(y_train == 1).sum()

np.int64(7378)

In [16]:
# Number of training samples labelled 0 after oversampling.
(y_train == 0).sum()

np.int64(7378)

KNN


In [23]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report


In [30]:
# Fit a 5-nearest-neighbours classifier on the scaled, oversampled training data.
# fit() returns the estimator itself, so construction and fitting can be chained.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
knn_model

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [31]:
# Predict labels for the held-out test features with the fitted KNN model.
y_pred = knn_model.predict(X=x_test)

In [32]:
# Show the KNN predictions for the test split.
y_pred

array([1, 1, 1, ..., 1, 1, 0], shape=(3804,))

In [33]:
# Show the true test labels for comparison with the predictions.
y_test

array([0, 1, 1, ..., 1, 1, 1], shape=(3804,))

In [34]:
# Per-class precision / recall / F1 of the KNN predictions on the test split.
knn_report = classification_report(y_true=y_test, y_pred=y_pred)
print(knn_report)

              precision    recall  f1-score   support

           0       0.74      0.75      0.74      1299
           1       0.87      0.87      0.87      2505

    accuracy                           0.82      3804
   macro avg       0.80      0.81      0.80      3804
weighted avg       0.82      0.82      0.82      3804



Naive Bayes

In [35]:
from sklearn.naive_bayes import GaussianNB


In [36]:
# Fit a Gaussian naive Bayes classifier (default settings) on the training data.
# fit() returns the estimator itself, so construction and fitting can be chained.
nb_model = GaussianNB().fit(x_train, y_train)
nb_model

0,1,2
,priors,
,var_smoothing,1e-09


In [37]:
# Predict labels for the held-out test features with the naive Bayes model.
y_nbpred = nb_model.predict(X=x_test)

In [38]:
# Per-class precision / recall / F1 of the naive Bayes predictions on the test split.
nb_report = classification_report(y_true=y_test, y_pred=y_nbpred)
print(nb_report)

              precision    recall  f1-score   support

           0       0.68      0.40      0.51      1299
           1       0.74      0.90      0.82      2505

    accuracy                           0.73      3804
   macro avg       0.71      0.65      0.66      3804
weighted avg       0.72      0.73      0.71      3804



Logistic Regression

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
# Fit a logistic regression classifier (default settings) on the training data.
# fit() returns the estimator itself, so construction and fitting can be chained.
lg_model = LogisticRegression().fit(x_train, y_train)
lg_model

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [41]:
# Predict labels for the held-out test features with the logistic regression model.
y_lgpred = lg_model.predict(X=x_test)

In [42]:
# Per-class precision / recall / F1 of the logistic regression predictions on the test split.
lg_report = classification_report(y_true=y_test, y_pred=y_lgpred)
print(lg_report)

              precision    recall  f1-score   support

           0       0.68      0.73      0.70      1299
           1       0.85      0.82      0.84      2505

    accuracy                           0.79      3804
   macro avg       0.77      0.78      0.77      3804
weighted avg       0.80      0.79      0.79      3804



SVM

In [44]:
from sklearn.svm import SVC

In [45]:
# Fit a support vector classifier (default settings) on the training data.
# fit() returns the estimator itself, so construction and fitting can be chained.
svc_model = SVC().fit(x_train, y_train)
svc_model

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [46]:
# Predict labels for the held-out test features with the fitted SVC model.
y_svcpred = svc_model.predict(X=x_test)

In [47]:
# Per-class precision / recall / F1 of the SVC predictions on the test split.
svc_report = classification_report(y_true=y_test, y_pred=y_svcpred)
print(svc_report)

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      1299
           1       0.90      0.90      0.90      2505

    accuracy                           0.87      3804
   macro avg       0.85      0.85      0.85      3804
weighted avg       0.87      0.87      0.87      3804

