In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

# DATASET
# https://archive.ics.uci.edu/ml/datasets/MAGIC+Gamma+Telescope

    1.  fLength:  continuous  # major axis of ellipse [mm]
    2.  fWidth:   continuous  # minor axis of ellipse [mm] 
    3.  fSize:    continuous  # 10-log of sum of content of all pixels [in #phot]
    4.  fConc:    continuous  # ratio of sum of two highest pixels over fSize  [ratio]
    5.  fConc1:   continuous  # ratio of highest pixel over fSize  [ratio]
    6.  fAsym:    continuous  # distance from highest pixel to center, projected onto major axis [mm]
    7.  fM3Long:  continuous  # 3rd root of third moment along major axis  [mm] 
    8.  fM3Trans: continuous  # 3rd root of third moment along minor axis  [mm]
    9.  fAlpha:   continuous  # angle of major axis with vector to origin [deg]
    10.  fDist:    continuous  # distance from origin to center of ellipse [mm]
    11.  class:    g,h         # gamma (signal), hadron (background)

    g = gamma (signal):     12332
    h = hadron (background): 6688

In [None]:
# Column names for the header-less data file (ten features followed by the label)
cols = [
    'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
    'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
    'class',
]
# Load the raw file; it carries no header row, so the names come from `cols`
df = pd.read_csv("magic04.data", header=None, names=cols)

In [None]:
# Show the distinct raw labels. A bare expression that is not the last
# statement of a cell is never displayed, so print it explicitly.
print(df["class"].unique())

# Encode the label as an integer: gamma ("g") -> 1, hadron ("h") -> 0.
# Guarded so the cell can be re-run safely: once the column is numeric,
# comparing it to "g" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["class"]):
    df["class"] = (df["class"] == "g").astype(int)

In [None]:
# Preview the first rows of the frame, including the "class" column
df.head()

In [None]:
# One figure per feature: overlay the normalised histograms of both classes
class_styles = ((1, 'blue', 'gamma'), (0, 'red', 'hadron'))
for label in cols[:-1]:
    for class_id, colour, class_name in class_styles:
        rows = df[df["class"] == class_id]
        plt.hist(rows[label], color=colour, label=class_name, alpha=0.7, density=True)
    plt.title(label)
    plt.xlabel(label)
    plt.ylabel("Probability")
    plt.legend()
    plt.show()

# Train, Validation, Test Datasets

In [None]:
# Shuffle the rows once, then cut them 60% / 20% / 20% into train / validation / test.
# A fixed seed makes the split reproducible on Restart & Run All, and plain
# positional slicing avoids np.split on a DataFrame, which relies on the
# deprecated DataFrame.swapaxes.
shuffled = df.sample(frac=1, random_state=42)
train_end, valid_end = int(0.6 * len(df)), int(0.8 * len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [None]:
# Function responsible for scaling the dataset
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
    """Standardise the features of ``dataframe`` and optionally rebalance its classes.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose last column is the label; every other column is a feature.
    oversample : bool, default False
        When True, the scaled data is resampled with ``RandomOverSampler``.
    scaler : object with a ``transform`` method, optional
        An already-fitted scaler (for example one fitted on the training
        features). When given it is applied as-is, so validation/test data can
        be scaled with the training statistics. When omitted, a new
        ``StandardScaler`` is fitted on ``dataframe`` itself.
    random_state : int or None, default None
        Forwarded to ``RandomOverSampler`` to make the resampling reproducible.

    Returns
    -------
    data : numpy.ndarray
        ``X`` with ``Y`` appended as the last column.
    X : numpy.ndarray
        Scaled (and possibly resampled) features.
    Y : numpy.ndarray
        Labels matching the rows of ``X``.
    """
    X = dataframe[dataframe.columns[:-1]].values  # every column except the last
    Y = dataframe[dataframe.columns[-1]].values   # the last column only

    if scaler is None:
        # No scaler supplied: fit one on this very frame
        X = StandardScaler().fit_transform(X)
    else:
        # Re-use the caller's statistics; never refit on this frame
        X = scaler.transform(X)

    if oversample:
        ros = RandomOverSampler(random_state=random_state)
        X, Y = ros.fit_resample(X, Y)

    # Labels become a column vector so they can be stacked next to the features
    data = np.hstack((X, np.reshape(Y, (-1, 1))))

    return data, X, Y

In [None]:
# print(len(train[train["class"]==1])) # Gamma
# print(len(train[train["class"]==0])) # Hadron

In [None]:
# Standardise all three splits. Only the training split is oversampled:
# validation and test keep their natural class balance so that the reported
# metrics can be trusted.
# NOTE(review): each call fits its own StandardScaler, so every split is scaled
# with its own statistics. The names train/valid/test are also rebound to arrays,
# so this cell cannot be re-run without re-running the split cell first.
train, X_train, Y_train = scale_dataset(train, oversample=True)
valid, X_valid, Y_valid = scale_dataset(valid)
test, X_test, Y_test = scale_dataset(test)

In [None]:
(Y_test == 0).sum()  # number of test rows whose label is 0

In [None]:
# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Fit a k-nearest-neighbours classifier (5 neighbours) on the training arrays
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, Y_train)

In [None]:
# Predict the test labels with the fitted kNN model and print them above the true labels
Y_pred = knn_model.predict(X_test)
print(Y_pred, Y_test, sep="\n")

In [None]:
# Per-class precision / recall / f1 of the current predictions on the test split
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Naive Bayes

In [None]:
# importing naive bayes modules
from sklearn.naive_bayes import GaussianNB

In [None]:
# Create a Gaussian naive Bayes classifier and fit it on the training arrays
nb_model = GaussianNB()
nb_model.fit(X_train, Y_train)

In [None]:
# Predict the test labels with the fitted naive Bayes model and print them above the true labels
Y_pred = nb_model.predict(X_test)
print(Y_pred, Y_test, sep="\n")

In [None]:
# Per-class precision / recall / f1 of the current predictions on the test split
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Logistic Regression

In [None]:
# importing logistic regression
from sklearn.linear_model import LogisticRegression

In [None]:
# Create a logistic regression classifier (default settings) and fit it on the training arrays
lg_model = LogisticRegression()
lg_model.fit(X_train, Y_train)

In [None]:
# Predict the test labels with the fitted logistic regression and print them above the true labels
Y_pred = lg_model.predict(X_test)
print(Y_pred, Y_test, sep="\n")

In [None]:
# Per-class precision / recall / f1 of the current predictions on the test split
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# SVM - Support Vector Machine

In [None]:
# importing SVC
from sklearn.svm import SVC

In [None]:
# Create a support vector classifier (default settings) and fit it on the training arrays
svm_model = SVC()
svm_model.fit(X_train,Y_train)

In [None]:
# Predict the test labels with the fitted SVM and print them above the true labels
Y_pred = svm_model.predict(X_test)
print(Y_pred, Y_test, sep="\n")

In [None]:
# Per-class precision / recall / f1 of the current predictions on the test split
print(classification_report(y_true=Y_test, y_pred=Y_pred))

In [None]:
# Neural Networks - Using TensorFlow

In [None]:
# importing tensorflow
import tensorflow as tf

In [None]:
# Accuracy curves of a training run
def plot_accuracy(history):
    """Plot training and validation accuracy per epoch.

    ``history`` must expose a ``history`` mapping with the keys
    'accuracy' and 'val_accuracy' (as returned by a Keras ``fit`` call).
    """
    for metric in ('accuracy', 'val_accuracy'):
        plt.plot(history.history[metric], label=metric)
    plt.title('model accuracy')
    plt.xlabel('epoch')
    plt.ylabel('accuracy')
    plt.grid(True)
    plt.legend()
    plt.show()

# Loss curves of a training run
def plot_loss(history):
    """Plot training and validation loss per epoch.

    ``history`` must expose a ``history`` mapping with the keys
    'loss' and 'val_loss' (as returned by a Keras ``fit`` call).
    """
    for metric in ('loss', 'val_loss'):
        plt.plot(history.history[metric], label=metric)
    plt.title('model loss')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
# Baseline network: 10 inputs -> two hidden ReLU layers of 32 units -> 1 sigmoid unit
nn_model = tf.keras.Sequential()
nn_model.add(tf.keras.layers.Dense(32, activation='relu', input_shape=(10,)))
nn_model.add(tf.keras.layers.Dense(32, activation='relu'))
nn_model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# The sigmoid output layer squashes the result into the range (0, 1), which is
# read as the probability of class 1 and thresholded later to get a 0/1 label.

# A Keras model has to be compiled before training; any optimizer may be chosen.
# Here: Adam with learning rate 0.001, binary cross-entropy loss, accuracy metric.
nn_model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the baseline network; verbose=0 suppresses the per-epoch log output.
# Validation uses the held-out validation split instead of validation_split=0.2:
# Keras carves that fraction from the END of the arrays before shuffling, and the
# oversampled training arrays end with the duplicated minority-class rows that
# RandomOverSampler appended. The validation curves would therefore be measured
# on a one-class set of copies of training samples.
history = nn_model.fit(X_train, Y_train, epochs=100, batch_size=32,
                       validation_data=(X_valid, Y_valid), verbose=0)

In [None]:
# Draw the loss curves first, then the accuracy curves, for the baseline run
for draw in (plot_loss, plot_accuracy):
    draw(history)

In [None]:
# The model above was trained with a single, fixed set of hyper-parameters.
# This helper builds and trains the same architecture for arbitrary settings.

def nn_train(X_train, Y_train, num_nodes, dropout_prob, lr, batch_size, epochs,
             validation_data=None):
    """Build, compile and train a two-hidden-layer binary classifier.

    Parameters
    ----------
    X_train, Y_train : training features (10 per row) and 0/1 labels.
    num_nodes : number of units in each of the two hidden layers.
    dropout_prob : dropout rate applied after each hidden layer.
    lr : learning rate of the Adam optimizer.
    batch_size, epochs : forwarded to ``fit``.
    validation_data : optional ``(X_val, Y_val)`` tuple. When given it is used
        for the per-epoch validation metrics. When omitted, the last 20% of the
        training arrays are held out (``validation_split=0.2``). Keras takes
        that slice before shuffling, so it is only representative if the
        training rows are not ordered (e.g. not ending with oversampled rows).

    Returns
    -------
    (nn_model, history) : the trained model and the object returned by ``fit``.
    """
    nn_model = tf.keras.Sequential([
        tf.keras.layers.Dense(num_nodes, activation='relu', input_shape =(10,)),
        tf.keras.layers.Dropout(dropout_prob),
        tf.keras.layers.Dense(num_nodes, activation='relu'),
        tf.keras.layers.Dropout(dropout_prob),
        # sigmoid keeps the output in (0, 1): the predicted probability of class 1
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # A Keras model must be compiled before training; any optimizer may be chosen
    nn_model.compile(optimizer = tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                     metrics=['accuracy'])

    # verbose=0 suppresses the per-epoch log output
    fit_kwargs = dict(epochs=epochs, batch_size=batch_size, verbose=0)
    if validation_data is None:
        fit_kwargs['validation_split'] = 0.2
    else:
        fit_kwargs['validation_data'] = validation_data

    history = nn_model.fit(X_train, Y_train, **fit_kwargs)
    return nn_model, history

In [None]:
# A combined plot function that shows the accuracy and the loss curves
# side by side in a single figure
def plot_graph(history):
    """Plot accuracy (left) and loss (right) of a training run in one figure.

    ``history`` must expose a ``history`` mapping with the keys 'accuracy',
    'val_accuracy', 'loss' and 'val_loss' (as returned by a Keras ``fit`` call).
    """
    fig, (ax1, ax2) = plt.subplots(1,2, figsize=(8,3))
    ax1.plot(history.history['accuracy'], label='accuracy')
    ax1.plot(history.history['val_accuracy'], label='val_accuracy')
    ax1.set_ylabel('accuracy')
    ax1.set_xlabel('epoch')
    ax1.grid(True)
    # Without a legend the labels above are never rendered and the two
    # curves cannot be told apart
    ax1.legend()

    ax2.plot(history.history['loss'], label='loss')
    ax2.plot(history.history['val_loss'], label='val_loss')
    ax2.set_ylabel('loss')
    ax2.set_xlabel('epoch')
    ax2.grid(True)
    ax2.legend()

    plt.show()

# Draw the side-by-side accuracy/loss figure for the current `history` object
plot_graph(history)

In [None]:
# Keep track of the model that achieves the lowest loss on the validation split
least_val_loss = float('inf')
least_loss_model = None

epochs = 100
# Every hyper-parameter combination, listed in the order nested loops would visit them
param_grid = [
    (nodes, drop, rate, batch)
    for nodes in [16, 32, 64]
    for drop in [0, 0.2]
    for rate in [0.01, 0.005, 0.001]
    for batch in [32, 64, 128]
]

# Train one model per combination, plot its curves and score it on the validation split
for num_nodes, dropout_prob, lr, batch_size in param_grid:
    print(f"{num_nodes} nodes, dropout_prob {dropout_prob}, lr {lr}, batch_size {batch_size}")
    model, history = nn_train(X_train, Y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
    plot_graph(history)
    results = model.evaluate(X_valid, Y_valid)
    val_loss = results[0]  # first entry of the evaluate() result is the loss
    if val_loss < least_val_loss:
        least_val_loss, least_loss_model = val_loss, model

In [None]:
# The network outputs one sigmoid value per row; threshold at 0.5 to get
# 0/1 labels and flatten the column vector to a 1-D array
probabilities = least_loss_model.predict(X_test)
y_pred = (probabilities > 0.5).astype(int).reshape(-1,)

In [None]:
# Classification report for the selected neural network.
# Uses `y_pred` (the thresholded network output); the similarly named `Y_pred`
# still holds the SVM predictions and would silently report the wrong model.
print(classification_report(Y_test, y_pred))