<a href="https://colab.research.google.com/github/willrleao/Machine-Learning-for-Everybody-Full-Course-willrleao/blob/main/fcc_MAGIC_example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [101]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

### Dataset:
Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [ https://archive.ics.uci.edu/ml/datasets/magic+gamma+telescope ]. Irvine, CA: University of California, School of Information and Computer Science.

Donated by:
P. Savicky
Institute of Computer Science, AS of CR
Czech Republic
savicky '@' cs.cas.cz

In [129]:
# Column names passed to read_csv: ten image features followed by the class label.
cols = [
    "fLength", "fWidth", "fSize", "fConc", "fConc1",
    "fAsym", "fM3Long", "fM3Trans", "fAlpha", "fDist",
    "class",
]
DATA_URL = "https://raw.githubusercontent.com/willrleao/Machine-Learning-for-Everybody-Full-Course-willrleao/main/magic04.data"
df = pd.read_csv(DATA_URL, names=cols)
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,g
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,g


In [130]:
# Encode the label as an integer: 1 for gamma ("g"), 0 for every other value (hadron, "h").
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# comparing it with "g" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df["class"]):
  df["class"] = (df["class"] == "g").astype(int)

In [131]:
# Preview the first rows again to confirm that "class" now holds integers.
df.head()

Unnamed: 0,fLength,fWidth,fSize,fConc,fConc1,fAsym,fM3Long,fM3Trans,fAlpha,fDist,class
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.011,-8.2027,40.092,81.8828,1
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.261,1
2,162.052,136.031,4.0612,0.0374,0.0187,116.741,-64.858,-45.216,76.96,256.788,1
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.449,116.737,1
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.648,356.462,1


In [None]:
# One figure per feature column: overlay the density-normalised histogram of
# each class so the two distributions can be compared directly.
class_styles = [(1, 'blue', 'gamma'), (0, 'red', 'hadron')]
for label in cols[:-1]:
  for class_value, color, name in class_styles:
    values = df.loc[df["class"] == class_value, label]
    plt.hist(values, color=color, label=name, alpha=0.7, density=True)
  plt.title(label)
  plt.ylabel("Probability")
  plt.xlabel(label)
  plt.legend()
  plt.show()

# Train, validation, test datasets

In [133]:
# Shuffle all rows, then cut the shuffled frame at the 60% and 80% marks:
# the first 60% of the dataset becomes the training set, the next 20% the
# validation set and the final 20% the test set.
# Positional slicing is used instead of np.split(df, ...) because NumPy's split
# goes through DataFrame.swapaxes, which recent pandas releases deprecate.
shuffled = df.sample(frac=1)  # pass random_state=<int> here to make the split reproducible
train_end, valid_end = int(0.6*len(df)), int(0.8*len(df))
train = shuffled.iloc[:train_end]
valid = shuffled.iloc[train_end:valid_end]
test = shuffled.iloc[valid_end:]

In [134]:
#@title Standardization and oversampling: returns data, X, y
def scale_dataset(dataframe, oversample=False, scaler=None):
  """Standardize the features of one data split and optionally balance its classes.

  Args:
    dataframe: DataFrame whose last column is the label and whose remaining
      columns are the features.
    oversample: when True, RandomOverSampler is applied after scaling. It
      repeats randomly chosen existing rows (it does not create synthetic
      ones) so that the classes end up with the same number of samples.
    scaler: optional, already-fitted scaler exposing ``transform``. When given,
      the features are transformed with its stored statistics instead of
      fitting a new scaler on this split; pass the scaler fitted on the
      training features to scale validation/test data consistently with the
      training data. When None (default) a fresh StandardScaler is fitted on
      ``dataframe`` itself.

  Returns:
    Tuple ``(data, X, y)``: ``X`` is the scaled feature matrix, ``y`` the label
    vector, and ``data`` is ``X`` with ``y`` appended as the last column.
  """
  # Every column except the last is a feature; the last column is the label.
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  # Standardization: x_scaled = (x - mean(x)) / std(x), giving each feature
  # zero mean and unit standard deviation.
  if scaler is None:
    # fit_transform() learns the mean/std from this split and applies them.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
  else:
    # transform() re-uses the statistics the caller's scaler already learned.
    X = scaler.transform(X)

  if oversample:
    # Class imbalance can bias a model towards the majority class, so the
    # under-represented class is re-sampled until the classes are balanced.
    ros = RandomOverSampler()
    X, y = ros.fit_resample(X, y)

  # Re-attach the labels as a final column next to the scaled features.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [135]:
train['class'].value_counts() # class counts in the training split before oversampling

1    7345
0    4067
Name: class, dtype: int64

In [136]:
# The stacked arrays are bound to new names (*_data) so that train/valid/test
# stay DataFrames: rebinding them to NumPy arrays would make this cell fail on
# a re-run, because scale_dataset reads `dataframe.columns`.
# Oversampling is applied to the training split only, since that is the data
# the models are fitted on; validation and test are left as drawn.
# NOTE(review): with the default arguments each call fits its own
# StandardScaler, so validation/test are scaled with their own statistics
# rather than the training ones.
train_data, X_train, y_train = scale_dataset(train, oversample=True)
valid_data, X_valid, y_valid = scale_dataset(valid, oversample=False)
test_data, X_test, y_test = scale_dataset(test, oversample=False)

In [137]:
y_train,np.bincount(y_train) # training labels and per-class counts after oversampling

(array([0, 0, 1, ..., 0, 0, 0]), array([7345, 7345]))

# kNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [None]:
# Fit a k-nearest-neighbours classifier (k = 5) on the oversampled training arrays.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

In [None]:
# Predict labels for the test features with the fitted kNN model.
y_pred = knn_model.predict(X_test)

In [None]:
# Print scikit-learn's classification report comparing y_test with the predictions.
print(classification_report(y_test, y_pred))

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes with default settings, fitted on the training arrays
# (fit() returns the estimator itself, so construction and fitting are chained).
nb_model = GaussianNB().fit(X_train, y_train)

In [None]:
# Score the Naive Bayes model on the test split.
y_pred = nb_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training arrays
# (fit() returns the estimator itself, so construction and fitting are chained).
lg_model = LogisticRegression().fit(X_train, y_train)

In [None]:
# Score the logistic-regression model on the test split.
y_pred = lg_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with default settings, fitted on the training arrays
# (fit() returns the estimator itself, so construction and fitting are chained).
svm_model = SVC().fit(X_train, y_train)

In [None]:
# Score the SVM on the test split.
y_pred = svm_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

# Neural Net

In [None]:
import tensorflow as tf

In [None]:
def plot_history(history):
  """Plot training vs. validation loss and accuracy side by side.

  Args:
    history: object whose ``history`` attribute is a dict containing the
      per-epoch series 'loss', 'val_loss', 'accuracy' and 'val_accuracy'
      (as returned by the ``fit`` call in ``train_model``).
  """
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))

  # Left panel: loss per epoch.
  ax1.plot(history.history['loss'], label='loss')
  ax1.plot(history.history['val_loss'], label='val_loss')
  ax1.set_xlabel('Epoch')
  ax1.set_ylabel('Binary crossentropy')
  ax1.grid(True)
  ax1.legend()  # without a legend the two labelled curves cannot be told apart

  # Right panel: accuracy per epoch.
  ax2.plot(history.history['accuracy'], label='accuracy')
  ax2.plot(history.history['val_accuracy'], label='val_accuracy')
  ax2.set_xlabel('Epoch')
  ax2.set_ylabel('Accuracy')
  ax2.grid(True)
  ax2.legend()

  plt.show()

In [None]:
def train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs):
  """Build, compile and fit a two-hidden-layer binary classifier.

  Args:
    X_train: 2-D array of training features, one row per sample.
    y_train: array of 0/1 labels aligned with the rows of ``X_train``.
    num_nodes: number of units in each of the two hidden Dense layers.
    dropout_prob: dropout rate applied after each hidden layer.
    lr: learning rate passed to the Adam optimizer.
    batch_size: mini-batch size used by ``fit``.
    epochs: number of training epochs.

  Returns:
    Tuple ``(nn_model, history)``: the fitted Keras model and the object
    returned by ``fit``.
  """
  X_train = np.asarray(X_train)
  y_train = np.asarray(y_train)

  # Keras takes the validation_split fraction from the *last* rows of the
  # arrays, before any shuffling. With ordered input (for instance an
  # oversampled set whose extra rows sit at the end) that slice is not
  # representative, so features and labels are shuffled together first.
  order = np.random.permutation(len(X_train))
  X_train, y_train = X_train[order], y_train[order]

  nn_model = tf.keras.Sequential([
      # Input width follows the data instead of being hard-coded to 10 features.
      tf.keras.layers.Dense(num_nodes, activation='relu', input_shape=(X_train.shape[1],)),
      tf.keras.layers.Dropout(dropout_prob),
      tf.keras.layers.Dense(num_nodes, activation='relu'),
      tf.keras.layers.Dropout(dropout_prob),
      # Single sigmoid unit: one output value per sample for the binary label.
      tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  nn_model.compile(optimizer=tf.keras.optimizers.Adam(lr), loss='binary_crossentropy',
                  metrics=['accuracy'])
  history = nn_model.fit(
    X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0
  )

  return nn_model, history

In [None]:
# Exhaustive search over layer width, dropout rate, learning rate and batch
# size. Each configuration is trained, its history plotted, and the model with
# the lowest loss on the validation split is kept in least_loss_model.
search_grid = [
    (num_nodes, dropout_prob, lr, batch_size)
    for num_nodes in [16, 32, 64]
    for dropout_prob in [0, 0.2]
    for lr in [0.01, 0.005, 0.001]
    for batch_size in [32, 64, 128]
]
epochs = 100
least_val_loss, least_loss_model = float('inf'), None
for num_nodes, dropout_prob, lr, batch_size in search_grid:
  print(f"{num_nodes} nodes, dropout {dropout_prob}, lr {lr}, batch size {batch_size}")
  model, history = train_model(X_train, y_train, num_nodes, dropout_prob, lr, batch_size, epochs)
  plot_history(history)
  # evaluate() returns the loss first, followed by the compiled metrics.
  val_loss = model.evaluate(X_valid, y_valid)[0]
  if val_loss < least_val_loss:
    least_val_loss, least_loss_model = val_loss, model

In [None]:
# Raw outputs of the selected network on the test features; values above 0.5
# are mapped to class 1, and the result is flattened to a 1-D label vector.
y_prob = least_loss_model.predict(X_test)
y_pred = (y_prob > 0.5).astype(int).reshape(-1,)

In [None]:
# Print the classification report for the selected neural network on the test split.
print(classification_report(y_test, y_pred))