# Imports

In [61]:
import csv
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model

## Description

#### Experimental description
- Basic information about the data set can be found here: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/MiniBooNE+particle+identification#)
- Data is taken from the MiniBooNE experiment at Fermi lab
- A beam of neutrinos is fired at a neutrino detector
- Beam primarily consists of muon neutrinos
- Theory: Neutrinos can oscillate flavor e.g. muon to electron neutrino: $\nu_{\mu}\rightarrow\nu_{e}$
- If MiniBooNE detects an excess of electron neutrinos, the theory is supported
- Our task is to train a machine learning model to predict the type of a neutrino based on its detection signature

#### Classification problems
- Classification tasks consist of N samples, each with K features and M categorizations
- For this data set, M = 2
    - This is binary classification
    - Usually set $M\epsilon\{0,1\}$ (usually interpreted as negative or positive, false or true, noise or signal)
- M = 0:
    - Muon neutrino detection, which is considered **noise**
- M = 1:
    - Electron neutrino detection, **signal**


## Fix file

In [2]:
input_file_path = './data/particle/MiniBooNE_PID.txt'
output_file_path = './data/particle/MiniBooNE_PID_cleaned.txt'

In [29]:
num_0 = 36499    # Number of electron neutrinos (signal)
num_1 = 93565    # Number of muon neutrinos (noise)
 
# Open input file handle & reader
with open(input_file_path, 'r') as input_file_handle:
    input_reader = csv.reader(input_file_handle, delimiter = ' ')
    
    # Open output file handle & writer
    with open(output_file_path, 'w') as output_file_handle:
        output_writer = csv.writer(output_file_handle, delimiter = ' ')
        
        next(input_reader)    # Skip first line
        
        # Loop over input file rows
        i = 0
        for row in input_reader:
            
            new_row = [ele for ele in row if ele != '']    # Skip the spaces
            
            
            
            if i < num_0:
                # Electron neutrino
                new_row += ['1']
            else:
                # Muon neutrino
                new_row += ['0']
            
            # Write the fixed row to the new file
            output_writer.writerow(new_row)
            
            i = i + 1

## Load data

In [37]:
file_path = './data/particle/MiniBooNE_PID_cleaned.txt'
full_data = np.genfromtxt(file_path, delimiter=' ')    # Nifty numpy function to load csv directly into numpy array
np.random.shuffle(full_data)    # Scramble the array so we can't cheat :D

In [40]:
print(full_data)

[[   4.63185     2.60109    97.0097   ...,    3.11163     0.265011    0.      ]
 [   4.6394      0.666682   55.845    ...,    2.70267     0.245864    0.      ]
 [   5.93574     3.08672    51.7441   ...,    3.31021     0.323669    0.      ]
 ..., 
 [   7.66781     1.92104   124.429    ...,    4.17329     0.219627    0.      ]
 [   4.81791     1.89325   102.493    ...,    2.6953      0.203304    0.      ]
 [   6.51519     3.21408    25.0301   ...,    3.22264     0.166939    0.      ]]


## Exploratory data analysis

##### 1. How many total particle detections are present in the data set?
- Assign a new variable N to this value

In [6]:
######################
# Put solution here! #
######################

N = full_data.shape[0]
print(N)

130064


##### 2. How many features are in the data set? 
- Assign a new variable K to this value
- Hint: Remember that the full data loaded also includes the classification variables $y$ which should **not** be included in the total variable count!

In [83]:
######################
# Put solution here! #
######################

K = full_data.shape[1] - 1
print(K)

50


##### 3. How many of the neutrinos are electron flavored (1)? Muon flavored (0)?
- Assign new variables N_1 and N_0

In [11]:
print(len(full_data[:,-1] == 1))

130064


In [36]:
N_1 = len(np.where(full_data[:,-1] == 1)[0])
N_0 = len(np.where(full_data[:,-1] == 0)[0])

##### Train

In [84]:
X_train_data = full_data[:int(3/4.*N),:-1]
y_train_data = full_data[:int(3/4.*N),-1]

print(X_train_data.shape, y_train_data.shape)

X_test_data = full_data[int(3/4.*N):,:-1]
y_test_data = full_data[int(3/4.*N):,-1]

print(X_test_data.shape, y_test_data.shape)

(97548, 50) (97548,)
(32516, 50) (32516,)


In [74]:
model = sklearn.linear_model.LogisticRegression()

In [75]:
model.fit(X_train_data, y_train_data)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [78]:
model.score(X_test_data, y_test_data)

0.89180711034567595