# Imports

In [29]:
import csv
import numpy as np
import matplotlib.pyplot as plt

## Description

#### Experimental description
- Basic information about the data set can be found here: [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/MiniBooNE+particle+identification#)
- Data is taken from the MiniBooNE experiment at Fermi lab
- A beam of neutrinos is fired at a neutrino detector
- Beam primarily consists of muon neutrinos
- Theory: Neutrinos can oscillate flavor e.g. muon to electron neutrino: $\nu_{\mu}\rightarrow\nu_{e}$
- If MiniBooNE detects an excess of electron neutrinos, the theory is supported
- Our task is to train a machine learning model to predict the type of a neutrino based on its detection signature

#### Classification problems
- Classification tasks consist of N samples, each with K features and M categorizations
- For this data set, M = 2
    - This is binary classification
    - Usually set $M\epsilon\{0,1\}$ (usually interpreted as negative or positive, false or true, noise or signal)
- M = 0:
    - Muon neutrino detection, which is considered **noise**
- M = 1:
    - Electron neutrino detection, **signal**


## Fix file

In [19]:
input_file_path = './data/particle/MiniBooNE_PID.txt'
output_file_path = './data/particle/MiniBooNE_PID_cleaned.txt'

In [50]:
num_0 = 36499    # Number of electron neutrinos (signal)
num_1 = 93565    # Number of muon neutrinos (noise)
 
# Open input file handle & reader
with open(input_file_path, 'r') as input_file_handle:
    input_reader = csv.reader(input_file_handle, delimiter = ' ')
    
    # Open output file handle & writer
    with open(output_file_path, 'w') as output_file_handle:
        output_writer = csv.writer(output_file_handle, delimiter = ' ')
        
        next(input_reader)    # Skip first line
        
        i = 0
        for row in input_reader:
            
            new_row = [ele for ele in row if ele != '']
            
            if i < num_0:
                new_row += ['1']
            else:
                new_row += ['0']
            
            
            output_writer.writerow(new_row)
            
            i = i + 1
        

## Load data

In [51]:
file_path = './data/particle/MiniBooNE_PID_cleaned.txt'
full_data = np.genfromtxt(file_path, delimiter=' ')

In [52]:
print(full_data)

[[  2.59413000e+00   4.68803000e-01   2.06916000e+01 ...,   7.17692000e-02
    2.45996000e-01   1.00000000e+00]
 [  3.86388000e+00   6.45781000e-01   1.81375000e+01 ...,   3.33613000e-01
    2.30621000e-01   1.00000000e+00]
 [  3.38584000e+00   1.19714000e+00   3.60807000e+01 ...,   2.55512000e-01
    1.80901000e-01   1.00000000e+00]
 ..., 
 [  3.10842000e+00   2.17814000e+00   5.63651000e+01 ...,   7.30342000e-01
    1.52876000e-01   0.00000000e+00]
 [  5.44560000e+00   1.84570000e+00   1.03463000e+02 ...,   8.19867000e-01
    2.10619000e-01   0.00000000e+00]
 [  4.55062000e+00   1.34174000e+00   8.00887000e+01 ...,   7.42709000e-01
    2.76477000e-01   0.00000000e+00]]


## Exploratory data analysis

##### 1. How many total particle detections are present in the data set?
- Assign a new variable N to this value

In [55]:
######################
# Put solution here! #
######################

N = full_data.shape[0]
print(N)

130064


##### 2. How many features are in the data set? 
- Assign a new variable K to this value
- Hint: Remember that the full data loaded also includes the classification variables $y$ which should **not** be included in the total variable count!

In [54]:
######################
# Put solution here! #
######################

K = full_data.shape[1] - 1

##### 3. How many of the neutrinos are electron flavored (1)? Muon flavored (0)?
- Assign new variables N_1 and N_0

In [47]:
print(len(np.where(full_data[:,-1]==1)))

1


In [49]:
print(np.where(full_data[:,-1] == 1))

(array([     0,      1,      2, ..., 130061, 130062, 130063]),)


### 1. How many total particle detections are present in the data set?

### 2. How many features are in the data set?

### 3. How many of the particles are signal (1)? How many are noise (0)?

In [36]:
print(full_data.shape)

(130064, 51)


# Load data

In [15]:
def get_csv_length(file_handle, file_reader):
    file_handle.seek(0)
    length = sum(1 for row in file_reader)
    file_handle.seek(0)
    return length

In [17]:
file_path = './data/particle/MiniBooNE_PID.txt'
with open(file_path, 'r') as file_handle:
        
    file_reader = csv.reader(file_handle, delimiter = ' ')
    
    length = get_csv_length(file_handle, file_reader)
    
    for i in range(5):
        print(next(reader))
        
        

ValueError: I/O operation on closed file.

In [6]:
print(dir(csv))

['Dialect', 'DictReader', 'DictWriter', 'Error', 'OrderedDict', 'QUOTE_ALL', 'QUOTE_MINIMAL', 'QUOTE_NONE', 'QUOTE_NONNUMERIC', 'Sniffer', 'StringIO', '_Dialect', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', 'excel', 'excel_tab', 'field_size_limit', 'get_dialect', 'list_dialects', 're', 'reader', 'register_dialect', 'unix_dialect', 'unregister_dialect', 'writer']
