# Regression Algorithms for Muon Data

We will first check some linear regression algorithms (scored with the negative mean squared error).

### Linear and Non-Linear Regression

The ML algorithms used for linear regression are LinearRegression, Lasso, Ridge and ElasticNet. The non-linear regression algorithms are KNeighborsRegressor, DecisionTreeRegressor and SVR.

In [None]:
from pandas import read_csv
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import mean_squared_error
np.random.seed(42) # Seed NumPy's global random generator so results do not change from run to run

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.utils import shuffle
import matplotlib.pyplot as plt

import os

In [None]:
from sklearn.model_selection import train_test_split

Load the csv into a pandas dataframe

In [None]:
# Location of the input CSV, relative to the directory the notebook runs in.
csv_path = '../MuonPOGAnalysisTemplate/output/bxcut_full_muon.csv'
dataframe = pd.read_csv(csv_path)
# Plain NumPy representation of the table contents.
array = dataframe.values
# Last expression of the cell: shows the DataFrame in the notebook output.
dataframe

The first option is to split the data into two different datasets: a training set and a test set.

In [None]:
# Divide the phiB column of dtPrimitive 1 to 4 by 512, in place.
# Note: the division is applied every time this cell runs, so running it
# twice rescales the values twice.
for primitive_index in range(1, 5):
    phib_column = "%ddtPrimitive.phiB" % primitive_index
    dataframe[phib_column] = dataframe[phib_column] / 512.

In [None]:
def preprocess_features(muon_dataframe):
  """Select the model input columns from the muon data set.

  Args:
    muon_dataframe: A Pandas DataFrame holding the muon data; it must
      contain every column listed in `feature_columns` below, otherwise
      the selection raises a KeyError.
  Returns:
    A new DataFrame with only the selected columns, in the order listed,
    cast to float32. The input DataFrame is not modified.
  """
  # The 'Event' column is not part of the selection.
  # NOTE(review): '2dtPrimitive.id_eta' is absent although the id_eta of
  # primitives 1, 3 and 4 is selected -- confirm whether this is intended.
  feature_columns = [
      'n_Primitive',
      '1dtPrimitive.id_r', '2dtPrimitive.id_r',
      '3dtPrimitive.id_r', '4dtPrimitive.id_r',
      '1dtPrimitive.id_eta', '3dtPrimitive.id_eta', '4dtPrimitive.id_eta',
      '1dtPrimitive.id_phi', '2dtPrimitive.id_phi',
      '3dtPrimitive.id_phi', '4dtPrimitive.id_phi',
      '1dtPrimitive.phiB', '2dtPrimitive.phiB',
      '3dtPrimitive.phiB', '4dtPrimitive.phiB',
      '1dtPrimitive.quality', '2dtPrimitive.quality',
      '3dtPrimitive.quality', '4dtPrimitive.quality',
      'delta_phi12', 'delta_phi13', 'delta_phi14',
      'delta_phi23', 'delta_phi24', 'delta_phi34',
  ]
  # Column selection followed by astype yields a frame independent of the input.
  return muon_dataframe[feature_columns].astype(np.float32)

In [None]:
def preprocess_targets(muon_dataframe):
  """Build the regression target from the muon data set.

  Args:
    muon_dataframe: A Pandas DataFrame holding the muon data; it must
      contain a "genParticle.pt" column.
  Returns:
    A single-column float32 DataFrame named "genParticle.pt" whose values
    are the input column divided by 200, on the same index as the input.
  """
  # NOTE(review): 200 looks like a normalisation scale for pt -- confirm.
  scaled_pt = muon_dataframe["genParticle.pt"] / 200
  return pd.DataFrame({"genParticle.pt": scaled_pt}).astype(np.float32)

In [None]:
# Model inputs (X) and regression target (Y), both derived from the loaded table.
X, Y = preprocess_features(dataframe), preprocess_targets(dataframe)

In [None]:
# Hold out 20% of the rows as a test set.
# random_state pins the shuffle explicitly: without it the split is drawn from
# NumPy's global random state, so re-running this cell (or running cells in a
# different order) would produce a different train/test partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42)

In [None]:
# Turn the quality code of dtPrimitive 1 to 4 into a binary flag:
# values below 4 become 0.0, values of 4 or more become 1.0.
#
# X_train / X_test were split off from X before this cell and are separate
# objects, so they have to be binarized as well; otherwise the hold-out fit
# would see the raw quality codes while the cross-validation on X sees the
# binary flags.
#
# Note: this cell is not idempotent. After one run every value is 0.0 or 1.0
# (both below 4), so a second run would map everything to 0.0.
quality_columns = ["%ddtPrimitive.quality" % idx for idx in range(1, 5)]
for frame in (X, X_train, X_test):
    for column in quality_columns:
        # Evaluate both masks on the untouched values before overwriting;
        # NaN satisfies neither comparison and is therefore left as is.
        is_low = frame[column] < 4
        is_high = frame[column] >= 4
        frame.loc[is_low, column] = 0.0
        frame.loc[is_high, column] = 1.0

In [None]:
# Fit a linear regression on the training split and predict the held-out rows.
lm = LinearRegression()
model = lm.fit(X_train, y_train)  # fit() returns the estimator itself
predictions = model.predict(X_test)

# Scatter the predictions against the true targets. The dashed line runs from
# (min, min) to (max, max) of Y, i.e. along "prediction == truth".
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
target_range = [Y.min(), Y.max()]
ax.plot(target_range, target_range, 'k--', lw=4)
ax.set(xlabel="True Values", ylabel="Predictions")
plt.show()
del ax

The other option is to use K-fold cross-validation.

In [None]:
# Candidate regressors, each paired with the short label used in the
# printout and on the comparison plot.
models = [
    ('LR', LinearRegression()),
    ('LAR', Lasso()),
    ('RIR', Ridge()),
    ('EN', ElasticNet()),
    ('KNR', KNeighborsRegressor()),
    ('DTR', DecisionTreeRegressor()),
]

# Hard-coded scores plotted under the 'ANN' label; they are not computed in
# this cell. They are negative, like the entries appended to `results` below.
values = [-0.170369, -0.109598, -0.105400, -0.108883]
results = []
names = []
scoring = 'neg_mean_squared_error'

# One splitter shared by every model so all of them are scored on the same
# folds. shuffle=True is needed for random_state to have any effect; recent
# scikit-learn releases raise a ValueError when random_state is given while
# shuffle is False.
kfold = KFold(n_splits=15, shuffle=True, random_state=7)

# Evaluate each model in turn.
for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    # The scorer yields the negated MSE of each fold; convert to per-fold RMSE.
    rmse_per_fold = np.sqrt(np.abs(cv_results))
    # Stored with a negative sign, matching the hard-coded ANN values.
    results.append(-rmse_per_fold)
    names.append(name)
    # Mean and spread are both taken over the per-fold RMSE so that they are
    # in the same units (the square root of std(MSE) is not the RMSE spread).
    msg = "%s: %f (%f)" % (name, rmse_per_fold.mean(), rmse_per_fold.std())
    print(msg)

    # Out-of-fold predictions for every row, plotted against the true target.
    # The dashed line runs along "prediction == truth".
    predicted = cross_val_predict(model, X, Y, cv=kfold)
    fig, ax = plt.subplots()
    ax.scatter(Y, predicted, edgecolors=(0, 0, 0))
    ax.plot([Y.min(), Y.max()], [Y.min(), Y.max()], 'k--', lw=4)
    ax.set_xlabel('Measured')
    ax.set_ylabel('Predicted')
    plt.show()
    del ax

names.append('ANN')
results.append(values)

# Box plot comparing the algorithms.
# NOTE: the plotted numbers are negated RMSE values (see above), although the
# axis is labelled 'RMSE'.
fig = plt.figure()
#fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_xlabel('Algorithm')
ax.set_ylabel('RMSE')
plt.tight_layout()
plt.savefig('output.png', format='png', dpi=800)
#plt.show()
