In [1]:

!pip install metric-learn
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from metric_learn import MMC
from sklearn.neighbors import KDTree
from sklearn.preprocessing import MinMaxScaler
import time
import ast
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Collecting metric-learn
  Downloading metric_learn-0.7.0-py2.py3-none-any.whl.metadata (5.2 kB)
Downloading metric_learn-0.7.0-py2.py3-none-any.whl (67 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: metric-learn
Successfully installed metric-learn-0.7.0


In [None]:
# Build a labelled hit-pair dataset:
#   1. read the reconstructed hits and min-max scale (x, y, z, t),
#   2. embed them with a pre-trained MMC model,
#   3. pair each sampled hit with one random neighbor in the embedded space,
#   4. store both embedded points, normalized |dz| and |dt|, and a label
#      (1 = both hits carry the same muonid, 0 otherwise) in a CSV file.

# Tunable parameters (previously hard-coded inline).
NEIGHBOR_RADIUS = 0.01       # neighbor search radius in the embedded space
MAX_ANCHOR_POINTS = 375913   # upper bound on the number of hits sampled as "point 1"
EXCLUDED_MUON_ID = -999      # a pair containing a hit with this muonid is always labelled 0
RANDOM_SEED = 42             # makes the sampled pairs reproducible across runs

rng = np.random.default_rng(RANDOM_SEED)

# Record start time
start_time = time.time()

# Define the file path of the CSV file containing the input data
input_file_path = '/kaggle/input/10k-3lakh-rows/RecoOutPileup_uniform_1_10000_16666_ns.root_recohits.csv'

# Read the original CSV file with all columns
original_data = pd.read_csv(input_file_path)

# Select only the columns that are fed to the embedding (x, y, z, t)
input_data_1 = original_data[['x', 'y', 'z', 't']]

# Scale the data.
# NOTE(review): the scaler is re-fitted on this file. If the MMC model was
# trained on data scaled with a different fitted scaler, the inputs here may
# not be on the same scale -- confirm.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(input_data_1)

# Load the trained MMC model from the pickle file.
# SECURITY: pickle.load can execute arbitrary code; only load trusted files.
model_file_path = '/kaggle/input/mmc-scaled-model-pkl/mmc_model_scaled.pkl'
with open(model_file_path, 'rb') as file:
    trained_mmc_model = pickle.load(file)

# Transform the input data using the trained MMC model
X_transformed = trained_mmc_model.transform(scaled_data)
n_points = len(X_transformed)

# KDTree over the embedded points
tree = KDTree(X_transformed, leaf_size=2)

# Find all neighbors within NEIGHBOR_RADIUS with one batched query instead of
# one query per point. Every neighbor array contains the query point itself
# (distance 0), so it is never empty for finite inputs.
neighbors_list = list(tree.query_radius(X_transformed, r=NEIGHBOR_RADIUS))

# Randomly select the "point 1" indices. The sample size is capped at the
# number of available points: sampling without replacement raises ValueError
# when more samples than points are requested.
n_anchors = min(MAX_ANCHOR_POINTS, n_points)
random_indices = rng.choice(n_points, size=n_anchors, replace=False)

# For every "point 1", draw one neighbor uniformly at random as "point 2".
partner_indices = np.fromiter(
    (neighbors_list[i][rng.integers(len(neighbors_list[i]))] for i in random_indices),
    dtype=np.intp,
    count=n_anchors,
)

# Drop draws where a point was paired with itself.
# NOTE(review): as before, a point that draws itself is skipped rather than
# redrawn, so points with few neighbors contribute fewer pairs -- confirm this
# is the intended sampling scheme.
is_real_pair = partner_indices != random_indices
first_idx = random_indices[is_real_pair]
second_idx = partner_indices[is_real_pair]

# Pull the needed columns out once as arrays (positional, like .iloc) instead
# of doing a row lookup per pair.
z_values = original_data['z'].to_numpy()
t_values = original_data['t'].to_numpy()
muon_ids = original_data['muonid'].to_numpy()

# Normalization constants, computed and reported once rather than per pair.
# NOTE(review): these are the column maxima of z and t, not the largest
# possible |delta|. If z or t can be negative (or the maximum is 0) the
# normalized values will not lie in [0, 1] -- confirm.
max_delta_z = original_data['z'].max()
max_delta_t = original_data['t'].max()
print("Max delta z value:", max_delta_z)
print("Max delta t value:", max_delta_t)

# Normalized absolute differences in z and t for every pair
delta_z_values = np.abs(z_values[second_idx] - z_values[first_idx]) / max_delta_z
delta_t_values = np.abs(t_values[second_idx] - t_values[first_idx]) / max_delta_t

# Label 1 only when neither hit has the excluded muonid and both muonids match
muon_first = muon_ids[first_idx]
muon_second = muon_ids[second_idx]
labels = (
    (muon_first != EXCLUDED_MUON_ID)
    & (muon_second != EXCLUDED_MUON_ID)
    & (muon_first == muon_second)
).astype(int)

# Embedded coordinates of both points, one array per row
point1_list = list(X_transformed[first_idx])
point2_list = list(X_transformed[second_idx])

# Create DataFrame from the collected columns
data = pd.DataFrame({
    'point1': point1_list,
    'point2': point2_list,
    'normalized_delta_z': delta_z_values,
    'normalized_delta_t': delta_t_values,
    'label': labels
})

# Save DataFrame to CSV
data.to_csv('p1_p2_normalized_dz_dt_label.csv', index=False)

# Calculate execution time
execution_time = time.time() - start_time
print("Execution time:", execution_time, "seconds")


In [4]:
# Balance the pair dataset so both labels are equally represented, then save it.

# Load the CSV file containing point 1, point 2, delz, delt and label
data = pd.read_csv('/kaggle/input/p1-p2-normalized-dz-dt-label/p1_p2_normalized_dz_dt_label.csv')

# Split the rows by label
data_label_1 = data[data['label'] == 1]
data_label_0 = data[data['label'] == 0]

# Count the number of rows with label 1
num_label_1 = len(data_label_1)

# Size of the smaller class. Sampling more rows than a class contains (without
# replacement) raises ValueError, so never request more than the minority size.
n_per_class = min(num_label_1, len(data_label_0))

# Downsample label 0 to the common size (same call and seed as before whenever
# label 0 is the larger class).
data_label_0_sampled = data_label_0.sample(n=n_per_class, random_state=42)

# If label 1 happens to be the larger class, downsample it instead of failing.
if num_label_1 > n_per_class:
    data_label_1 = data_label_1.sample(n=n_per_class, random_state=42)

# Concatenate the label-1 rows with the sampled label-0 rows
balanced_data = pd.concat([data_label_1, data_label_0_sampled])

# Save the balanced data to a new CSV file
balanced_data.to_csv('P1_P2_delz_delt_balanced_data.csv', index=False)

In [3]:
# Train and evaluate a random forest that predicts the pair label from the two
# normalized deltas, and persist the fitted model.

# Read the class-balanced pair dataset
df = pd.read_csv(
    '/kaggle/input/p1-p2-delz-delt-balanced-data/P1_P2_delz_delt_balanced_data.csv'
)

# The point1 / point2 columns are stored as bracketed, whitespace-separated
# strings. Parsing them (strip '[' and ']', split, cast to float) and expanding
# each into four numeric columns is currently disabled, so the embedded
# coordinates are not used as features here.

# Features: the two normalized deltas; target: the pair label
X = df[['normalized_delta_z', 'normalized_delta_t']]
y = df['label']

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build the random forest and fit it on the training split
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Persist the fitted model as a pickle file
model_file_path = 'random_forest_model.pkl'
with open(model_file_path, 'wb') as model_out:
    pickle.dump(model, model_out)

# Score the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9514143813431224


In [1]:
# pandas is imported again here so this inspection cell runs on its own.
import pandas as pd

# Reload the balanced pair dataset and show it with the notebook's rich
# display (pandas abbreviates the rendering to the first and last rows).
df = pd.read_csv(
    '/kaggle/input/p1-p2-delz-delt-balanced-data/P1_P2_delz_delt_balanced_data.csv'
)
df

Unnamed: 0,point1,point2,normalized_delta_z,normalized_delta_t,label
0,[ 0.00000000e+00 1.58390412e-09 -1.42912189e-...,[ 0.00000000e+00 1.42632785e-09 -1.52226668e-...,0.048577,0.000090,1
1,[ 0.00000000e+00 9.54720582e-10 -3.13231902e-...,[ 0.00000000e+00 8.50653766e-10 -3.16910046e-...,0.014416,0.000045,1
2,[ 0.00000000e+00 1.18881360e-09 -3.01737383e-...,[ 0.00000000e+00 1.18872954e-09 -2.99909109e-...,0.008223,0.000000,1
3,[ 0.00000000e+00 1.46466857e-09 -5.41194863e-...,[ 0.00000000e+00 1.53864207e-09 -5.49883296e-...,0.004805,0.000045,1
4,[ 0.00000000e+00 1.70874001e-09 -3.47229801e-...,[ 0.00000000e+00 1.58402017e-09 -3.44027720e-...,0.010572,0.000000,1
...,...,...,...,...,...
115767,[ 0.00000000e+00 7.44388911e-10 -2.46969985e-...,[ 0.00000000e+00 1.61757497e-09 -2.54066730e-...,0.071118,0.000361,0
115768,[ 0.00000000e+00 1.60458108e-09 -2.47747157e-...,[ 0.00000000e+00 1.74989379e-09 -2.44185161e-...,0.020182,0.000090,0
115769,[ 0.00000000e+00 1.73603686e-09 -1.78714268e-...,[ 0.00000000e+00 1.55962921e-09 -1.72207712e-...,0.094183,0.002255,0
115770,[ 0.00000000e+00 8.69509109e-10 -3.88770566e-...,[ 0.00000000e+00 1.86416701e-09 -3.56531680e-...,0.002883,0.000180,0
