In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import scipy as sc
import keras

In [2]:
# Load the per-bin table; the first CSV column becomes the row index.
std_csv_path = 'data/df_std.csv'
df_std = pd.read_csv(std_csv_path, index_col=0)
df_std.head()

Unnamed: 0,Rbin,std,v_index,npt,v
0,67600,374.910225,0,2.0,0.7
1,67100,265.60146,0,2.0,0.7
2,66900,,0,1.0,0.7
3,67400,1161.74882,0,2.0,0.7
4,67000,,0,1.0,0.7


In [3]:
# Plot R_delta_std vs R_bin, one marker series per distinct value of 'v'.
fig = go.Figure()

for val in df_std['v'].unique():
    data = df_std.loc[df_std['v'] == val]
    # y is the 'std' column: the title and y-axis label both announce
    # ΔRσ, so plotting 'npt' here would show point counts under that label.
    fig.add_trace(go.Scatter(y=data['std'],
                             x=data['Rbin'],
                             mode='markers',
                             name=f'{val}V'))
# update_layout returns the figure, so leaving it as the last expression
# also renders the plot in the notebook.
fig.update_layout(
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin",
    yaxis_title="\u0394R\u03C3"
)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras import Sequential
from xgboost import XGBRegressor

In [9]:
# Keep only fully populated rows: a NaN in any column removes the row.
data_std = df_std.dropna(axis=0, how='any')
data_std.head()

Unnamed: 0,Rbin,std,v_index,npt,v
0,67600,374.910225,0,2.0,0.7
1,67100,265.60146,0,2.0,0.7
3,67400,1161.74882,0,2.0,0.7
6,68600,335.748663,0,2.0,0.7
7,69000,568.099532,0,2.0,0.7


In [10]:
# data_mean = data_std.astype(float)
# X_train, X_test, Y_train, Y_test = train_test_split(data_std[['Rbin', 'v', 'npt']],
#                                                     data_std['std'], test_size=0.2, random_state=42)

In [22]:
# Prepare data of R_delta_std for training and testing.
# Rows with v below the threshold train the model; rows at or above it
# form the test set.
vval = 1.5
feature_cols = ['Rbin', 'v', 'npt']

data_std = data_std.astype(float)
below_threshold = data_std['v'] < vval
at_or_above_threshold = data_std['v'] >= vval

X_train = data_std.loc[below_threshold, feature_cols]
Y_train = data_std.loc[below_threshold, 'std']
X_test = data_std.loc[at_or_above_threshold, feature_cols]
Y_test = data_std.loc[at_or_above_threshold, 'std']

In [23]:
# Carve a 20% validation set out of the training rows; the fixed
# random_state keeps the split repeatable across runs.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.2,
    random_state=42,
)

In [87]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

# Standardise the features; the scaler is fitted on the training split only
# and then reused unchanged for the validation and test splits.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (x_val, X_test)
)

# Expand the scaled features with degree-2 polynomial terms, again fitting
# on the training split and only transforming the other two.
pf = PolynomialFeatures(degree=2)
x_train_pf = pf.fit_transform(x_train_scaled)
x_val_pf, X_test_pf = (
    pf.transform(split) for split in (x_val_scaled, X_test_scaled)
)

In [88]:
# # Model for R_delta_std
# model = XGBRegressor(n_estimators=1000, max_depth=9, eta=0.02, subsample=0.2,
#                      eval_metric="logloss", early_stopping_rounds=20)
# model.fit(x_train_pf, y_train, eval_set=[(x_val_pf, y_val)], verbose=True)
# y_val_pred = model.predict(x_val_pf)

In [115]:
# Weighted Linear Regression
# Each training row is weighted by log(npt), taken from the unscaled
# training features so the weights follow the raw 'npt' values.
weights = np.log(x_train['npt'])

# log() is -inf at npt == 0 and negative for npt < 1; such values are not
# usable as sample weights, so fail loudly instead of fitting on them.
assert np.isfinite(weights).all() and (weights >= 0).all(), \
    "npt must be >= 1 for log(npt) sample weights"

# Single fit: the model is trained directly with the sample weights on the
# polynomial-expanded training features.
model = LinearRegression().fit(x_train_pf, y_train, sample_weight=weights)
weights

2518    4.110874
6398    1.098612
6214    0.693147
4080    4.219508
539     3.258097
          ...   
4197    4.262680
5786    2.079442
5821    2.564949
5987    4.158883
955     1.386294
Name: npt, Length: 5112, dtype: float64

In [None]:
# model = LinearRegression()
# model.fit(x_train_pf, y_train)

In [116]:
# Predict on the validation split from its polynomial-expanded features.
y_val_pred = model.predict(x_val_pf)

In [117]:
# Model scores on the validation split: the estimator's own score() and the
# mean squared error of the predictions made above.
r_score = model.score(x_val_pf, y_val)
mse = mean_squared_error(y_val, y_val_pred)
print(f"mse: {mse}", f"r2: {r_score}", sep="\n")

mse: 280657.2338271897
r2: 0.4049080504754362


In [118]:
# Test score: predict on the held-out split and report the estimator's score().
y_test_pred = model.predict(X_test_pf)
test_r2 = model.score(X_test_pf, Y_test)
print(f"r2: {test_r2}")

r2: 0.22704211575329358


In [119]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(y = Y_test,
#                          x = X_test['Rbin'],
#                          marker=dict(color='blue'),
#                          mode='markers',
#                          name=f'{X_test['v'].mean()}V'
#                          ))
# fig.add_trace(go.Scatter(y = y_test_pred,
#                          x = X_test['Rbin'],
#                          line=dict(color='red'),
#                          mode='markers',
#                          name='Prediction'
#                          ))
# fig.update_layout(
# title="\u0394R\u03C3 vs Rbin",
# xaxis_title="Rbin",
# yaxis_title="\u0394R\u03C3"
# )

In [120]:
# Gather the test inputs, targets and predictions into one frame ordered by v.
# Y_test is aligned on the index; y_test_pred is attached positionally.
plot_df = (
    X_test[['v', 'Rbin']]
    .assign(Y_test=Y_test, ypred=y_test_pred)
    .sort_values('v')
)
plot_df.head()

Unnamed: 0,v,Rbin,Y_test,ypred
7126,1.5,117200.0,1055.798813,2251.165231
7540,1.5,187200.0,4341.889753,3053.114692
7541,1.5,184400.0,3493.149812,3042.551373
7542,1.5,186000.0,2947.23771,3115.316476
7543,1.5,186700.0,3664.463238,3050.872814


In [121]:
# Ten-colour palette for the measured series. It is indexed modulo its length,
# so any number of distinct 'v' values gets a colour and no series is dropped
# when there are more values than colours.
colours = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
           '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52']
fig = go.Figure()

for idx, val in enumerate(plot_df['v'].unique()):
    data = plot_df.loc[plot_df['v'] == val]
    # Measured values for this v.
    fig.add_trace(go.Scatter(y=data['Y_test'],
                             x=data['Rbin'],
                             marker=dict(color=colours[idx % len(colours)]),
                             mode='markers',
                             name=f'{val}V'
                             ))
    # Model output for the same rows. The trace draws markers only, so the
    # colour is set on the marker, and the legend entry is named differently
    # from the measured series so the two can be told apart.
    fig.add_trace(go.Scatter(y=data['ypred'],
                             x=data['Rbin'],
                             marker=dict(color='red'),
                             mode='markers',
                             name=f'{val}V prediction'
                             ))
# update_layout returns the figure, so leaving it as the last expression
# also renders the plot in the notebook.
fig.update_layout(
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin",
    yaxis_title="\u0394R\u03C3"
)