In [93]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import scipy as sc
import keras
import tensorflow as tf
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, PowerTransformer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, MaxAbsScaler
from keras import Sequential
from xgboost import XGBRegressor

In [94]:
df_std = pd.read_csv('data/df_std.csv', index_col=0)
df_std.head()

Unnamed: 0,Rbin,std,v_index,npt,v
0,67600,374.910225,0,2.0,0.7
1,67100,265.60146,0,2.0,0.7
2,66900,,0,1.0,0.7
3,67400,1161.74882,0,2.0,0.7
4,67000,,0,1.0,0.7


In [323]:
# Plot R_delta_std vs R_bin
fig = go.Figure()

# for val in (1, 1.4, 1.8):
for val in df_std['v'].unique():
    data = df_std.loc[df_std['v']==val]
    fig.add_trace(go.Scatter(y = data['std'],
                             x = data['Rbin'],
                             mode='markers',
                             name=f'{val}V'))
fig.update_layout(
    height=600,
    font_size=18,
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin (\u03A9)",
    yaxis_title="\u0394R\u03C3 (\u03A9)",
    xaxis_range=[60000,250000],
    yaxis_range=[-1000,10000],
)

In [96]:
data_std = df_std.dropna()
data_std.head()

Unnamed: 0,Rbin,std,v_index,npt,v
0,67600,374.910225,0,2.0,0.7
1,67100,265.60146,0,2.0,0.7
3,67400,1161.74882,0,2.0,0.7
6,68600,335.748663,0,2.0,0.7
7,69000,568.099532,0,2.0,0.7


In [97]:
# data_mean = data_std.astype(float)
# X_train, X_test, Y_train, Y_test = train_test_split(data_std[['Rbin', 'v', 'npt']],
#                                                     data_std['std'], test_size=0.2, random_state=42)

In [238]:
vval = 1.45
# Prepare data of R_delta_std for training and testing
data_std = data_std.astype(float)
X_train = data_std.loc[(data_std['v']<vval), ['Rbin', 'v', 'npt']]
X_test = data_std.loc[(data_std['v']>=vval), ['Rbin', 'v', 'npt']]
Y_train = data_std.loc[(data_std['v']<vval), 'std']
Y_test = data_std.loc[(data_std['v']>=vval), 'std']

In [239]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)

In [240]:
pf = Pipeline(steps=[
    ("scaler", StandardScaler()),             # Scale the transformed features
    ("poly1", PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)),  # Apply polynomial transformation
])

ct = ColumnTransformer(transformers=[
    ("num", pf, ["Rbin", "v", "npt"]),
])

In [241]:
x_train_pf = ct.fit_transform(x_train)
x_val_pf = ct.transform(x_val)
X_test_pf = ct.transform(X_test)

In [242]:
# # Model for R_delta_std
# model = XGBRegressor(n_estimators=1000, max_depth=9, eta=0.02, subsample=0.2,
#                      eval_metric="logloss", early_stopping_rounds=20)
# model.fit(x_train_pf, y_train, eval_set=[(x_val_pf, y_val)], verbose=True)
# y_val_pred = model.predict(x_val_pf)

In [314]:
# Weighted Linear Regression
model0 = LinearRegression(fit_intercept=True)
model0.fit(x_train_pf, y_train)
y_train_pred = model0.predict(x_train_pf)
model0.score(x_train_pf, y_train)

residuals = y_train - y_train_pred
residual_variances = np.var(residuals)
weights = 1/abs(residuals)

model = sm.WLS(y_train, x_train_pf, weights=weights)
model = model.fit()
model.rsquared

0.9234069913655102

In [305]:
model = LinearRegression(fit_intercept=True)
model.fit(x_train_pf, y_train)
model.score(x_train_pf, y_train)

0.31641145775180834

In [296]:
model = MLPRegressor(random_state=42, max_iter=200, solver='adam', learning_rate='adaptive', alpha=0.2, early_stopping=True, n_iter_no_change=20)
model = model.fit(x_train_pf, y_train)
model.score(x_train_pf, y_train)


Stochastic Optimizer: Maximum iterations (200) reached and the optimization hasn't converged yet.



0.2241828819123879

In [315]:
# Validation score
y_val_pred = model.predict(x_val_pf)
r2_score(y_val, y_val_pred)

0.32292869905151456

In [316]:
# Test score
y_test_pred = model.predict(X_test_pf)
r2_score(Y_test, y_test_pred)

0.2950509739605035

In [317]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(y = Y_test,
#                          x = X_test['Rbin'],
#                          marker=dict(color='blue'),
#                          mode='markers',
#                          name=f'{X_test['v'].mean()}V'
#                          ))
# fig.add_trace(go.Scatter(y = y_test_pred,
#                          x = X_test['Rbin'],
#                          line=dict(color='red'),
#                          mode='markers',
#                          name='Prediction'
#                          ))
# fig.update_layout(
# title="\u0394R\u03C3 vs Rbin",
# xaxis_title="Rbin",
# yaxis_title="\u0394R\u03C3"
# )

In [318]:
plot_df = pd.DataFrame()
# test data
plot_df['v'] = X_test['v']
plot_df['Rbin'] = X_test['Rbin']
plot_df['Y_test'] = Y_test
plot_df['y_test_pred'] = y_test_pred
plot_df = plot_df.sort_values('v')
plot_df.head()

Unnamed: 0,v,Rbin,Y_test,y_test_pred
6422,1.45,116700.0,1577.352056,2126.118365
6848,1.45,169400.0,2649.796834,2930.473078
6849,1.45,113500.0,2067.055422,2142.581532
6850,1.45,119800.0,1258.022526,2114.175652
6851,1.45,122700.0,892.330088,2110.76837


In [329]:
colours = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
           '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
           ]
fig = go.Figure()

for val, col in zip(plot_df['v'].unique(), colours):
# for val, col in zip((1.5, 1.6, 1.7, 1.8), colours):    
    data = plot_df.loc[plot_df['v']==val]
    fig.add_trace(go.Scatter(y = data['Y_test'],
                             x = data['Rbin'],
                            #  marker=dict(color='medium turquoise'),
                             marker=dict(color=col),
                             mode='markers',
                             name=f'{val}V'
                             ))
    fig.add_trace(go.Scatter(y = data['y_test_pred'],
                             x = data['Rbin'],
                             line=dict(color='red'),
                             mode='markers',
                             name='prediction'
                             ))
fig.update_layout(
    font_size = 16,
    height = 600,
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin",
    yaxis_title="\u0394R\u03C3"
)

In [330]:
plot_df_val = pd.DataFrame()
# validation data
plot_df_val['v'] = x_val['v']
plot_df_val['Rbin'] = x_val['Rbin']
plot_df_val['y_val'] = y_val
plot_df_val['y_val_pred'] = y_val_pred

plot_df_val = plot_df_val.sort_values('v')

In [331]:
colours = ['#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
           '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A', '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52'
           ]
fig = go.Figure()

for val, col in zip(plot_df_val['v'].unique(), colours):
    data = plot_df_val.loc[plot_df_val['v']==val]
    fig.add_trace(go.Scatter(y = data['y_val'],
                             x = data['Rbin'],
                             marker=dict(color=col),
                             mode='markers',
                             name=f'{val}V'
                             ))
    fig.add_trace(go.Scatter(y = data['y_val_pred'],
                             x = data['Rbin'],
                             line=dict(color='red'),
                             mode='markers',
                             name=f'{val}V'
                             ))
fig.update_layout(
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin",
    yaxis_title="\u0394R\u03C3"
)

In [334]:
fig = go.Figure()

for val1, col1 in zip(plot_df_val['v'].unique(), colours):
    data1 = plot_df_val.loc[plot_df_val['v']==val1]
    # fig.add_trace(go.Scatter(y = data1['y_val'],
    #                          x = data1['Rbin'],
    #                          marker=dict(color='red'),
    #                          mode='markers',
    #                          name=f'{val1}V'
    #                          ))
    fig.add_trace(go.Scatter(y = data1['y_val_pred'],
                             x = data1['Rbin'],
                             marker=dict(color=col1),
                             mode='markers',
                             name=f'{val1}V'
                             ))
for val, col in zip(plot_df['v'].unique(), colours):
    data = plot_df.loc[plot_df['v']==val]
    # fig.add_trace(go.Scatter(y = data['Y_test'],
    #                          x = data['Rbin'],
    #                          marker=dict(color=col),
    #                          mode='markers',
    #                          name=f'{val}V'
    #                          ))
    fig.add_trace(go.Scatter(y = data['y_test_pred'],
                             x = data['Rbin'],
                             marker=dict(color='red'),
                             mode='markers',
                             name='prediction'
                             ))
fig.update_layout(
    title="\u0394R\u03C3 vs Rbin",
    xaxis_title="Rbin",
    yaxis_title="\u0394R\u03C3",
    yaxis_range=[0,10000],
)