# 27Al CQ prediction with SOAP features

In [None]:
import json
import pandas as pd
import numpy as np
import seaborn as sns
from pymatgen.core.structure import Structure as ST
from src.soap_utilities import getXY, get_species

from scipy import sparse

### Preprocess the data and generate SOAP features
------------------------------------------------------------

In [None]:
# Report each matching warning a single time instead of on every occurrence
import warnings

warnings.simplefilter(action="once")

In [None]:
# Load the processed records from disk, then turn every serialized
# "structure" entry back into a pymatgen Structure object (in place).
with open("./data/processed_data_0.5.json", "r") as json_handle:
    data_reload = json.load(json_handle)
for record in data_reload:
    serialized_structure = record["structure"]
    record["structure"] = ST.from_dict(serialized_structure)
print("length of data set is:", len(data_reload))

In [None]:
# Collect the species present in the data set.
# NOTE(review): `compositions` is not used by the visible cells — confirm
# whether it is still needed.
species_list, compositions = get_species(data_reload)
species_count = len(species_list)
print("num of species in the data set:", species_count)

In [None]:
# Build the feature matrix X and the target table y from the loaded records.
# NOTE(review): the meaning of `loc` is defined by getXY — confirm before use.
X, y, loc = getXY(data_reload, species_list)
print(*(table.shape for table in (X, y)))

In [None]:
# Persist the sparse feature matrix and the "nmr" target column so that the
# later sections can start from disk instead of recomputing the features.
nmr_targets = y[["nmr"]]
sparse.save_npz("./data/soap_X.npz", X)
nmr_targets.to_csv("./data/soap_y.csv")

### PCA of X
----------------------------------------------

In [None]:
# Read the cached sparse feature matrix back from disk
X = sparse.load_npz(file="./data/soap_X.npz")

In [None]:
# Reduce the sparse feature matrix to N components.
# TruncatedSVD accepts sparse input directly; unlike sklearn's PCA it does
# not centre the data first, so this is "PCA" only in the loose sense.
from sklearn.decomposition import TruncatedSVD

N = 35  # number of components kept
pca = TruncatedSVD(n_components=N, n_iter=8, random_state=20)
X_pca_fit = pca.fit(X)  # fit() returns the fitted estimator itself
X_pca = X_pca_fit.transform(X)

# explained_variance_ratio_ holds fractions in [0, 1]; the ".2%" format
# spec multiplies by 100 so the printed number really is a percentage.
print(f"PCA covers {sum(X_pca_fit.explained_variance_ratio_):.2%} of variance")
print(f"PCA done! New shape {X_pca.shape}")

In [None]:
# Store the reduced feature matrix as a comma-separated text file
np.savetxt(fname="./data/soap_X_pca.csv", X=X_pca, delimiter=",")

### Training and testing the random forest model
----------------------------------------------------

In [None]:
# Start this section from the cached files: the reduced features as a plain
# array and the targets as a one-column ("nmr") DataFrame.
X_pca = np.loadtxt(fname="./data/soap_X_pca.csv", delimiter=",")
target_table = pd.read_csv("./data/soap_y.csv")
y = target_table[["nmr"]]

In [None]:
# Train test split
from sklearn.model_selection import train_test_split

# The model is fitted on the magnitude of CQ, so the sign is dropped first
y["nmr"] = y["nmr"].abs()

# Hold out 20% of the samples; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    random_state=20,
    test_size=0.2,
)

print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

In [None]:
%%time
# Randomized hyper-parameter search for the random forest regressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import math

# Base random forest; the hyper-parameters listed in `param` are sampled
# by the search, the ones given here stay fixed.
model = RandomForestRegressor(random_state=10, min_samples_split=4, min_samples_leaf=2)

param = {
    "n_estimators": randint(low=100, high=500),
    "max_depth": randint(low=10, high=100),
    "max_features": ["sqrt", "log2"],
}

grid = RandomizedSearchCV(
    estimator=model,
    param_distributions=param,
    n_iter=10,
    scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "r2"],
    refit="r2",  # the candidate with the best mean CV R2 is refitted
    cv=5,
    n_jobs=8,
    # Seed the parameter sampling so the same 10 candidates are drawn on
    # every run, like the seeded split and model elsewhere in this notebook.
    random_state=20,
)

grid.fit(X_train, y_train["nmr"])

# Summarize the results of the search.
# All three metrics are read at best_index_, i.e. for the candidate that
# became best_estimator_. Taking the maximum of each metric column
# independently could mix numbers from different candidates.
best_idx = grid.best_index_
cv_results = grid.cv_results_
train_r2 = cv_results["mean_test_r2"][best_idx]
# Square root of the fold-averaged MSE (not the average of per-fold RMSEs)
train_RMSE = math.sqrt(-cv_results["mean_test_neg_mean_squared_error"][best_idx])
train_MAE = -cv_results["mean_test_neg_mean_absolute_error"][best_idx]

# These are mean scores on the held-out CV folds, not scores on the data
# the model was fitted to, hence the "cross-validation" label.
print(
    "cross-validation score: R2 = {}, RMSE = {}, MAE = {}".format(
        train_r2, train_RMSE, train_MAE
    )
)
print(grid.best_estimator_)

In [None]:
%%time
# Score the refitted search result on the held-out test split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from src.Utility import reg_plot

sns.set()

y_rf = grid.predict(X_test)
cq_reference = y_test["nmr"]

test_r2 = r2_score(cq_reference, y_rf)
test_MAE = mean_absolute_error(cq_reference, y_rf)
test_RMSE = math.sqrt(mean_squared_error(cq_reference, y_rf))

print("test scores: R2 = {}, RMSE = {}, MAE = {}".format(test_r2, test_RMSE, test_MAE))


# Reference-versus-predicted plot via the project helper
x_axis_label = "VASP calculated CQ(MHz)"
y_axis_label = "Random forest predicted CQ (MHz)"
reg_plot(cq_reference, y_rf, x_axis_label, y_axis_label)

### Build learning curve based on sample size

In [None]:
# Start the learning-curve section from the cached files.
# The feature file has no header row, so its columns get integer labels.
X_pca = pd.read_csv("./data/soap_X_pca.csv", header=None)

# Keep only the target column, take its magnitude and expose it as "CQ"
y = (
    pd.read_csv("./data/soap_y.csv")[["nmr"]]
    .abs()
    .rename(columns={"nmr": "CQ"})
)

In [None]:
# Put target and features side by side, then draw random subsets holding
# 10%, 20%, ..., 100% of the rows (same seed for every draw).
whole_dataset = pd.concat([y, X_pca], axis=1)

small_sets = [
    whole_dataset.sample(frac=tenth / 10, random_state=20)
    for tenth in range(1, 11)
]

In [None]:
# Build learning curve
# RandomForestRegressor is imported here as well so this section can run
# from the cached files without executing the tuning cells first.
from sklearn.ensemble import RandomForestRegressor
from src.Utility import learning_curve_samplesize

# NOTE(review): these hyper-parameters are fixed by hand, presumably taken
# from an earlier search run — confirm they match the tuned model.
model = RandomForestRegressor(
    random_state=10,
    min_samples_split=4,
    min_samples_leaf=2,
    max_depth=50,
    n_estimators=500,
    max_features="sqrt",
)
# The feature columns are labelled 0..n-1 because the CSV was read with
# header=None. Derive n from the data instead of hard-coding 35 so the
# cell keeps working when the number of components changes.
feature_names = list(range(X_pca.shape[1]))
learning_curve_dict = learning_curve_samplesize(model, small_sets, feature_names)
pd.DataFrame(learning_curve_dict)

In [None]:
# Write the learning-curve results to disk as a CSV table
learning_curve_table = pd.DataFrame(learning_curve_dict)
learning_curve_table.to_csv("./data/soap_learning_curve_samplesize.csv")