# Code that may be useful in the future

In [None]:
# List bond angles

from src.local_features import NMR_local

# Take entry 17 of the filtered set and read its `first_neighbours` mapping.
# The mapping is later indexed by a site index, and each neighbour entry
# exposes a "site" key.
structure = structure_tensors_filtered[17]["structure"]
local_environment = NMR_local(structure)
first_coord_dict = local_environment.first_neighbours
first_coord_dict

import itertools
from pymatgen.util.coord import get_angle

# Angles (in degrees) subtended at the centre site by every unordered pair of
# its neighbours, listed from largest to smallest.
center_index = 2
first_neighbours = first_coord_dict[center_index]
center_coords = structure[int(center_index)].coords

site_combos = itertools.combinations(first_neighbours, 2)
angles_differences = sorted(
    (
        get_angle(
            first["site"].coords - center_coords,
            second["site"].coords - center_coords,
            units="degrees",
        )
        for first, second in site_combos
    ),
    reverse=True,
)
angles_differences

In [None]:
# chemical environment of a site

from pymatgen.analysis.chemenv.coordination_environments.coordination_geometry_finder import (
    LocalGeometryFinder,
)
from matminer.featurizers.site.fingerprint import ChemEnvSiteFingerprint
from pymatgen.analysis.chemenv.coordination_environments.chemenv_strategies import (
    MultiWeightsChemenvStrategy,
)

# Coordination-environment symbols the fingerprint is asked to score,
# grouped here by the number after the colon.
cetypes = [
    "TL:3", "TY:3", "TS:3",
    "T:4", "S:4", "SY:4", "SS:4",
    "PP:5", "S:5", "T:5",
    "O:6", "T:6", "PP:6",
]

# Geometry finder: centroid centering (central site included), no refinement.
lgf = LocalGeometryFinder()
lgf.setup_parameters(
    centering_type="centroid",
    include_central_site_in_centroid=True,
    structure_refinement=lgf.STRUCTURE_REFINEMENT_NONE,
)

chemenv_strategy = MultiWeightsChemenvStrategy.stats_article_weights_parameters()
envfingerprint = ChemEnvSiteFingerprint(cetypes, chemenv_strategy, lgf)

# Pair every feature label with its value for site 4 of structure 15.
labels = envfingerprint.feature_labels()
site_features = envfingerprint.featurize(
    structure_tensors_filtered[15]["structure"], 4
)
dict(zip(labels, site_features))

In [None]:
# Repeat the train/test split with ten different seeds and record the held-out
# R^2 of a randomly tuned random forest for each split. `r_result` maps the
# split seed to that test-set R^2.
r_result = {}
for n in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        normalized_x, y, test_size=0.2, random_state=n
    )

    # "max_ce" is excluded from the model inputs; drop it once per split.
    X_train_features = X_train.drop("max_ce", axis=1)
    X_test_features = X_test.drop("max_ce", axis=1)

    # Base random forest regressor whose hyper-parameters are tuned below.
    model = RandomForestRegressor(
        random_state=10, min_samples_split=4, min_samples_leaf=2
    )

    param = {
        "n_estimators": randint(low=100, high=500),
        # max_depth must be an integer, so draw from a discrete distribution
        # over 10..210 instead of the continuous uniform(10, 200).
        "max_depth": randint(low=10, high=211),
        # 1.0 (use all features) is what 'auto' meant for regressors; the
        # 'auto' spelling itself is rejected by current scikit-learn.
        "max_features": [1.0, "sqrt", "log2"],
    }

    grid = RandomizedSearchCV(
        estimator=model,
        param_distributions=param,
        n_iter=5,
        scoring=["neg_mean_absolute_error", "neg_mean_squared_error", "r2"],
        refit="r2",
        cv=5,
        n_jobs=8,
        # Seed the parameter sampling so each split's search is reproducible.
        random_state=n,
    )
    grid.fit(X_train_features, y_train["CQ"])

    # Score the refitted best estimator on the held-out split.
    y_rf = grid.predict(X_test_features)
    test_r2 = r2_score(y_test["CQ"], y_rf)

    r_result[n] = test_r2
r_result

In [None]:
## 4. Feature importance and feature selection

In [None]:
# Tabulate the importances of the best estimator found by the last search and
# show the ten largest.
model = grid.best_estimator_
feat_labels = X_train.drop("max_ce", axis=1).columns
importance = model.feature_importances_

feat_imp = pd.DataFrame(
    list(zip(feat_labels, importance)),
    columns=["feature name", "importance"],
)
feat_imp.sort_values("importance", ascending=False).head(10)

In [None]:
from sklearn.feature_selection import SelectFromModel
# Wrap the random forest regressor in a selector that uses an importance
# threshold of 0.15 to decide which features to keep.
sfm = SelectFromModel(model, threshold=0.15)

# Fit the selector on the training features (without "max_ce") and the CQ target.
sfm.fit(X_train.drop("max_ce", axis=1), y_train["CQ"])

# Print the label of every feature the selector kept, one per line.
selected_positions = sfm.get_support(indices=True)
for selected_name in feat_labels[selected_positions]:
    print(selected_name)