In [None]:
# import libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import cdist


# Step One: Graphing and Visualization

In [None]:
# Load the raw dataset into a pandas DataFrame; later cells read it as `original_df`.
original_df = pd.read_csv('data.csv')

# Preview only the first rows instead of rendering the entire frame.
original_df.head()

In [None]:
# Scatter plot of every row's X coordinate against its Y coordinate.
fig, ax = plt.subplots()
ax.scatter(original_df['X'], original_df['Y'])
ax.set(xlabel='X', ylabel='Y', title='Scatter Plot')

plt.show()

In [None]:
# Same X/Y scatter, with each marker colored by that row's SCFshift value.
fig, ax = plt.subplots()
shift_markers = ax.scatter(
    original_df['X'],
    original_df['Y'],
    c=original_df['SCFshift'],
    cmap='viridis',
    alpha=0.8,
)
ax.set(xlabel='X', ylabel='Y', title='Scatter Plot with Gradient')

# The colorbar maps marker color back to the SCFshift magnitude.
fig.colorbar(shift_markers, ax=ax, label='SCFshift')

plt.show()


# Step Two: Polynomial Regression Model

In [None]:
# Absolute correlation between SCFshift and every numeric column of the dataset.
# Stored as `scf_correlations` rather than `list`, so the builtin `list` type is
# not shadowed for the remainder of the notebook session.
scf_correlations = original_df.corr(numeric_only=True).abs()[["SCFshift"]]
scf_correlations

In [None]:
# Columns of the raw dataset that are used as model features.
selected_columns = ['G3-2 R_cut = 8', 'G3-2 R_cut = 9', 'G3-2 R_cut = 7', 'G3-2 R_cut = 10', 'G3-1 R_cut = 7']

# Work on an explicit copy: assigning into a column selection of `original_df`
# triggers pandas' SettingWithCopyWarning and may or may not write through.
selected_features = original_df[selected_columns].copy()

# Min-max scale every selected column to the [0, 1] range in one vectorized step.
# A column whose minimum equals its maximum produces NaN (0 / 0).
column_min = selected_features.min()
column_max = selected_features.max()
normalized_df = (selected_features - column_min) / (column_max - column_min)

normalized_df.head()

# Step Three: Random Clustering

To answer these questions using clustering, you can perform the following steps:

* Select a subset of points from your list randomly or use an initial set of diverse points.  
* Apply a clustering algorithm (e.g., k-means, DBSCAN) to cluster these points based on their features.  
* Analyze the clusters and select representative points from each cluster.  
* Perform DFT calculations for the selected representative points and compute the corresponding adsorption energies.  
* Fit a linear regression model using the computed energies and the features of the representative points.  
* Evaluate the model's performance metrics (e.g., RMSE, R-squared) to assess its accuracy. 

In [None]:
import warnings
warnings.filterwarnings("ignore")

def calculate_clusters(df, n_clusters, random_state=0):
    """Cluster the rows of ``df`` with k-means.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; every column is used as a clustering dimension.
    n_clusters : int
        Number of clusters to fit.
    random_state : int or None, default 0
        Seed forwarded to ``KMeans`` so repeated runs of the notebook give the
        same clusters. Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    labels : numpy.ndarray
        Cluster label assigned to each row of ``df``.
    points : numpy.ndarray
        The values of ``df`` as an array, in row order.
    centroids : numpy.ndarray
        Cluster centers of the fitted model.
    kmeans : sklearn.cluster.KMeans
        The fitted estimator.
    """
    points = df.to_numpy()
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = kmeans.fit_predict(points)
    return labels, points, kmeans.cluster_centers_, kmeans

# Cluster the normalized features into 17 groups. The four results are kept as
# notebook-level variables; the next cell reads `centroids` and `points`.
# NOTE(review): 17 is a hard-coded cluster count - confirm how it was chosen.
labels, points, centroids, kmeans = calculate_clusters(normalized_df, 17)

In [None]:
# For every k-means centroid, find the real data point that lies closest to it.
# cdist returns an (n_centroids, n_points) matrix of Euclidean distances, so the
# row-wise argmin is the position of each centroid's nearest sample.
closest_indices = cdist(centroids, points).argmin(axis=1).tolist()

# Normalized feature rows of those representative points.
closest_points = normalized_df.iloc[closest_indices]

# Copy them into a standalone DataFrame with the same columns.
closest_points_df = pd.DataFrame(closest_points, columns=normalized_df.columns)

# SCFshift values of the same rows, selected by column label so the lookup
# agrees with `regression_model`, which also reads original_df['SCFshift'].
closest_scf = original_df['SCFshift'].iloc[closest_indices]

# Single-column DataFrame form of the selected SCFshift values.
closest_scf_df = closest_scf.to_frame()

In [None]:
def regression_model(X_value, y_value, _degree):
    """Fit a polynomial regression on the given rows and score it on all other rows.

    The function reads the notebook-level ``normalized_df`` (features) and
    ``original_df`` (which holds the ``'SCFshift'`` target). Every row of
    ``normalized_df`` whose index label is not in ``X_value`` is used as test data.

    Parameters
    ----------
    X_value : pandas.DataFrame
        Training features; its index labels must come from ``normalized_df``.
    y_value : pandas.DataFrame or pandas.Series
        Training targets, in the same row order as ``X_value``.
    _degree : int
        Degree of the polynomial feature expansion.

    Returns
    -------
    float
        Mean absolute error of the predictions on the held-out rows.
    """
    X_train = X_value
    y_train = y_value

    # Hold out every row that was not used for training. Selecting both the test
    # features and the test targets by the same index labels keeps them aligned,
    # which value-based `isin` / `dropna` masking does not guarantee.
    X_test = normalized_df.drop(index=X_train.index)
    y_test = original_df.loc[X_test.index, ['SCFshift']]

    # Expand the features into polynomial terms; fit on train, reuse on test.
    poly_features = PolynomialFeatures(degree=_degree)
    X_train_poly = poly_features.fit_transform(X_train)
    X_test_poly = poly_features.transform(X_test)

    # Ordinary least squares on the expanded features.
    poly_regression_model = LinearRegression()
    poly_regression_model.fit(X_train_poly, y_train)

    # Predict the held-out rows and report the mean absolute error.
    y_pred = poly_regression_model.predict(X_test_poly)
    return mean_absolute_error(y_test, y_pred)

# Polynomial degree used for every fit in the sweep. Passing the cluster count
# as the degree would change the model complexity together with the training-set
# size, so the error curve could not be attributed to the number of clusters.
# NOTE(review): 2 is an assumed value - adjust if another degree is intended.
POLY_DEGREE = 2

# Cluster counts tried and the error obtained for each of them.
cluster_values = []
error_values = []

for num_clusters in range(2, 16):
    labels, points, centroids, kmeans = calculate_clusters(normalized_df, num_clusters)

    # Position of the data point nearest to each centroid (Euclidean distance).
    closest_indices = cdist(centroids, points).argmin(axis=1).tolist()
    print(closest_indices)

    # Normalized features of those representative points, as a standalone frame.
    closest_points = normalized_df.iloc[closest_indices]
    closest_points_df = pd.DataFrame(closest_points, columns=normalized_df.columns)

    # Matching SCFshift values, selected by label as in `regression_model`.
    closest_scf = original_df['SCFshift'].iloc[closest_indices]
    closest_scf_df = closest_scf.to_frame()

    # Train on the representative points and score on the remaining rows.
    error = regression_model(closest_points_df, closest_scf_df, POLY_DEGREE)

    cluster_values.append(num_clusters)
    error_values.append(error)

# Line plot of the recorded error for each cluster count.
fig, ax = plt.subplots()
ax.plot(cluster_values, error_values)
ax.set(
    xlabel='Number of Clusters',
    ylabel='Error',
    title='Error vs Number of Clusters',
)
plt.show()

# Step Four: Alternate Algorithms

In [None]:
from sklearn.metrics import accuracy_score

def calculate_wcss(centroid, data_points):
    """Return the sum of squared distances from ``data_points`` to ``centroid``.

    ``data_points`` is a 2-D collection with one point per row; ``centroid`` is
    subtracted from every row before squaring.
    """
    squared_offsets = np.square(data_points - centroid)
    # Squared distance of each individual point (sum over its coordinates).
    per_point = np.sum(squared_offsets, axis=1)
    return np.sum(per_point)

X_values = normalized_df

# Cluster counts to evaluate: 14 values, the same range as the regression sweep.
wcss_cluster_counts = range(2, 16)

# One total within-cluster sum of squares per evaluated cluster count.
# (`np.array()` and `calculate_wcss()` both require arguments, so the array is
# preallocated and each entry is computed from an actual clustering.)
cluster_accuracy_per_iteration = np.zeros(len(wcss_cluster_counts))

for clusterNum, wcss_n_clusters in enumerate(wcss_cluster_counts):
    # Separate variable names keep the notebook-level `labels`, `points` and
    # `centroids` from earlier cells untouched.
    wcss_labels, wcss_points, wcss_centroids, _ = calculate_clusters(X_values, wcss_n_clusters)

    # Total WCSS = sum over clusters of the squared distances of the cluster's
    # members to their own centroid.
    cluster_accuracy_per_iteration[clusterNum] = sum(
        calculate_wcss(wcss_centroid, wcss_points[wcss_labels == cluster_id])
        for cluster_id, wcss_centroid in enumerate(wcss_centroids)
    )

# Prediction Model

# Step One: Graphing and Visualization

In [None]:
# Scatter plot of every row's X coordinate against its Y coordinate.
fig, ax = plt.subplots()
ax.scatter(original_df['X'], original_df['Y'])
ax.set(xlabel='X', ylabel='Y', title='Scatter Plot')

plt.show()

# Step Two: Polynomial Regression Model

In [None]:
# Columns of the raw dataset that are used as model features.
selected_columns = ['G3-2 R_cut = 8', 'G3-2 R_cut = 9', 'G3-2 R_cut = 7', 'G3-2 R_cut = 10', 'G3-1 R_cut = 7']

# Work on an explicit copy: assigning into a column selection of `original_df`
# triggers pandas' SettingWithCopyWarning and may or may not write through.
selected_features = original_df[selected_columns].copy()

# Min-max scale every selected column to the [0, 1] range in one vectorized step.
# A column whose minimum equals its maximum produces NaN (0 / 0).
column_min = selected_features.min()
column_max = selected_features.max()
normalized_df = (selected_features - column_min) / (column_max - column_min)

normalized_df.head()

To answer these questions using clustering, you can perform the following steps:

* Select a subset of points from your list randomly or use an initial set of diverse points.  
* Apply a clustering algorithm (e.g., k-means, DBSCAN) to cluster these points based on their features.  
* Analyze the clusters and select representative points from each cluster.  
* Perform DFT calculations for the selected representative points and compute the corresponding adsorption energies.  
* Fit a linear regression model using the computed energies and the features of the representative points.  
* Evaluate the model's performance metrics (e.g., RMSE, R-squared) to assess its accuracy. 

In [None]:
# NOTE(review): `centroids` and `points` are not defined in this cell; it reuses
# whatever an earlier clustering cell left in the notebook namespace.

# For every k-means centroid, find the real data point that lies closest to it.
# cdist returns an (n_centroids, n_points) matrix of Euclidean distances, so the
# row-wise argmin is the position of each centroid's nearest sample.
closest_indices = cdist(centroids, points).argmin(axis=1).tolist()

# Normalized feature rows of those representative points.
closest_points = normalized_df.iloc[closest_indices]

# Copy them into a standalone DataFrame with the same columns.
closest_points_df = pd.DataFrame(closest_points, columns=normalized_df.columns)

# SCFshift values of the same rows, selected by column label so the lookup
# agrees with `regression_model`, which also reads original_df['SCFshift'].
closest_scf = original_df['SCFshift'].iloc[closest_indices]

# Single-column DataFrame form of the selected SCFshift values.
closest_scf_df = closest_scf.to_frame()

# Step Four: Alternate Algorithms