In [None]:
!pip install seaborn

In [None]:
!pip install --upgrade mp-api

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from mp_api.client import MPRester

In [None]:
# List the fields the summary endpoint offers, then fetch the summary documents.
with MPRester() as mpr:
    list_of_available_fields = mpr.summary.available_fields
    print(list_of_available_fields)
    # This query was commented out, which left `docs` undefined for the DataFrame
    # cell that follows on a fresh kernel. No filter criteria are passed, so this
    # requests the listed fields for every summary document and can take a while.
    docs = mpr.summary.search(fields = ['material_id', 'is_metal', 'nsites', 'nelements', 'volume', 'density', 'density_atomic', 'elements', 'composition', 'composition_reduced', 'formula_pretty', 'formula_anonymous', 'chemsys'])

In [None]:
# Column order of the DataFrame; each name is also the attribute read from every summary doc.
fields = ['nsites', 'nelements', 'volume', 'density', 'density_atomic', 'elements', 'composition', 'composition_reduced', 'formula_pretty', 'formula_anonymous', 'chemsys']
# One tuple per document, holding the attributes named in `fields`, in that order.
listOfTuples = [tuple(getattr(obj, name) for name in fields) for obj in docs]
# Assemble the rows into a DataFrame whose columns follow `fields`.
df = pd.DataFrame(data=listOfTuples, columns=fields)

In [None]:
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

In [None]:
# Keep only rows at or below each cut-off so extreme values do not dominate the histograms.
filtered_df_volume = df[df['volume'] <= 3000]
# An earlier `<= 7` assignment to this same name was overwritten immediately by this
# one (a dead store), so only the effective `<= 1500` filter is kept.
filtered_df_a_density = df[df['density_atomic'] <= 1500]

In [None]:
# 2x3 grid of histograms; every plot names its target axes explicitly.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
df['nsites'].plot(kind='hist', ax=axes[0, 0], title='nsites')
df['nelements'].plot(kind='hist', ax=axes[0, 1], title='nelements')
filtered_df_volume['volume'].plot(kind='hist', ax=axes[0, 2], title='volume')
df['density'].plot(kind='hist', ax=axes[1, 0], title='density')
df['density_atomic'].plot(kind='hist', ax=axes[1, 1], title='density_atomic')
# This call had no `ax=`, so its position depended on whichever axes happened to be
# current; it is now pinned to the remaining grid slot.
df['density_atomic'].plot(kind='hist', range=(df['density_atomic'].min(), 100), ax=axes[1, 2], title='density_atomic_focused')


In [None]:
# Draw on a figure created in this cell. `axes[1, 1]` belongs to the grid built in the
# previous cell and already holds the full-range 'density_atomic' histogram, so
# plotting there overlays two histograms on a figure that cell has already shown.
fig_focused, ax_focused = plt.subplots(figsize=(8, 6))
filtered_df_a_density['density_atomic'].plot(kind='hist', range=(df['density_atomic'].min(), 1000), ax=ax_focused, title='density_atomic_focused')
plt.show()

In [None]:
# One horizontal box plot per column, each on its own 8x6 figure.
# The first `plt.figure` call had been commented out, so the volume plot was the only
# one not drawn on a freshly sized figure; the loop gives all three the same setup.
for column, label in (('volume', 'Volume'),
                      ('density', 'Density'),
                      ('density_atomic', 'Atomic Density')):
    plt.figure(figsize=(8, 6))
    plt.boxplot(df[column], vert=False)
    plt.title(f'Box and Whisker Plot for {label}')
    plt.xlabel(label)
    plt.show()

In [None]:
# Print the column overview, then correlate the int64/float64 columns only.
df.info()
numerical_columns = df.select_dtypes(include=['int64', 'float64'])
correlation_matrix = numerical_columns.corr()

# Annotated heatmap of the pairwise correlations on an explicit 8x6 axes.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
    linewidths=.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

In [None]:

# Volume against density for every row of `df`, drawn on an explicit 10x6 axes.
scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='volume', y='density', ax=scatter_ax)
scatter_ax.set_title('Scatter Plot of Volume vs. Density')
scatter_ax.set_xlabel('Volume')
scatter_ax.set_ylabel('Density')
plt.show()

In [None]:
# Flatten the per-material element lists into one Series and count each entry.
exploded_list = df['elements'].explode()
value_counts_result = exploded_list.value_counts()
print(value_counts_result)

# Horizontal count plot, bars ordered by the counts computed above.
counts_fig, counts_ax = plt.subplots(figsize=(20, 20))
sns.countplot(
    y=exploded_list,
    order=value_counts_result.index,
    palette='viridis',
    ax=counts_ax,
)
counts_ax.set_xlabel('Count')
counts_ax.set_ylabel('Unique Values')
counts_ax.set_title('Value Counts of Exploded List')
plt.show()


In [None]:
# Request only the `is_stable` field of the summary documents (no filter criteria are passed).
# Relies on `mpr` having been bound by an earlier `with MPRester() as mpr:` cell.
symm_docs = mpr.summary.search(fields = ['is_stable'])

In [None]:
# Count how many of the fetched documents are flagged stable and plot the counts.
fieldsSym = ['is_stable']
listOfTuplesSymm = [obj.is_stable for obj in symm_docs]
# Single-column DataFrame built from the collected flags.
df_Symm = pd.DataFrame(listOfTuplesSymm, columns=fieldsSym)
explListSymm = df_Symm['is_stable'].explode()
order = explListSymm.value_counts()
print(order)

plt.figure(figsize=(10, 8))
order.plot(kind='bar', color='skyblue')
plt.xlabel('Is stable?')
# The bar heights are the value counts, so the y-axis is a count; it was
# previously labelled 'Unique Values'.
plt.ylabel('Count')
plt.title('Count of stable materials')
plt.show()

### Making CIF files from structures queried from the API

In [None]:
# Fetch the `structure` field for material mp-1077102; the result is indexed like a list below.
# Relies on `mpr` having been bound by an earlier `with MPRester() as mpr:` cell.
mp_1077102 = mpr.summary.search(material_ids=["mp-1077102"], fields=["structure"])

In [None]:
# Take the first document returned by the query above.
TmSn2=mp_1077102[0]
# NOTE(review): bare expression in the middle of the cell; its value is not displayed.
TmSn2.structure
# Write the structure in CIF format.
# NOTE(review): the output name "mp_1077102_cif" has no ".cif" extension; confirm that is intended.
TmSn2.structure.to(fmt="cif", filename="mp_1077102_cif")

### Using xtal2png to generate CrysTens representation

In [None]:
!pip install xtal2png
from xtal2png.utils.data import example_structures
from xtal2png.core import XtalConverter
xc = XtalConverter()
example_structures[1]

In [None]:
# Encode the example structures as PNGs (saved under "data") and decode them back.
xc = XtalConverter(save_dir="data") # DFT surrogate relaxation via m3gnet by default
data = xc.xtal2png(example_structures, save=True)
relaxed_decoded_structures = xc.png2xtal(data, save=False)

# NOTE(review): this stanza repeats the one above with identical arguments, so
# `decoded_structures` comes from the same configuration as `relaxed_decoded_structures`.
# If an unrelaxed decode was intended, the converter needs different options — confirm.
xc = XtalConverter(save_dir="data")
data = xc.xtal2png(example_structures, save=True)
decoded_structures = xc.png2xtal(data, save=False)

In [None]:
import glob, os
from PIL import Image

# Every PNG under data/ is enlarged to five times 64 pixels per side with box
# resampling, then its path is printed and the enlarged image displayed.
UPSCALED_SIZE = (64 * 5, 64 * 5)
for png_path in glob.glob("data/*.png"):
    with Image.open(png_path) as source_image:
        enlarged = source_image.resize(UPSCALED_SIZE, Image.BOX)
        print(png_path)
        display(enlarged)

In [None]:
!pip install ase nglview

In [None]:
from pymatgen.io.ase import AseAtomsAdaptor
from ase.visualize import view

# Convert each example structure to an ASE Atoms object and show it in the nglview widget.
aaa = AseAtomsAdaptor()
# Plain loop: the previous list comprehension was used only for its side effects and
# left a list of display() return values as the cell output.
for s in example_structures:
    display(view(aaa.get_atoms(s), viewer='ngl'))

### Getting Electron Charge Density data

Retrieving the Materials Project IDs of all structures with electron charge density data. Query the Materials Project AWS Open Data bucket from the AWS CLI with the command `aws s3 ls --no-sign-request s3://materialsproject-parsed/chgcars/ > MaterialIDWithChargeData.csv`. This retrieves the list of all materials that have electron charge density data and stores it in a CSV file.

#### Extracting the material IDs from the returned file

In [None]:
# Collect material IDs from the saved bucket listing: the ID is taken as the text
# before the first "." of the last whitespace-separated token on each line.
# NOTE(review): the NUL-stripping suggests the file may be UTF-16 read with the
# default encoding — confirm, and consider opening it with an explicit `encoding=`.
mp_ids = []
with open("MaterialIDWithChargeData.txt", "r") as file:
    for i, line in enumerate(file):
        if i < 100:
            print(line)  # preview of the first 100 raw lines

        tokens = line.split()
        if not tokens:
            # Blank line: nothing to parse (indexing tokens[-1] here raised IndexError).
            continue

        # Strip embedded NUL characters; an ID left empty after that is dropped.
        material_id = tokens[-1].split(".")[0].replace('\x00', '')
        if material_id != "":
            mp_ids.append(material_id)

print(mp_ids[:100])

#### Load the return file with the data about the files in Materials Project AWS S3 bucket, and sort the df by size, with the intention to fetch and use the smallest files to save space

In [None]:
import pandas as pd

# Bucket listing saved from the AWS CLI (see the markdown above).
input_file = 'MaterialIDWithChargeData.csv'

# Read the listing with explicit column names (the file has no header row).
# NOTE(review): it is parsed as tab-separated UTF-16 — confirm that matches how the
# listing was actually written.
MaterialIDWithChargeDataDF = pd.read_csv(input_file, sep='\t', header=None, names=['Date', 'Time', 'Size', 'Material_ID'], encoding='utf-16')

# Sort by file size, smallest first, so the cheapest files can be picked first.
MaterialIDWithChargeDataSortedAscendingDF = MaterialIDWithChargeDataDF.sort_values(by='Size', ascending=True)

# Show the smallest entries explicitly: a bare `.head()` that is not the last
# expression of the cell displays nothing.
display(MaterialIDWithChargeDataSortedAscendingDF.head())
# Number of entries in the listing (cell output).
len(MaterialIDWithChargeDataSortedAscendingDF)

#### Selecting which files to download so that their total size stays within the size budget (`goalSizeOfDB`, set in the next cell)

In [None]:
goalSizeOfDB = 3e9  # download budget in bytes (3e9 bytes = 3 GB)
FRAMES_PER_FILE = 64  # NOTE(review): number of images assumed per ECD file — confirm

total_size = 0
rows_to_include = []

# Walk the size-sorted listing and take files until the next one would push the
# running total past the budget. Zero-byte entries are skipped without stopping.
for index, size in MaterialIDWithChargeDataSortedAscendingDF['Size'].items():
    if size == 0:
        continue
    if not (total_size + size <= goalSizeOfDB):
        # Written as `not (<=)` so that a NaN size also ends the loop, as before.
        break
    total_size += size
    rows_to_include.append(index)

print(f"Number of ECD Files to download: {len(rows_to_include)}")
# Upper bound on the pictures available for the first generation experiments.
print(f"Number of Images availible: {len(rows_to_include) * FRAMES_PER_FILE}")

# Listing rows for the selected files only.
rowsInDataCapacityMaterialsWithChargeData = MaterialIDWithChargeDataSortedAscendingDF.loc[rows_to_include]
print(f"rowsInDataCapacityMaterialsWithChargeData len: {len(rowsInDataCapacityMaterialsWithChargeData)}")

# Material IDs to download: the file name up to its first ".".
Material_ID_Column = rowsInDataCapacityMaterialsWithChargeData['Material_ID']
IDsToDownloadDataFor = [x.split(".")[0] for x in Material_ID_Column]
print(IDsToDownloadDataFor[:100])
print(rowsInDataCapacityMaterialsWithChargeData)

# One `aws s3 cp` command per selected file, keyed by the full object name.
awsQueries = [f"aws s3 cp --no-sign-request s3://materialsproject-parsed/chgcars/{m_id} baseData/ecd/raw_ecd_data/{m_id}" for m_id in Material_ID_Column]
bash_script_path = "awsECDDataFetcher.sh"

# Write the commands to the bash script file, one per line.
with open(bash_script_path, "w") as bash_script:
    bash_script.writelines(f"{query}\n" for query in awsQueries)

### Main Code: Getting the CHGCAR files and extracting and saving the images

In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os

desired_image_size = (256, 256)
output_dir = 'ECDDATAEMERGENCY'
# Create the target folder up front so saving does not fail when it is missing.
os.makedirs(output_dir, exist_ok=True)

# For every selected material, save each sufficiently varied density frame as a
# 256x256 8-bit PNG. One client serves the whole loop instead of opening a new
# MPRester for every material.
with MPRester() as mpr:
    for mID in IDsToDownloadDataFor:
        chgcar_for_id = mpr.get_charge_density_from_material_id(mID)
        if chgcar_for_id is None:
            # Guard for the case where no charge density comes back for this ID.
            print(f"No charge density returned for {mID}; skipping")
            continue
        charge_density_total_x = chgcar_for_id.data['total']

        for x, density_frame in enumerate(charge_density_total_x):
            # Frames with a standard deviation of 2 or less are skipped; passing this
            # test also means max > min, so the scaling below cannot divide by zero.
            if np.std(density_frame) > 2:
                frame_min = np.min(density_frame)
                frame_max = np.max(density_frame)
                # Min-max scale to 0..255 and truncate to 8-bit.
                normalized_density = ((density_frame - frame_min) / (frame_max - frame_min)) * 255
                normalized_density = normalized_density.astype(np.uint8)

                resized_image = Image.fromarray(normalized_density).resize(desired_image_size, Image.LANCZOS)
                resized_image.save(f'{output_dir}/{mID}_frame_{x}.png')


In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

common_image_size = (100, 100)

# For every selected material, plot each sufficiently varied density frame at
# 100x100 and save the plot. Relies on `mpr` bound by an earlier cell.
for mID in IDsToDownloadDataFor:
    charge_density_total_x = mpr.get_charge_density_from_material_id(mID).data['total']

    for x, density_frame in enumerate(charge_density_total_x):
        # Passing this test also means max > min, so the scaling cannot divide by zero.
        if np.std(density_frame) > 2:
            # Min-max scale to 0..255 and truncate to 8-bit.
            normalized_density = ((density_frame - np.min(density_frame)) /
                                  (np.max(density_frame) - np.min(density_frame))) * 255
            normalized_density = normalized_density.astype(np.uint8)

            resized_image = Image.fromarray(normalized_density).resize(common_image_size, Image.LANCZOS)

            plt.imshow(resized_image, cmap='viridis')
            plt.axis('off')
            print(np.std(density_frame))
            # Save BEFORE show(): once the figure has been shown, savefig no longer
            # has the drawn image to write, which produced blank files.
            plt.savefig(f'{mID}_frame_{x}')
            plt.show()



In [None]:
from PIL import Image

# Size every preview frame is resized to (the same for all materials).
common_image_size = (100, 100)

print(f"Length of IDsToDownloadDataFor list: {len(IDsToDownloadDataFor)}")
# enumerate() gives the position directly; list.index() rescanned the list on every
# iteration and reports the first occurrence if an ID appears twice.
for position, mID in enumerate(IDsToDownloadDataFor):
    print(position)
    # Relies on `mpr` bound by an earlier cell.
    charge_density_total_x = mpr.get_charge_density_from_material_id(mID).data['total']
    print(len(charge_density_total_x))
    for x, density_frame in enumerate(charge_density_total_x):
        # Only frames that are not perfectly uniform are shown.
        frame_std = np.std(density_frame)
        if frame_std > 0:
            charge_density_image = Image.fromarray(density_frame)
            charge_density_image_resized = charge_density_image.resize(common_image_size, Image.LANCZOS)
            plt.imshow(charge_density_image_resized, cmap='viridis')
            plt.show()
            print(frame_std)

    

A bash script is created which downloads all the .gz ECD files from the AWS S3 Bucket. We then use `tar -zxvf *.gz` from the terminal in the folder containing these files to unzip them and reveal the `.json` files that actually contain the data

#### Trying to save the VASP files so that images can be created from them later

In [None]:
# Fetch the charge density object for mp-149 (relies on `mpr` bound by an earlier cell).
chgcar = mpr.get_charge_density_from_material_id("mp-149")
# `charge_density` was used here without being assigned in any visible cell, which
# raises NameError on a fresh kernel. Bind it to the fetched object so this write,
# and the later cells that read `charge_density`, work.
charge_density = chgcar
charge_density.write_file("mp-149_chgcar.vasp")

In [None]:
# Print the text representation of the object fetched in the previous cell.
print(chgcar)

In [None]:
# Request only the `material_id` field of every summary document matching `is_stable=True`.
# Relies on `mpr` having been bound by an earlier `with MPRester() as mpr:` cell.
stableMaterialsIDDocs = mpr.summary.search(is_stable=True, fields = ["material_id"])

In [None]:
# Material IDs as plain strings; str() already yields a new string, so the extra
# `[:]` slice is not needed.
StableMaterialsIDList = [str(x.material_id) for x in stableMaterialsIDDocs]
# Show a short sample rather than dumping the entire list into the cell output.
StableMaterialsIDList[:10]

In [None]:
# Numeric part of each ID: the text after the last "-". This gives the same values as
# slicing off three characters for "mp-..." IDs, but does not depend on the prefix length.
x = [int(item.rsplit("-", 1)[-1]) for item in StableMaterialsIDList]
# Flag IDs whose number lies in the inclusive range 1523378..2913383, then count them.
countList = [1 if 1523378 <= item <= 2913383 else 0 for item in x]
print(sum(countList))

In [None]:
from collections import Counter

# Count occurrences of each ID in both lists.
StableMaterialsIDList_counts = Counter(StableMaterialsIDList)
mp_ids_counts = Counter(mp_ids)

# Print every ID that appears more than once, per list.
for label, counts in (("StableMaterialsIDList_counts", StableMaterialsIDList_counts),
                      ("mp_ids_counts", mp_ids_counts)):
    print(f"Duplicates in the {label}:")
    for element, count in counts.items():
        if count > 1:
            print(f"{element} appears {count} times.")

# Compare list length with unique count. The last two lines previously measured the
# Counter (whose length is already the unique count), so they could never differ;
# they now measure the list itself.
print(f"len of mp_ids as list: {len(mp_ids)}")
print(f"len of mp_ids as set: {len(set(mp_ids))}")
print(f"len of StableMaterialsIDList as list: {len(StableMaterialsIDList)}")
print(f"len of StableMaterialsIDList as set: {len(set(StableMaterialsIDList))}")

In [None]:
# IDs present in both the bucket listing and the stable-material list.
ids_in_listing = set(mp_ids)
StableMaterialsWithECDData = list(ids_in_listing.intersection(StableMaterialsIDList))
print(StableMaterialsWithECDData)
# Number of shared IDs (cell output).
len(StableMaterialsWithECDData)

In [None]:
# For each stable material that has charge density data, keep the frame (slice along
# the first axis) whose values vary the most. Relies on `mpr` bound by an earlier cell.
listOfMostActiveFramePerMolecule = []

# Iterate the intersection computed in the previous cell rather than every stable ID:
# IDs missing from the bucket listing have no charge density to fetch.
for material_id in StableMaterialsWithECDData:
    print(material_id)
    charge_density_total = mpr.get_charge_density_from_material_id(material_id).data['total']
    # One standard deviation per frame: reduce over both in-frame axes. With `axis=1`
    # alone the result is 2D, and argmax then returns a flat index into that 2D
    # array, which is not a frame index.
    frame_stds = np.std(charge_density_total, axis=(1, 2))
    max_std_index = np.argmax(frame_stds)
    listOfMostActiveFramePerMolecule.append(charge_density_total[max_std_index])

print(listOfMostActiveFramePerMolecule)


In [None]:
# Display the current value of `charge_density`.
# NOTE(review): no visible cell assigns `charge_density`; on a fresh kernel this raises NameError.
charge_density

In [None]:
import os

# Report the directory that relative paths in this notebook are resolved against.
current_directory = os.getcwd()
print(f"Current Directory: {current_directory}")


In [None]:
import pymatgen.core.structure
import matplotlib.pyplot as plt
from pymatgen.io.vasp.outputs import Chgcar
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np

# Path of the CHGCAR-format file to load (the file is read with Chgcar.from_file below).
# NOTE(review): hard-coded absolute, user-specific path with a literal "~" directory in
# the middle; the trailing marker says it does not resolve.
chgcar_file_path = 'C:\\Users\\91931\\~\\diss\\vaspFilesECDData\\mp-1523401_chgcar.vasp'   ##THE PATH IS NOT WORKING

# Load the CHGCAR file and take the structure stored with it.
chgcar = Chgcar.from_file(chgcar_file_path)
structure = chgcar.structure
# structure = Structure.from_file(outcar_file_path, 'OUTCAR')

# List the per-site property names available on the structure.
all_properties = list(structure.site_properties.keys())
print(all_properties)
# Read the 'charge_density' site property.
# NOTE(review): this raises KeyError unless 'charge_density' is among the names printed
# above; other cells in this notebook read the grid from `.data['total']` instead — confirm.
ecd_data = structure.site_properties['charge_density']

# Plot the ECD data
plt.plot(ecd_data)
plt.xlabel('Step')
plt.ylabel('ECD Property')
plt.title('ECD Property vs. Step')
plt.show()


In [None]:
from pymatgen.io.vasp.outputs import Chgcar
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np

# Total charge density grid of the object held in `charge_density`
# (expected to have been bound by an earlier cell).
charge_density_total = charge_density.data['total']

# Plot and save every slice along the first axis, recording each slice's standard
# deviation. The slice count is taken from the data instead of a fixed 64.
stdevs = []
for x in range(len(charge_density_total)):
    plt.imshow(charge_density_total[x], cmap='viridis')
    plt.savefig(f'chargeDensityImage_Slice{x}.png')
    plt.show()
    slice_std = np.std(charge_density_total[x])
    print(slice_std)
    stdevs.append(slice_std)

# Show the slice with the largest spread. The array is indexed here: the original
# indexed the `charge_density` object itself rather than its 'total' grid.
print(np.max(stdevs))
plt.imshow(charge_density_total[np.argmax(stdevs)], cmap='viridis')
plt.show()


In [None]:
from pymatgen.io.vasp.outputs import Chgcar
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from mpl_toolkits.mplot3d import Axes3D
import numpy as np

# Object holding the charge density (expected to have been bound by an earlier cell).
chgcar = charge_density

# Work on the 'total' grid: the container object itself has no `.shape` or `.max()`.
charge_density1 = chgcar.data['total']

# A volumetric grid cannot be drawn with plot_surface (which needs 2D coordinate
# arrays), so sample every `stride`-th voxel and draw the samples as a 3D scatter
# coloured by density. The stride mirrors the rstride/cstride of 5 used before.
stride = 5
sampled_density = charge_density1[::stride, ::stride, ::stride]
x, y, z = np.meshgrid(
    *[np.arange(0, dim, stride) for dim in charge_density1.shape],
    indexing='ij',
)

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
points = ax.scatter(
    x.ravel(), y.ravel(), z.ravel(),
    c=sampled_density.ravel(), cmap='viridis', alpha=0.7,
)
fig.colorbar(points, ax=ax, label='Charge Density')

# Customize the plot
ax.set_xlabel('X-axis')
ax.set_ylabel('Y-axis')
ax.set_zlabel('Z-axis')
ax.set_title('3D Electron Charge Density')

# Save before showing so the written file contains the drawn figure.
plt.savefig('3D_Electron_density.png')
plt.show()


In [None]:
from mp_api.client import MPRester

# Fetch the charge density for mp-149 with a fresh client.
# NOTE(review): the result is stored in `docs`, the name other cells use for a list of
# summary documents; the filter cell that follows iterates `docs` — confirm the intended name.
with MPRester() as mpr:
        docs = mpr.get_charge_density_from_material_id("mp-149")

In [None]:
# Keep the documents whose `energy_above_hull` is at most 0.1. Documents where the
# attribute is None are skipped instead of raising TypeError on `None <= 0.1`.
# NOTE(review): this expects `docs` to be a list of summary documents that carry
# `energy_above_hull` — confirm which query is meant to feed this cell.
xx = [x for x in docs if x.energy_above_hull is not None and x.energy_above_hull <= 0.1]

In [None]:
# Number of documents that passed the filter in the previous cell.
len(xx)

In [None]:
# Fetch the summary document(s) for mp-149; no `fields` restriction is passed.
# Relies on `mpr` having been bound by an earlier `with MPRester() as mpr:` cell.
docs = mpr.summary.search(material_ids=["mp-149"])

In [None]:
# Lattice parameter `a` of the first returned document's structure (cell output).
docs[0].structure.lattice.a