In [2]:
import pandas as pd
import numpy as np
import os

# S.1.2.1 Data Aggregation

1. [Combining Patents](#combining-all-patents-to-one-df)
2. [Aggregating Data](#aggregating-data)

#### Combining all Patents to one DF

In [3]:

def list_files_in_folder(folder_path):
    """Return the names of the regular files directly inside ``folder_path``.

    Sub-directories are skipped and the search is not recursive. Names are
    returned without the folder prefix, sorted alphabetically so that callers
    processing the files get the same order on every run and platform
    (``os.listdir`` itself makes no ordering guarantee).

    Parameters
    ----------
    folder_path : str or os.PathLike
        Directory to scan.

    Returns
    -------
    list of str
        Sorted file names found in ``folder_path``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` if the folder cannot be read.
    """
    file_names = [
        f for f in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, f))
    ]
    return sorted(file_names)


folder_path = r'../parsed_dataframes'
files = list_files_in_folder(folder_path)

# Read each workbook exactly once and concatenate in a single call. Growing a
# DataFrame with pd.concat inside the loop re-copies all accumulated rows on
# every iteration, and concatenating onto an empty frame is deprecated in
# recent pandas. Files are read in sorted order so the row order is stable.
patent_frames = [
    pd.read_excel(os.path.join(folder_path, file)) for file in sorted(files)
]
if not patent_frames:
    raise FileNotFoundError(f"No files found in {folder_path!r}")
df = pd.concat(patent_frames, axis=0, ignore_index=True)

# "Unnamed: 0" is presumably the index column written when the workbooks were
# saved; tolerate its absence instead of failing with a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

# Per-compound properties, merged onto the entries on the "CAS" key later on.
properties_df = pd.read_excel("target_compound_properties.xlsx")


In [4]:
# Filtering junk entries: drop rows whose temperature is implausibly large.
# Rows with a missing temperature are kept explicitly. A bare
# `df["temperatures_c"] < 200000` evaluates to False for NaN and would silently
# discard every entry that reports no temperature (e.g. pressure-only entries),
# which would make the later per-CAS count of non-null temperatures meaningless.
df = df[df["temperatures_c"].isna() | (df["temperatures_c"] < 200000)]

In [5]:
# Left-join the compound properties onto every patent entry via the "CAS" key;
# entries without a match in properties_df are kept, with missing property values.
df = pd.merge(df, properties_df, how="left", on="CAS")

In [6]:
# Express pressure in bar (values in "pressures_mpa" multiplied by 10) and
# discard the original MPa column.
df = df.assign(pressures_bar=df["pressures_mpa"] * 10).drop(columns="pressures_mpa")

# Save the combined patent entries without the DataFrame index.
df.to_excel("combined_patent_entries.xlsx", index=False)

#### Aggregating data

In [9]:
# Collapse the patent entries to one row per CAS number.
# Each key is an output column; each value is (source column, aggregation).
aggregation_spec = {
    "temp_c_avg": ("temperatures_c", "mean"),
    "temp_c_median": ("temperatures_c", "median"),
    "pressure_bar_avg": ("pressures_bar", "mean"),
    "pressure_bar_median": ("pressures_bar", "median"),
    "enthalpy_average": ("enthalpy_rxn", "mean"),
    "entropy_average": ("entropy_rxn", "mean"),
    # number of entries (rows) in each CAS group
    "patent_count": ("CAS", "size"),
}

# The reagent/catalyst/solvent columns are dropped before grouping;
# none of the aggregations above uses them.
numeric_entries = df.drop(columns=["reagants", "catalysts", 'solvents'])
ag = numeric_entries.groupby("CAS").agg(**aggregation_spec)


In [10]:
# Adding production and chemical name data from EPA dataset

pvs = pd.read_excel("CAS_to_production_volume.xlsx")

# Align on the CAS key rather than by position. Assigning `.values` of a
# filtered and sorted slice only lines up when the EPA table holds exactly one
# row for every CAS in `ag`; a missing or duplicated CAS either raises a length
# mismatch or, worse, attaches names/volumes to the wrong compound.
# For duplicated CAS rows the first occurrence is used; CAS numbers absent
# from the EPA table get NaN.
pvs_by_cas = pvs.drop_duplicates(subset="CAS").set_index("CAS")
ag["Name"] = pvs_by_cas["Name"].reindex(ag.index)
ag["Production"] = pvs_by_cas["High Production"].reindex(ag.index)

In [11]:
# Number of entries per CAS that actually report a temperature / a pressure.
# GroupBy.count() counts non-null values, which is what len(dropna(...)) gave
# per group, but it does so in one pass instead of re-filtering `df` for every
# CAS. The result is integer-typed, whereas the previous NaN-initialised
# placeholder columns turned the counts into floats.
report_counts = df.groupby("CAS")[["temperatures_c", "pressures_bar"]].count()
ag["temp_reports"] = report_counts["temperatures_c"].reindex(ag.index, fill_value=0)
ag["pressure_reports"] = report_counts["pressures_bar"].reindex(ag.index, fill_value=0)

# Share of each compound in the summed production of all aggregated compounds.
ag["frac_of_prod"] = ag["Production"]/np.sum(ag["Production"].values)

In [12]:
# Column order for the exported summary table.
output_columns = [
    "Name",
    "Production",
    "frac_of_prod",
    "patent_count",
    "temp_reports",
    "temp_c_avg",
    "temp_c_median",
    "pressure_reports",
    "pressure_bar_avg",
    "pressure_bar_median",
    'enthalpy_average',
    'entropy_average',
]
ag = ag.loc[:, output_columns]

# Left-join the compound properties on "CAS" (the groupby key of `ag`).
ag = pd.merge(ag, properties_df, how="left", on="CAS")

In [13]:
# Write the per-CAS summary to disk, leaving out the DataFrame index.
ag.to_excel(excel_writer="grouped_data.xlsx", index=False)