In [1]:
import pandas as pd
from typing import List
import numpy as np

# Unique values above/below a threshold

In [2]:
def get_freq(df, cols: List[str], threshold: int):
    """Summarise how often each distinct value occurs across ``cols``.

    The selected columns are stacked into a single Series and the
    occurrences of every distinct value are counted.

    Args:
        df: DataFrame that contains every column named in ``cols``.
        cols: Names of the columns whose values are pooled together.
        threshold: Cut-off on the occurrence count. The comparison is
            inclusive: a value counts as "above" when ``count >= threshold``.

    Returns:
        A 3-tuple ``(above, below, null_pct)`` where ``above`` is the number
        of distinct values with ``count >= threshold``, ``below`` the number
        with ``count < threshold``, and ``null_pct`` the mean over ``cols``
        of each column's null fraction, as a percentage rounded to two
        decimals. If ``cols`` contains ``"reactant_000"`` or
        ``"product_000"``, the threshold split is not reported: the result is
        ``(total distinct values, 0, null_pct)``.
    """
    selected = df[cols]

    # Occurrences of each distinct value, pooled over all selected columns
    counts = selected.stack().value_counts()

    # Split the distinct values on the (inclusive) threshold
    n_above = len(counts.loc[counts >= threshold])
    n_below = len(counts.loc[counts < threshold])

    # Average null fraction of the selected columns, as a percentage
    null_pct = round(selected.isnull().mean().mean() * 100, 2)

    # Reactant/product groups report only the total number of distinct values
    if "reactant_000" in cols or "product_000" in cols:
        return counts.shape[0], 0, null_pct
    return n_above, n_below, null_pct

    


In [3]:
def build_overleaf_table(path, list_of_cols, threshold):
    """Print one table entry per column group of a parquet dataset.

    Args:
        path: Location of the parquet file, passed to ``pd.read_parquet``.
        list_of_cols: Iterable of column-name lists; each list is summarised
            as one table entry via ``get_freq``.
        threshold: Occurrence-count cut-off forwarded to ``get_freq``.

    Returns:
        None. Each entry is printed in the form
        ``<component> & <above> // <below> // <null %> &``.
    """
    df = pd.read_parquet(path)
    for cols in list_of_cols:
        above_threshold, below_threshold, none_percentage = get_freq(df, cols, threshold)
        if len(cols) > 4:
            component = "everything"
        else:
            # Label the group by its first column with the trailing "_<index>"
            # removed. rsplit with maxsplit=1 accepts names containing extra
            # underscores (or none), where unpacking split("_") into exactly
            # two names raised ValueError.
            component = cols[0].rsplit("_", 1)[0]

        table_entry = f"""
        {component} & {above_threshold} // {below_threshold} // {none_percentage} & 
        """
        print(table_entry)
    
    
    

In [4]:
# Column-name groups handed to build_overleaf_table. Every name follows the
# "<role>_<zero-padded 3-digit index>" pattern, so each group is generated
# from its role and the number of columns it spans.
solv_cols = [f"solvent_{idx:03d}" for idx in range(2)]
catalyst_cols = [f"catalyst_{idx:03d}" for idx in range(1)]
agent_cols = [f"agent_{idx:03d}" for idx in range(3)]
reagent_cols = [f"reagent_{idx:03d}" for idx in range(2)]
reactant_cols = [f"reactant_{idx:03d}" for idx in range(2)]
product_cols = [f"product_{idx:03d}" for idx in range(1)]

In [5]:
# Table entries for the "uspto_no_trust" filtered dataset: reactants,
# products, solvents and agents, with values split at 100 occurrences.
threshold = 100
path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet"
list_of_cols = [reactant_cols, product_cols, solv_cols, agent_cols]
build_overleaf_table(path=path, list_of_cols=list_of_cols, threshold=threshold)


        reactant & 317184 // 0 // 18.35 & 
        

        product & 382850 // 0 // 0.0 & 
        

        solvent & 85 // 313 // 28.02 & 
        

        agent & 255 // 11945 // 37.04 & 
        


In [6]:
# Table entries for the "uspto_with_trust" filtered dataset. Catalyst and
# reagent columns are pooled into a single group; because the printed label
# is taken from the group's first column name, that group shows up as
# "catalyst" in the output.
threshold = 100
path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_with_trust/filtered/filtered_orderly_ord.parquet"
list_of_cols = [reactant_cols, product_cols, solv_cols, catalyst_cols + reagent_cols]
build_overleaf_table(path=path, list_of_cols=list_of_cols, threshold=threshold)


        reactant & 40020 // 0 // 25.7 & 
        

        product & 38816 // 0 // 0.0 & 
        

        solvent & 29 // 204 // 40.88 & 
        

        catalyst & 48 // 447 // 56.18 & 
        


In [4]:
# Load the "uspto_no_trust" filtered dataset into `df` and display its
# (n_rows, n_columns) shape as the cell output.
path = "/Users/dsw46/Projects_local/orderly_07_06/ORDerly/data/orderly/uspto_no_trust/filtered/filtered_orderly_ord.parquet"
df = pd.read_parquet(path)
df.shape

(411538, 17)

In [2]:
import pandas as pd

In [6]:
# Number of distinct values in product_000, shown as a 1-tuple
# (value_counts leaves out missing values by default).
df['product_000'].value_counts().shape

(382850,)