In [None]:
import pandas as pd
import pprint
import numpy as np
# Raise pandas' display limits (cell text width, number of columns, number of rows)
# so the feature tables below are rendered in full.
pd.set_option('display.max_colwidth', 1000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)


In [None]:
def k_char_match(patter_str, compare_str, ignore_patterns=('disease',
                                                           'effective',
                                                           'maximum',
                                                           'minimum',
                                                           'active',
                                                           'total'),
                 k=5):
    """Return the first k-character pattern of ``patter_str`` found in ``compare_str``.

    Both strings are lowercased and have their spaces removed *before* the
    sliding window is applied, so every candidate pattern is exactly ``k``
    characters long (slicing first and stripping afterwards would yield
    shorter, looser patterns whenever a window contained spaces).

    Args:
        patter_str: String from which the k-character candidate patterns are taken.
        compare_str: String that is searched for each candidate pattern.
        ignore_patterns: Words whose substrings are too generic to count as a
            match; a candidate is skipped when it is a substring of any of them.
        k: Length of the candidate patterns.

    Returns:
        The first candidate (scanning left to right) that is not ignored and
        occurs in the normalized ``compare_str``; ``None`` if there is none,
        if the normalized ``patter_str`` is shorter than ``k``, or if ``k < 1``.
    """
    if k < 1:
        # A zero/negative window has no meaningful pattern to look for.
        return None

    # Normalize once instead of on every window.
    normalized_pattern_str = patter_str.lower().replace(" ", "")
    normalized_compare_str = compare_str.lower().replace(" ", "")

    # range() is empty when the normalized string is shorter than k.
    for start in range(len(normalized_pattern_str) - k + 1):
        pattern = normalized_pattern_str[start:start + k]
        # NOTE(review): the candidate must be a substring of the ignore word, so
        # ignore words shorter than k can never suppress a candidate — confirm
        # this is the intended direction of the check.
        if any(pattern in ignore_pattern for ignore_pattern in ignore_patterns):
            continue
        if pattern in normalized_compare_str:
            return pattern
    return None

### Load features from our model and from papers

In [None]:
# model features parquet file
# NOTE(review): absolute, machine-specific path — this cell only works where this exact file exists.
model_df_file = "/Users/arvin/Documents/ucla research/CRRT project/data_files/df_[startdate-7d,startdate].parquet"

In [None]:
# Read the model feature table from the parquet file using the pyarrow engine.
model_df = pd.read_parquet(path=model_df_file, engine="pyarrow")

In [None]:
# Column names of the model feature table, as a plain Python list.
model_cols = model_df.columns.tolist()

In [None]:
# paper features spreadsheet
# NOTE(review): absolute, machine-specific path — this cell only works where this exact file exists.
rw_csv = "/Users/arvin/Downloads/CRRT Related Work - Sheet1.csv"

In [None]:
# Read the related-work spreadsheet; no CSV column is used as the index.
rw_df = pd.read_csv(filepath_or_buffer=rw_csv, index_col=None)

In [None]:
def _to_feature_list(cell):
    """Split one comma-separated spreadsheet cell into a list of feature names.

    Names are stripped of surrounding whitespace and empty entries (e.g. from a
    trailing comma) are dropped. Any non-string cell, such as the NaN pandas
    uses for an empty CSV cell, becomes an empty list.
    """
    if not isinstance(cell, str):
        return []
    return [name.strip() for name in cell.split(",") if name.strip()]


# Element-wise .apply (rather than .str.split) also works when a whole column
# is empty and therefore was read as a float column.
rw_df["List Baseline Features"] = rw_df["Baseline Features"].apply(_to_feature_list)
rw_df["List Vital Signs"] = rw_df["Vital Signs"].apply(_to_feature_list)
rw_df["List Lab Results"] = rw_df["Lab Results"].apply(_to_feature_list)
# Row-wise list concatenation of the three per-type lists.
rw_df["All Features"] = rw_df["List Baseline Features"] + rw_df["List Vital Signs"] + rw_df["List Lab Results"]

In [None]:
# display the papers table
rw_df

### Find Similar/Different Features in the Papers and in our Model

In [None]:
# dts[feature type] -> list with one DataFrame per paper; each row is a
# (paper feature, model feature, matching pattern) hit.
dts = {}
# Column-oriented summary with one entry per (feature type, paper) pair.
feature_dts = {"Matched_Features": [], "Unmatched_Features": [],
               "Title": [], "Type": []}
feature_types = ["List Baseline Features", "List Vital Signs", "List Lab Results", "All Features"]

# iterate through the different paper feature types
for col in feature_types:
    print(f"*******{col}*******")
    dts[col] = []
    # iterate through each paper
    for i in range(len(rw_df)):
        title = rw_df.iloc[i]['Title']
        paper_features = rw_df[col].iloc[i]

        # f-string instead of "+" so a missing/non-string title is printed
        # rather than raising TypeError.
        print(f"*******{title}*******")
        print("Similar Features")
        print('Paper Features\tModel Features\tMatching Pattern Used')

        dt = {"Paper Features": [], "Model Features": [], "Matching Pattern Used": [],
              "Title": title, "Type": col}
        # if paper contains valid features, check for similar/dissimilar features
        # (a float entry is presumably NaN, i.e. no feature list for this paper)
        if not isinstance(paper_features, float):
            # De-duplicate, then sort: iteration order of a set of strings can
            # change between kernel restarts, which would reorder the output.
            rw_features = sorted({elem.strip() for elem in paper_features})

            # see if a partial string of each model feature is contained in each paper feature
            matched_rw_cols = []
            for model_col in model_cols:
                for rw_col in rw_features:
                    # length of partial string of each model feature set to k
                    result = k_char_match(model_col, rw_col, k=7)
                    if result:
                        matched_rw_cols.append((rw_col, model_col, result))
            matched_rw_cols_set = {matched[0] for matched in matched_rw_cols}
            unmatched_rw_cols_set = set(rw_features) - matched_rw_cols_set
        else:
            # single placeholder row so the paper still appears in the detail table
            matched_rw_cols = [(None, None, None)]
            matched_rw_cols_set = set()
            unmatched_rw_cols_set = set()

        # save the matched/unmatched features for later viewing
        for matched_rw_col, matched_model_col, pattern in matched_rw_cols:
            print(f'{matched_rw_col}\t{matched_model_col}\t{pattern}')
            dt["Paper Features"].append(matched_rw_col)
            dt["Model Features"].append(matched_model_col)
            dt["Matching Pattern Used"].append(pattern)

        dts[col].append(pd.DataFrame.from_dict(dt))

        feature_dts["Title"].append(title)
        feature_dts["Type"].append(col)
        feature_dts["Unmatched_Features"].append(unmatched_rw_cols_set)
        feature_dts["Matched_Features"].append(matched_rw_cols_set)

In [None]:
# inspect the raw matched/unmatched summary dict
feature_dts

In [None]:
# Turn the column-oriented summary dict into a DataFrame (one column per key).
feature_df = pd.DataFrame(feature_dts)

### View Matched/Unmatched Features for All Papers (currently looking at all features)

In [None]:
# Keep only the summary rows whose Type is 'All Features'.
feature_df.loc[feature_df["Type"].eq('All Features')]

### More Granular Information on how Matching Features were Obtained (currently looking at first paper)

In [None]:
# for each feature type, show the first detail table stored in dts (index 0)
for col in feature_types:
    display(dts[col][0])

### Scratch

In [None]:
# baseline features that appear in both the first and the second paper
set(rw_df["List Baseline Features"].iloc[0]) & set(rw_df["List Baseline Features"].iloc[1])

In [None]:
# inspect the per-paper baseline feature lists
rw_df["List Baseline Features"]

In [None]:
# Re-derive the vital-sign lists. Non-list results (NaN from empty cells) are
# replaced by [] so re-running this cell does not put NaN back into a column
# that is later concatenated with other list columns.
rw_df["List Vital Signs"] = rw_df["Vital Signs"].str.split(",").apply(
    lambda d: d if isinstance(d, list) else []
)

In [None]:
# Re-derive the lab-result lists. Non-list results (NaN from empty cells) are
# replaced by [] so re-running this cell does not put NaN back into a column
# that is later concatenated with other list columns.
rw_df["List Lab Results"] = rw_df["Lab Results"].str.split(",").apply(
    lambda d: d if isinstance(d, list) else []
)