In [None]:
import multiprocessing
import pandas as pd
import re

In [None]:
# Load the CPA table annotations and keep the tables whose overall density is >= 70.
tables = pd.read_csv('output-data/statistics/expanded_tables_annotations_cpa.csv')
dense_enough = tables['overall_table_density'] >= 70
tables_70 = tables[dense_enough]
tables_70

In [None]:
# From here on `tables` is the list of file names of the dense tables.
tables = tables_70['file_name'].tolist()
text = pd.read_csv('output-data/statistics/textcols.csv.gz', compression='gzip')
# Restrict the textual-column statistics to those tables.
text_in_dense = text['file_name'].isin(tables)
text = text[text_in_dense]
text

In [None]:
# Numerical-column statistics, restricted to the dense tables as well.
num = pd.read_csv('output-data/statistics/numcols.csv.gz', compression='gzip')
num_in_dense = num['file_name'].isin(tables)
num = num[num_in_dense]
num

In [None]:
# Columns chosen by the previous selection step, keyed by table file name.
selection = pd.read_csv('output-data/cpa-datasets/selected_1.csv')
selection_to_dict = selection.to_dict('records')
# NOTE(review): eval() executes whatever text is stored in the CSV cell. The
# file is produced by this pipeline, but ast.literal_eval would be the safer
# way to parse it - confirm the stored values are plain literals before switching.
selected_cols = {
    record['file_name']: eval(record['selected_cols'])
    for record in selection_to_dict
}

In [None]:
def num_perc(index, values=None):
    """Return the share of "non-letter" characters in one entry of a value list.

    The entry is stringified and lower-cased, punctuation is removed, and the
    length of that text (whitespace and non-ASCII characters still included)
    is the denominator. The numerator is what is left after additionally
    removing non-ASCII characters, spaces and the letters a-z - i.e. mostly
    digits (underscores also survive).

    Parameters
    ----------
    index : int
        Position of the entry to inspect.
    values : sequence, optional
        Sequence to read the entry from. When omitted, the module-level list
        `v` is used, which is how the `pool.map(num_perc, range(...))` calls
        in this notebook rely on it.

    Returns
    -------
    int or float
        0 when nothing is left after removing punctuation, otherwise the ratio.
    """
    if values is None:
        # Backward-compatible fallback: callers set the global `v` beforehand.
        values = v

    text = str(values[index]).lower()
    text = re.sub(r"[^\w\s]", "", text)
    total = len(text)
    if total == 0:
        return 0

    text = re.sub(r"[^\x00-\x7F]+", "", text)
    text = re.sub(' +', '', text).strip()

    # Dropping letters can expose leading/trailing whitespace (tabs, newlines),
    # hence the second strip().
    num = re.sub(r"[a-z]", "", text).strip()
    return len(num) / total

In [None]:
#Existing English Tables
# Read the names inside a `with` block so the file handle is closed again
# (the handle used to stay open for the lifetime of the kernel).
with open("output-data/english_table_names.txt", 'r') as existing:
    existing_english_tables = [line.replace('\n', '') for line in existing]
len(existing_english_tables)

In [None]:
#Returns values of cleaned textual columns
def get_values(file_name):
    """Read one table and return the non-null values of the column `col_name`.

    `col_name` and `existing_english_tables` are module-level names set by the
    surrounding cells. Tables listed in `existing_english_tables` are read from
    'output-data/expanded-tables/', all others from
    'output-data/new-english-tables/'.

    Returns a dict {col_name: [values...]}, or an empty dict when the table
    has no such column.
    """
    if file_name in existing_english_tables:
        folder = 'output-data/expanded-tables/'
    else:
        folder = 'output-data/new-english-tables/'

    # Tables are gzip-compressed JSON-lines files.
    df = pd.read_json(folder + file_name, compression='gzip', lines=True)

    if col_name not in df.columns:
        return {}

    has_value = df[col_name].notna()
    return {col_name: df.loc[has_value, col_name].tolist()}

### priceRange

In [None]:
# Tables with a `pricerange` text column, minus those where that column was already selected.
col_name = 'pricerange'  # read as a global by get_values()
pricerange_mask = (text['column_name'] == 'pricerange') & text['file_name'].isin(tables)
pricerange = text.loc[pricerange_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'pricerange' in already_selected and tab in pricerange:
        pricerange.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, pricerange)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
pricerange_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(pricerange, values)
    for column, column_values in table_values.items()
]

pricer = pd.DataFrame(pricerange_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Calculate numerical percentage of values in a column
# num_perc() looks its input up in the module-level list `v`, so `v` is assigned
# before the worker pool is created.
# NOTE(review): presumably the workers only see `v` because they are forked after
# this assignment - confirm on platforms that use the 'spawn' start method.
v = pricer['value'].tolist()

# One task per row of `pricer`; pool.map returns the results in input order.
pool = multiprocessing.Pool(processes=21)
res = pool.map(num_perc, range(len(pricer)))
pool.close()
pool.join()

pricer['num_percentage'] = res

In [None]:
#Divide the columns into three categories based on the numerical percentage of the column values
def cat(row):
    """Map a numeric share to a bucket: 1 for <= 0.3, 2 for (0.3, 0.8), 3 otherwise."""
    if row <= 0.3:
        return 1
    return 2 if row < 0.8 else 3
    
# Bucket every pricerange column by its numeric share.
pricer['num_category'] = pricer['num_percentage'].apply(cat)

In [None]:
#From each category take the first 570 columns (1700/3, rounded)
cat_1 = pricer.loc[pricer['num_category'] == 1, 'file_name'].tolist()[:570]
cat_2 = pricer.loc[pricer['num_category'] == 2, 'file_name'].tolist()[:570]
cat_3 = pricer.loc[pricer['num_category'] == 3, 'file_name'].tolist()[:570]

# The loop variable is deliberately not called `cat`: that name is the bucketing
# helper used above, and overwriting it with a string breaks re-running that cell.
for file_name in cat_1 + cat_2 + cat_3:
    selected_cols.setdefault(file_name, {})['pricerange'] = 'Value Heterogeneity'

### offers:price

In [None]:
# Tables with an `offers:price` text column, minus those where that column was already selected.
col_name = 'offers:price'  # read as a global by get_values()
offerprice_mask = (text['column_name'] == 'offers:price') & text['file_name'].isin(tables)
offerprice = text.loc[offerprice_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'offers:price' in already_selected and tab in offerprice:
        offerprice.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, offerprice)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
offerprice_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(offerprice, values)
    for column, column_values in table_values.items()
]

priceoffer = pd.DataFrame(offerprice_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Calculate numerical percentage of values in a column
# num_perc() looks its input up in the module-level list `v`, so `v` is assigned
# before the worker pool is created.
# NOTE(review): presumably the workers only see `v` because they are forked after
# this assignment - confirm on platforms that use the 'spawn' start method.
v = priceoffer['value'].tolist()

# One task per row of `priceoffer`; pool.map returns the results in input order.
pool = multiprocessing.Pool(processes=21)
res = pool.map(num_perc, range(len(priceoffer)))
pool.close()
pool.join()

priceoffer['num_percentage'] = res

In [None]:
# First 800 offers:price columns whose numeric share lies strictly between 0.1 and 0.5.
partly_numeric = (priceoffer['num_percentage'] < 0.5) & (priceoffer['num_percentage'] > 0.1)
off = priceoffer.loc[partly_numeric, 'file_name'].iloc[:800].tolist()

# Register the column for each table unless it is already recorded.
for tab in off:
    selected_cols.setdefault(tab, {}).setdefault('offers:price', 'Value Heterogeneity')

### telephone

In [None]:
# Tables with a `telephone` text column, minus those where that column was already selected.
col_name = 'telephone'  # read as a global by get_values()
tel_mask = (text['column_name'] == 'telephone') & text['file_name'].isin(tables)
tel = text.loc[tel_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'telephone' in already_selected and tab in tel:
        tel.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, tel)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
telephone_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(tel, values)
    for column, column_values in table_values.items()
]

telephone = pd.DataFrame(telephone_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Calculate numerical percentage of values in a column
# num_perc() looks its input up in the module-level list `v`, so `v` is assigned
# before the worker pool is created.
# NOTE(review): presumably the workers only see `v` because they are forked after
# this assignment - confirm on platforms that use the 'spawn' start method.
v = telephone['value'].tolist()

# One task per row of `telephone`; pool.map returns the results in input order.
pool = multiprocessing.Pool(processes=21)
res = pool.map(num_perc, range(len(telephone)))
pool.close()
pool.join()

telephone['num_percentage'] = res

In [None]:
# Two groups of telephone columns, up to 800 each:
#   tel_1 - numeric share strictly between 0.5 and 0.7
#   tel_2 - numeric share above 0.8
share = telephone['num_percentage']
tel_1 = telephone.loc[(share > 0.5) & (share < 0.7), 'file_name'].iloc[:800].tolist()
tel_2 = telephone.loc[share > 0.8, 'file_name'].iloc[:800].tolist()

# Register the column for each table unless it is already recorded.
for tab in tel_1 + tel_2:
    selected_cols.setdefault(tab, {}).setdefault('telephone', 'Value Heterogeneity')

### duration

In [None]:
# Tables with a `duration` text column, minus those where that column was already selected.
col_name = 'duration'  # read as a global by get_values()
dur_mask = (text['column_name'] == 'duration') & text['file_name'].isin(tables)
dur = text.loc[dur_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'duration' in already_selected and tab in dur:
        dur.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, dur)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
duration_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(dur, values)
    for column, column_values in table_values.items()
]

duration = pd.DataFrame(duration_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Group columns using different metrics for duration
def cat_duration(row):
    """Classify a stringified duration column by the notation found in it.

    Checks are case-sensitive substring/regex tests, tried in this order:
      1  contains 'min' and 'sec'      5  contains 'Min.'
      2  contains 'min' only           6  contains ':'
      3  contains 'PT'                 7  matches '<n>h <n>m'
      4  contains 'PD'                 8  matches '<n>m <n>s'
    Returns None when nothing applies.
    """
    if 'min' in row:
        return 1 if 'sec' in row else 2

    for marker, category in (('PT', 3), ('PD', 4), ('Min.', 5), (':', 6)):
        if marker in row:
            return category

    if re.search('[0-9]+h [0-9]+m', row):
        return 7
    if re.search('[0-9]+m [0-9]+s', row):
        return 8
    return None
    
# Classify each duration column on the string form of its value list (case kept as-is).
duration['category'] = duration['value'].apply(
    lambda column_values: cat_duration(str(column_values))
)

In [None]:
#Take up to 100 columns from each of the 8 duration categories
all_duration = []
for category in range(1, 9):
    files = duration.loc[duration['category'] == category, 'file_name'].tolist()
    # Category 1 keeps the original [2:102] slice; every other category takes [:100].
    # NOTE(review): why the first two entries of category 1 are skipped is not
    # visible in this notebook - confirm it is intentional.
    window = slice(2, 102) if category == 1 else slice(0, 100)
    all_duration.extend(files[window])

# The loop variable is not called `cat`, so the bucketing helper `cat()` defined
# earlier in the notebook is not overwritten with a file name.
for file_name in all_duration:
    selected_cols.setdefault(file_name, {}).setdefault('duration', 'Value Heterogeneity')

### weight

In [None]:
# Tables with a `weight` text column, minus those where that column was already selected.
col_name = 'weight'  # read as a global by get_values()
weight_mask = (text['column_name'] == 'weight') & text['file_name'].isin(tables)
w = text.loc[weight_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'weight' in already_selected and tab in w:
        w.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, w)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
weight_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(w, values)
    for column, column_values in table_values.items()
]

weight = pd.DataFrame(weight_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Group weight columns under different categories based on measure metrics used
def weight_cat(row):
    """Classify a (lower-cased) stringified column by the weight unit it mentions.

    Substring tests are tried in order, longer spellings first:
    'kgs'->1, 'kg'->2, 'lbs'->3, 'lb'->4, 'ounces'->5, 'oz'->6, 'grams'->7;
    a '<digits> g' match gives 8. Returns None when nothing applies.
    """
    unit_categories = (
        ('kgs', 1),
        ('kg', 2),
        ('lbs', 3),
        ('lb', 4),
        ('ounces', 5),
        ('oz', 6),
        ('grams', 7),
    )
    for unit, category in unit_categories:
        if unit in row:
            return category

    return 8 if re.search('[0-9]+ g', row) else None
    
# Classify each weight column on the lower-cased string form of its value list.
weight['category'] = weight['value'].apply(
    lambda column_values: weight_cat(str(column_values).lower())
)

In [None]:
#Select 100 columns from each category
all_weight = []
for category in range(1, 9):
    files = weight.loc[weight['category'] == category, 'file_name'].tolist()
    all_weight.extend(files[:100])

# The loop variable is not called `cat`, so the bucketing helper `cat()` defined
# earlier in the notebook is not overwritten with a file name.
for file_name in all_weight:
    selected_cols.setdefault(file_name, {}).setdefault('weight', 'Value Heterogeneity')

### height

In [None]:
# Tables with a `height` text column, minus those where that column was already selected.
col_name = 'height'  # read as a global by get_values()
height_mask = (text['column_name'] == 'height') & text['file_name'].isin(tables)
h = text.loc[height_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'height' in already_selected and tab in h:
        h.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, h)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
height_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(h, values)
    for column, column_values in table_values.items()
]

height = pd.DataFrame(height_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Group height columns under different categories based on measure metrics used
def height_cat(row):
    """Classify a (lower-cased) stringified column by the length unit it mentions.

    Anything containing 'undefined' is rejected first (None). Then substring
    tests are tried in order: 'inches'->1, 'in'->6, 'cm'->2, 'mm'->3, 'm'->4;
    a feet'inches pattern such as 5'11 gives 5. Returns None otherwise.
    """
    if 'undefined' in row:
        return None

    # Order matters: longer unit spellings are tested before their substrings.
    for unit, category in (('inches', 1), ('in', 6), ('cm', 2), ('mm', 3), ('m', 4)):
        if unit in row:
            return category

    return 5 if re.search('[0-9]+\'[0-9]+', row) else None
    
# Classify each height column on the lower-cased string form of its value list.
height['category'] = height['value'].apply(
    lambda column_values: height_cat(str(column_values).lower())
)

In [None]:
#Select 100 columns from each category
all_height = []
for category in range(1, 7):
    files = height.loc[height['category'] == category, 'file_name'].tolist()
    all_height.extend(files[:100])

# The loop variable is not called `cat`, so the bucketing helper `cat()` defined
# earlier in the notebook is not overwritten with a file name.
for file_name in all_height:
    selected_cols.setdefault(file_name, {}).setdefault('height', 'Value Heterogeneity')

### width

In [None]:
# Tables with a `width` text column, minus those where that column was already selected.
col_name = 'width'  # read as a global by get_values()
width_mask = (text['column_name'] == 'width') & text['file_name'].isin(tables)
wid = text.loc[width_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'width' in already_selected and tab in wid:
        wid.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, wid)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
width_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(wid, values)
    for column, column_values in table_values.items()
]

width = pd.DataFrame(width_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
#Group width columns under different categories based on measure metrics used
# (widths reuse the length-unit classifier written for heights)
width['category'] = width['value'].apply(
    lambda column_values: height_cat(str(column_values).lower())
)

In [None]:
#Select 100 columns from each category
# Category 5 (the feet'inches pattern of height_cat) is not sampled for widths.
all_width = []
for category in (1, 2, 3, 4, 6):
    files = width.loc[width['category'] == category, 'file_name'].tolist()
    all_width.extend(files[:100])

# The loop variable is not called `cat`, so the bucketing helper `cat()` defined
# earlier in the notebook is not overwritten with a file name.
for file_name in all_width:
    selected_cols.setdefault(file_name, {}).setdefault('width', 'Value Heterogeneity')

### faxNumber

In [None]:
# Tables with a `faxnumber` text column, minus those where that column was already selected.
col_name = 'faxnumber'  # read as a global by get_values()
fax_mask = (text['column_name'] == 'faxnumber') & text['file_name'].isin(tables)
fax = text.loc[fax_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'faxnumber' in already_selected and tab in fax:
        fax.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, fax)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
faxnumber_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(fax, values)
    for column, column_values in table_values.items()
]

faxnumber = pd.DataFrame(faxnumber_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
# Display the collected fax-number columns for a visual check.
faxnumber

In [None]:
#Calculate numerical percentage of values in a column
# num_perc() looks its input up in the module-level list `v`, so `v` is assigned
# before the worker pool is created.
# NOTE(review): presumably the workers only see `v` because they are forked after
# this assignment - confirm on platforms that use the 'spawn' start method.
v = faxnumber['value'].tolist()

# One task per entry of `v`; pool.map returns the results in input order.
pool = multiprocessing.Pool(processes=21)
res = pool.map(num_perc, range(len(v)))
pool.close()
pool.join()

faxnumber['num_percentage'] = res

In [None]:
# Two groups of fax-number columns, up to 800 each:
#   fax_1 - numeric share strictly between 0.5 and 0.7
#   fax_2 - numeric share above 0.8
fax_share = faxnumber['num_percentage']
fax_1 = faxnumber.loc[(fax_share > 0.5) & (fax_share < 0.7), 'file_name'].iloc[:800].tolist()
fax_2 = faxnumber.loc[fax_share > 0.8, 'file_name'].iloc[:800].tolist()

# Register the column for each table unless it is already recorded.
for tab in fax_1 + fax_2:
    selected_cols.setdefault(tab, {}).setdefault('faxnumber', 'Value Heterogeneity')

### servingSize

In [None]:
# Tables with a `nutrition:servingsize` text column, minus those where it was already selected.
col_name = 'nutrition:servingsize'  # read as a global by get_values()
serving_mask = (text['column_name'] == 'nutrition:servingsize') & text['file_name'].isin(tables)
ser = text.loc[serving_mask, 'file_name'].tolist()
for tab, already_selected in selected_cols.items():
    if 'nutrition:servingsize' in already_selected and tab in ser:
        ser.remove(tab)

In [None]:
# Read the values of every candidate table in parallel
# (get_values picks the column from the global `col_name`).
pool = multiprocessing.Pool(processes=30)
values = pool.map(get_values, ser)
pool.close()
pool.join()

# One row per (table, column); the class is the file-name prefix before the first '_'.
serving_list = [
    [file_name.split('_')[0], column, file_name, column_values]
    for file_name, table_values in zip(ser, values)
    for column, column_values in table_values.items()
]

serving = pd.DataFrame(serving_list, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
def serving_cat(row):
    """Classify a (lower-cased) stringified column by the serving unit it mentions.

    Substring tests are tried in order: 'cup'->1, 'serving'->2, 'ounce'->3,
    'slice'->4, 'oz'->5, 'portion'->6, 'gram'->7, 'mg'->8.
    Returns None when none of them occurs.
    """
    unit_categories = (
        ('cup', 1),
        ('serving', 2),
        ('ounce', 3),
        ('slice', 4),
        ('oz', 5),
        ('portion', 6),
        ('gram', 7),
        ('mg', 8),
    )
    for unit, category in unit_categories:
        if unit in row:
            return category
    return None
    
# Classify each serving-size column on the lower-cased string form of its value list.
serving['category'] = serving['value'].apply(
    lambda column_values: serving_cat(str(column_values).lower())
)

In [None]:
# Take up to 100 columns from each of the 8 serving-size categories.
all_serving = []
for category in range(1, 9):
    files = serving.loc[serving['category'] == category, 'file_name'].tolist()
    all_serving.extend(files[:100])

# The loop variable is not called `cat`, so the bucketing helper `cat()` defined
# earlier in the notebook is not overwritten with a file name.
for file_name in all_serving:
    selected_cols.setdefault(file_name, {}).setdefault('nutrition:servingsize', 'Value Heterogeneity')

## content

In [None]:
#Locate all content columns and remove already selected ones
# `col_names` lists the nutrition content columns handled in this section.
col_names = ['nutrition:fatcontent', 'nutrition:carbohydratecontent', 'nutrition:proteincontent', 'nutrition:sodiumcontent', 'nutrition:sugarcontent', 'nutrition:saturatedfatcontent', 'nutrition:fibercontent', 'nutrition:cholesterolcontent', 'nutrition:transfatcontent', 'nutrition:unsaturatedfatcontent']
# Set of tables that have at least one of those columns.
con = set()
for col in col_names:
    for t in text.loc[(text['column_name'] == col) & (text['file_name'].isin(tables)) ]['file_name'].tolist():
        con.add(t)
# A table is dropped from the set as soon as any one of the content columns is
# already selected for it.
# NOTE(review): the next cell rebinds `con` to a fresh per-column list, so this
# set does not appear to be used afterwards - only `col_names` is.
for tab in selected_cols:
    for col in col_names:
        if col in selected_cols[tab] and tab in con:
            con.remove(tab)

In [None]:
#Get all values of these columns
# Rows are collected in one list and turned into a DataFrame once, instead of
# concatenating onto a growing frame inside the loop (which also left duplicate
# index labels in `fatcontent`).
content_rows = []
for col in col_names:

    # Tables that have this content column and where it is not yet selected.
    con = text.loc[(text['column_name'] == col) & (text['file_name'].isin(tables)), 'file_name'].tolist()
    for tab in selected_cols:
        if col in selected_cols[tab] and tab in con:
            con.remove(tab)

    # get_values() reads the column to extract from the global `col_name`.
    col_name = col
    pool = multiprocessing.Pool(processes=30)
    values = pool.map(get_values, con)
    pool.close()
    pool.join()

    # The inner loop uses its own variable names so the outer loop variable
    # `col` is no longer reassigned while iterating over the results.
    for file_name, table_values in zip(con, values):
        class_ = file_name.split('_')[0]
        for column, column_values in table_values.items():
            content_rows.append([class_, column, file_name, column_values])

fatcontent = pd.DataFrame(content_rows, columns=['class', 'column_name', 'file_name', 'value'])

In [None]:
# Content columns are classified with the weight-unit classifier.
fatcontent['category'] = fatcontent['value'].apply(
    lambda column_values: weight_cat(str(column_values).lower())
)

In [None]:
# For every content column take up to 200 tables from weight categories 7
# ('grams') and 8 ('<digits> g') each.
for col in col_names:
    is_col = fatcontent['column_name'] == col
    cat_7 = fatcontent.loc[(fatcontent['category'] == 7) & is_col, 'file_name'].tolist()[:200]
    cat_8 = fatcontent.loc[(fatcontent['category'] == 8) & is_col, 'file_name'].tolist()[:200]

    # The loop variable is not called `cat`, so the bucketing helper `cat()`
    # defined earlier in the notebook is not overwritten with a file name.
    for file_name in cat_7 + cat_8:
        selected_cols.setdefault(file_name, {}).setdefault(col, 'Value Heterogeneity')

### Date columns

In [None]:
# CPA label statistics, limited to labels with a column count of at least 50.
cpa_statistics = pd.read_csv('output-data/statistics/cpa_statistics.csv')
frequent_enough = cpa_statistics['column_count'] >= 50
cpa_statistics = cpa_statistics[frequent_enough]
cpa_statistics

In [None]:
# Label mapping, restricted to the CPA labels kept in `cpa_statistics`.
rels = pd.read_csv('data/Final CTA and CPA Labels.csv')
kept_labels = cpa_statistics['cpa_label'].tolist()
rels = rels[rels['CPA label'].isin(kept_labels)]
rels

In [None]:
#CPA labels: column name to its CPA label
# (for a repeated column name the last row wins)
rel_lbls = dict(zip(rels['column_name'], rels['CPA label']))

In [None]:
# Date-column statistics for the dense tables, limited to columns that have a kept CPA label.
date = pd.read_csv('output-data/statistics/datecols.csv.gz', compression='gzip')
date_in_dense = date['file_name'].isin(tables)
date_has_label = date['column_name'].isin(rel_lbls)
date = date[date_in_dense & date_has_label]
date

In [None]:
# Regular expressions for the date notations that are told apart below.
# They are raw strings so that `\+`, `\.` and `\/` reach the regex engine
# without relying on Python keeping unknown string escapes (which raises a
# DeprecationWarning/SyntaxWarning on current interpreters).
# 1: abbreviated month, e.g. "Aug 5, 2020"
format_1 = r'^(Aug|Jan|Feb|Mar|May|Apr|June|July|Sep|Oct|Nov|Dec)+ [0-9]*, [0-9]{4}$'
# 2: ISO timestamp with milliseconds and trailing Z. The dot before the
#    milliseconds is now escaped; it used to match any character.
format_2 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}Z$'
# 3: ISO date only
format_3 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}$'
# 4: ISO timestamp with a positive UTC offset
format_4 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\+[0-9]{2}:[0-9]{2}$'
# 5: full month name, e.g. "January 5, 2020"
format_5 = r'^(January|February|March|April|May|June|July|August|September|October|November|December)+ [0-9]{1,}, [0-9]{4}$'
# 6: abbreviated month with time, e.g. "Aug. 05, 2020, 3:04 p.m."
format_6 = r'^(Aug|Jan|Feb|Mar|May|Apr|June|July|Sep|Oct|Nov|Dec)+\. [0-9]{2}, [0-9]{4}, [0-9]{1,2}:[0-9]{2} (a|p)+\.m\.$'
# 7: ISO timestamp with microseconds
format_7 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}$'
# 8: ISO timestamp without offset
format_8 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}$'
# 9: ISO timestamp with a positive or negative UTC offset
format_9 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\+|\-)+[0-9]{2}:[0-9]{2}$'
# 10: slash-separated date, e.g. "01/02/2020"
format_10 = r'^[0-9]{2}\/[0-9]{2}\/[0-9]{4}$'
# 11: slash-separated date with 12-hour time
format_11 = r'^[0-9]{2}\/[0-9]{2}\/[0-9]{4} [0-9]{1,2}:[0-9]{2}:[0-9]{2} (am|pm|AM|PM)$'
# 12: ISO date, time and a three-letter suffix, e.g. "2020-01-02 3:04:05 UTC"
format_12 = r'^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{1,2}:[0-9]{2}:[0-9]{2} [a-zA-Z]{3}$'

In [None]:
#Group all date columns under different categories based on the formats
# Patterns are tried in numeric order and the first match wins.
date_patterns = {
    1: format_1, 2: format_2, 3: format_3, 4: format_4,
    5: format_5, 6: format_6, 7: format_7, 8: format_8,
    9: format_9, 10: format_10, 11: format_11, 12: format_12,
}
different_date_formats = {format_id: [] for format_id in date_patterns}

for index, row in date.iterrows():
    # NOTE(review): eval() executes the text stored in the statistics file;
    # ast.literal_eval would be safer if the stored values are plain literals.
    val = eval(row['value'])

    # Only the first value of the column decides the format.
    first = val[0] if len(val) > 0 else None

    if isinstance(first, dict):
        # Prefer 'date', fall back to 'datepublished'. A dict with neither key
        # used to raise KeyError; it is now treated as "no usable value".
        check = first.get('date', first.get('datepublished'))
    elif isinstance(first, list):
        check = first[0] if first else None
    else:
        check = first

    if isinstance(check, list):
        check = check[0] if check else None

    # Null or non-text values cannot match any pattern. Previously such rows
    # either crashed in re.match or silently reused `check` from the previous row.
    if not isinstance(check, str):
        continue

    for format_id, pattern in date_patterns.items():
        if re.match(pattern, check):
            different_date_formats[format_id].append(
                [row['class'], row['column_name'], row['file_name'], row['value']]
            )
            break


In [None]:
# For every date format, take up to 400 tables per column name.
for date_format, format_rows in different_date_formats.items():
    df = pd.DataFrame(format_rows, columns=['class', 'column_name', 'file_name', 'value'])

    for col in df['column_name'].unique():
        same_column = df['column_name'] == col
        some_cols = df.loc[same_column, 'file_name'].tolist()[:400]

        # Register the column for each table unless it is already recorded.
        for c in some_cols:
            selected_cols.setdefault(c, {}).setdefault(col, 'Value Heterogeneity')

In [None]:
# `tables` becomes the full annotation DataFrame again (it was a list of file names above).
tables = pd.read_csv('output-data/statistics/expanded_tables_annotations_cpa.csv')

# Drop tables that ended up with no selected column. The names are collected
# first because the dict cannot shrink while it is being iterated.
remove_tabs = [tab for tab in selected_cols if not selected_cols[tab]]
for tab in remove_tabs:
    del selected_cols[tab]

In [None]:
# Annotation rows of every table that has at least one selected column.
# .copy() makes `selection` an independent frame, so adding a column below
# does not trigger pandas' SettingWithCopyWarning on a slice of `tables`.
selection = tables.loc[tables['file_name'].isin(selected_cols)].copy()

# Attach each table's selected columns, in row order.
sel_cols = [selected_cols[file_name] for file_name in selection['file_name']]
selection['selected_cols'] = sel_cols
selection.to_csv('output-data/cpa-datasets/selected_1_2.csv', index=False)