In [None]:
import pandas as pd
import re
import ast

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import multiprocessing

In [None]:
#Load the table annotation statistics; later cells read its 'file_name', 'column_count' and 'all_cols' columns
minimum10 = pd.read_csv('output-data/statistics/expanded_tables_annotations.csv')
#Bare expression so the notebook renders the frame as the cell output
minimum10

In [None]:
#Total number of columns: sum of the 'column_count' column over all rows of minimum10
minimum10['column_count'].sum()

#### For each table, note which columns can be annotated by the selected labels

In [None]:
#One dict per row of minimum10 (column name -> cell value)
selected_to_dict = minimum10.to_dict('records')
#Filled in the next cell: file name -> list of keys of that row's 'all_cols' entry
selected = {}

In [None]:
#For every table, parse the dict literal stored as text in 'all_cols' and keep only its keys
for record in selected_to_dict:
    parsed_cols = ast.literal_eval(record['all_cols'])
    selected[record['file_name']] = [*parsed_cols.keys()]

### Functions for preprocessing of text columns

In [None]:
#Load English stopwords and initialize stemmer
#Both objects are module-level and are read by clean_text on every call
english_stopwords = stopwords.words('english')
stemmer = PorterStemmer()

In [None]:
#Function to return a flattened list (opens one level of nested lists)
def flatten_list(original_list):
    """Flatten ``original_list`` by exactly one level.

    Items that are ``list`` instances are spliced into the result; every
    other item (including tuples) is kept unchanged. Lists nested more than
    one level deep are not expanded further.

    Args:
        original_list: Iterable whose items may themselves be lists.

    Returns:
        list: A new list; ``original_list`` is not modified.
    """
    flat_list = []
    for item in original_list:
        if isinstance(item, list):
            #extend appends in place; `flat_list + item` would copy the whole result each time
            flat_list.extend(item)
        else:
            flat_list.append(item)
    return flat_list

In [None]:
#Function for lower casing, removing punctuation and special characters, remove english stopwords and apply stemming
def clean_text(text):
    """Normalize a value into a string of stemmed, stopword-free tokens.

    Steps: convert to ``str`` and lower-case; replace every run of characters
    other than ASCII letters and digits with a single space; drop tokens that
    appear in the module-level ``english_stopwords``; stem the remaining
    tokens with the module-level ``stemmer``; join them with single spaces.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        str: The space-separated stems, with no leading, trailing or
        repeated whitespace (empty string when no token remains).
    """
    #Lower case
    text = str(text).lower()

    #Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)

    #Remove English stopwords and apply stemming in a single pass.
    #At this point the text holds only [a-z0-9] tokens separated by spaces, so
    #whole-token membership is equivalent to a \b...\b regex over the stopword
    #list; a set makes each lookup O(1) and avoids rebuilding the pattern per call.
    stop_set = set(english_stopwords)
    stems = [stemmer.stem(word) for word in text.split() if word not in stop_set]

    #split() + join already yields single spaces and no surrounding whitespace
    return ' '.join(stems)

### Tables

In [None]:
#File names of all tables to process, in the row order of minimum10
tables = minimum10['file_name'].tolist()

#Existing English Tables
#`with` guarantees the file handle is closed once the names have been read
with open("output-data/english_table_names.txt", 'r') as existing:
    existing_english_tables = [line.replace('\n', '') for line in existing]

#Import all datetime labels from schema.org
#Labels are lower-cased here; num_values and date_values compare column names against them.
#NOTE: the handle name `datetime` is kept for compatibility; it would shadow the stdlib module if that were imported.
with open("data/schemaorg-vocabulary/datetime_labels.txt", 'r') as datetime:
    date_labels = [line.replace('\n', '').lower() for line in datetime]

In [None]:
#Returns values of numerical columns
def num_values(file_name):
    """Collect the non-null values of the selected numerical columns of one table.

    The table is read as gzip-compressed JSON lines from
    'output-data/expanded-tables/' when ``file_name`` is listed in
    ``existing_english_tables`` and from 'output-data/new-english-tables/'
    otherwise.

    A numerical column is kept only if it is listed in ``selected[file_name]``,
    is not one of ``date_labels`` and does not contain the substring 'date'.

    Args:
        file_name: Name of the table file.

    Returns:
        dict: column name -> list of that column's non-null values.
    """
    in_existing = file_name in existing_english_tables
    folder = 'output-data/expanded-tables/' if in_existing else 'output-data/new-english-tables/'

    #Read the table and narrow it down to numerical dtypes
    table = pd.read_json(folder + file_name, compression='gzip', lines=True)
    numeric = table.select_dtypes(include=['number'])

    def is_wanted(name):
        #Selected for this table and not a datetime-like column
        return name in selected[file_name] and not (name in date_labels or 'date' in name)

    return {name: numeric[name].dropna().tolist() for name in numeric.columns if is_wanted(name)}

In [None]:
#Extract the numerical column values of every table in parallel.
#The context manager shuts the worker processes down even if a task raises.
with multiprocessing.Pool(processes=30) as pool:
    numerical_values = pool.map(num_values, tables)

#pool.map returns one dict per table, in the order of `tables`, so enumerate
#gives the index needed to look the table name up again
#(iterating the dicts directly would try to unpack their keys into i, val).
num_tab = []
for i, val in enumerate(numerical_values):
    #The class is the part of the file name before the first underscore
    class_ = tables[i].split('_')[0]
    for col in val:
        num_tab.append([class_, col, tables[i],val[col]])

#One row per (table, numerical column); 'value' holds the list of column values
num_df = pd.DataFrame(num_tab, columns = ['class', 'column_name', 'file_name',"value"])
num_df.to_csv('output-data/statistics/numcols.csv.gz', index=False, compression='gzip')

In [None]:
#Returns values of datetime columns
def date_values(file_name):
    """Collect the non-null values of the datetime-like columns of one table.

    The table is read as gzip-compressed JSON lines from
    'output-data/expanded-tables/' when ``file_name`` is listed in
    ``existing_english_tables`` and from 'output-data/new-english-tables/'
    otherwise. Numerical columns are excluded.

    A column counts as datetime-like when its name contains 'date', or when
    its label is in ``date_labels``. The label is the second ':'-separated
    part of the name if the name contains ':', else the whole name.

    Args:
        file_name: Name of the table file.

    Returns:
        dict: column name -> list of that column's non-null values.
    """
    if file_name in existing_english_tables:
        source_dir = 'output-data/expanded-tables/'
    else:
        source_dir = 'output-data/new-english-tables/'

    #Read the table and drop numerical dtypes
    table = pd.read_json(source_dir + file_name, compression='gzip', lines=True)
    non_numeric = table.select_dtypes(exclude=['number'])

    date_props = {}
    for name in non_numeric.columns:
        #Prefixed names ("prefix:label") are matched on the part after the first ':'
        label = name.split(':')[1] if ':' in name else name
        if label in date_labels or 'date' in name:
            date_props[name] = non_numeric[name].dropna().tolist()

    return date_props

In [None]:
#Extract the datetime column values of every table in parallel.
#The context manager shuts the worker processes down even if a task raises.
with multiprocessing.Pool(processes=30) as pool:
    datetime_values = pool.map(date_values, tables)

#pool.map returns one dict per table, in the order of `tables`, so enumerate
#gives the index needed to look the table name up again
#(iterating the dicts directly would try to unpack their keys into i, val).
date_tab = []
for i, val in enumerate(datetime_values):
    #The class is the part of the file name before the first underscore
    class_ = tables[i].split('_')[0]
    for col in val:
        date_tab.append([class_, col, tables[i],val[col]])

#One row per (table, datetime column); 'value' holds the list of column values
date_df = pd.DataFrame(date_tab, columns = ['class', 'column_name', 'file_name','value'])
date_df.to_csv('output-data/statistics/datecols.csv.gz', index=False, compression='gzip')
#Distinct datetime column names; text_values uses this list to exclude them
date = list(set(date_df['column_name'].tolist()))

In [None]:
#Returns the names of the textual columns (non-numerical and not in the datetime column list)
def text_values(file_name):
    """List the textual column names of one table.

    The table is read as gzip-compressed JSON lines from
    'output-data/expanded-tables/' when ``file_name`` is listed in
    ``existing_english_tables`` and from 'output-data/new-english-tables/'
    otherwise. Numerical columns are excluded, as is every column whose name
    appears in the module-level ``date`` list.

    Note that only column names are returned; no cell values are read out or
    cleaned here.

    Args:
        file_name: Name of the table file.

    Returns:
        list: Names of the remaining columns, in table order.
    """
    prefix = ('output-data/new-english-tables/', 'output-data/expanded-tables/')[file_name in existing_english_tables]

    #Read the table and keep only the non-numerical columns
    table = pd.read_json(prefix + file_name, compression='gzip', lines=True)
    candidates = table.select_dtypes(exclude=['number']).columns

    return [name for name in candidates if name not in date]

In [None]:
#Collect the textual column names of every table in parallel.
#The context manager shuts the worker processes down even if a task raises.
with multiprocessing.Pool(processes=30) as pool:
    textual_values = pool.map(text_values, tables)

#pool.map returns one list of column names per table, in the order of `tables`,
#so enumerate gives the index needed to look the table name up again
#(iterating the lists directly would try to unpack the names into i, val).
text_tab = []
for i, val in enumerate(textual_values):
    #The class is the part of the file name before the first underscore
    class_ = tables[i].split('_')[0]
    for col in val:
        text_tab.append([class_, col, tables[i]])

#One row per (table, textual column)
text_df = pd.DataFrame(text_tab, columns = ['class', 'column_name', 'file_name'])
text_df.to_csv('output-data/statistics/textcols.csv.gz', index=False, compression='gzip')