In [None]:
import pandas as pd
import zipfile
import os
import re
import fasttext
import multiprocessing
import string

In [None]:
#Load the fastText language-identification model (lid.176.bin) used for language detection.
#`model` is a module-level global: the functions defined below call model.predict() on it.
#The path is relative to the notebook's working directory.
model = fasttext.load_model('data/fasttext-model/lid.176.bin')

## Language Detection for Column Similarity
1. Discard columns that do not indicate the language.
2. Discard numerical and datetime columns.
3. Remove numbers from all column values.
4. Remove punctuation.
5. Detect the language with the fastText model; if a table is not entirely in English, create a new table containing only its English rows.

In [None]:
#1: Columns to discard as they are not indicative of the language
ignore_columns = ['url','telephone','page_url', 'photo', 'image', 'name', 'sku', 'identifier', 'isbn', 'mpn', 'productid', 'gtin', 'vatid', 'taxid', 'faxnumber', 'geo', 'price', 'openinghoursspecification']

#Import all datetime labels from schema.org (one label per line).
#The context manager guarantees the file handle is closed once the labels are read.
with open("data/schemaorg-vocabulary/datetime_labels.txt", 'r') as datetime_labels_file:
    #Strip line endings/surrounding whitespace, lower-case like the names above, skip blank lines
    date_labels = [line.strip().lower() for line in datetime_labels_file if line.strip()]

#Datetime columns are ignored together with the hand-picked columns above
ignore_columns = ignore_columns + date_labels

In [None]:
#Preprocessing methods:

def clean_text(text):
    """Normalise a single value into plain text suitable for language detection.

    Parameters
    ----------
    text : any scalar
        Value to clean; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The text without ASCII digits and ASCII punctuation, with every run of
        whitespace (spaces, tabs, newlines, ...) collapsed to one space and the
        ends trimmed. Missing values (None/NaN) give an empty string.
    """
    if pd.isnull(text):
        return ''

    #3: Remove numbers
    text = re.sub(r"[0-9]", "", str(text))

    #4: Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    #Remove excess whitespace. All whitespace is collapsed, not only spaces:
    #fastText's predict() rejects input that contains a newline character.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

#Return a flattened list
def flatten_list(original_list):
    """Flatten one level of nesting.

    Items that are lists contribute their elements; every other item
    (including tuples and deeper nested lists inside a list item) is kept as is.
    A new list is returned; the input is not modified.
    """
    flattened = []
    for element in original_list:
        #Wrap non-list items so both cases can be handled by a single extend()
        flattened.extend(element if isinstance(element, list) else [element])
    return flattened

#Preprocess rows
def preprocess_row(row):
    """Turn a cell value (scalar, list or dict, possibly nested) into one cleaned string.

    - dict: the keys are dropped; each value is preprocessed recursively.
    - list: each item is preprocessed recursively.
    - anything else: handed to ``clean_text``.

    The three cases are mutually exclusive, so the text built from a dict's
    values is returned as is and is not replaced by the cleaned ``str()`` of
    the whole dict (which would bring the keys back in).

    Returns
    -------
    str
        The cleaned fragments joined by single spaces (possibly empty).
    """
    if isinstance(row, dict):
        #If the row is a dictionary, keep only its values
        fragments = [preprocess_row(value) for value in row.values()]
    elif isinstance(row, list):
        #If the row is a list, items may themselves be dictionaries or lists
        fragments = [preprocess_row(item) for item in row]
    else:
        #Else the row is a scalar (string, number, missing value, ...)
        return clean_text(row)

    #Skip empty fragments so the joined text contains no doubled spaces
    return ' '.join(fragment for fragment in fragments if fragment)

In [None]:
def check_rows_language(column_name, df, file_name):
    """Check the table row by row and save the English rows if some rows are not English.

    Parameters
    ----------
    column_name : str or None
        Column whose value is used for language detection. When it is None, or
        the row has no value in that column (None, NaN or another empty value),
        the text of all remaining non-ignored columns of the row is used instead.
    df : pandas.DataFrame
        The table to check.
    file_name : str
        File name under which a filtered table is written to ``english_tables_path``.

    Returns
    -------
    bool
        True when every row was kept as English (nothing is written). False
        otherwise; in that case the kept rows are written as gzip-compressed
        JSON lines, but only if there are at least 10 of them.
    """
    def is_missing(value):
        #None / NaN / NA count as missing, as do other empty values ('', [], {})
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return True
        return not value

    #1 + 2: Columns without language signal: ignored names plus numerical and datetime columns.
    #Computed once per table instead of once per cell.
    excluded_columns = set(ignore_columns)
    excluded_columns.update(df.select_dtypes(include=['number','datetime']).columns)
    text_columns = [col for col in df.columns if col not in excluded_columns]

    english_rows = []

    #Check language of each row
    for row in df.to_dict('records'):
        #If the column value is missing: text is the whole row, otherwise only the column value
        if column_name and not is_missing(row[column_name]):
            text = preprocess_row(row[column_name])
        else:
            text = ' '.join(clean_text(preprocess_row(row[col])) for col in text_columns if not is_missing(row[col]))
        text = clean_text(text)

        if text:
            #Predict language of value/row (top label and its confidence)
            labels, confidences = model.predict(text)
            #English only when predicted with high confidence
            row_is_english = len(labels) > 0 and labels[0] == '__label__en' and confidences[0] >= 0.5
        else:
            #Keep rows whose text is empty (could be due to removed numbers)
            row_is_english = True

        #If row is in English, add to new table
        if row_is_english:
            english_rows.append([row[col] for col in df.columns])

    #If new table is the same length as original table, keep original table
    if len(english_rows) == len(df.index):
        return True

    #If new table has at least 10 rows
    if len(english_rows) >= 10:
        #Make sure the output directory exists (safe when several workers do this at once)
        os.makedirs(english_tables_path, exist_ok=True)
        new_table = pd.DataFrame(english_rows, columns=df.columns)
        new_table.to_json(english_tables_path + file_name, orient='records', lines=True, compression='gzip')

    #Returns False because not all rows are in English
    return False

In [None]:
#Method for when a column name (description or disambiguatingDescription) exists:
#Check if all rows of the column for that column are in English
def is_english_table_with_column(column_name, df, file_name):
    """Decide whether a table is English based on one text column.

    Parameters
    ----------
    column_name : str
        Column used for language detection (e.g. a description column).
    df : pandas.DataFrame
        The table to check.
    file_name : str
        File name passed on to ``check_rows_language`` for a filtered table.

    Returns
    -------
    bool
        If all non-empty values of the column share one predicted language with
        confidence >= 0.50, True when that language is English and False
        otherwise. In every other case (mixed languages, low confidence, no
        values) the result of the row-by-row check ``check_rows_language``.
    """
    #Predict language for non-empty values of column
    col_values = df.loc[df[column_name].notna(), column_name].apply(preprocess_row).tolist()

    #Nothing to predict when the column has no values at all
    if col_values:
        labels, confidences = model.predict(col_values)
        detected_languages = {label for row_labels in labels for label in row_labels}
        high_confidence = all(len(conf) > 0 and conf[0] >= 0.50 for conf in confidences)

        #Single language with high confidence: the table is English iff that language is English
        if len(detected_languages) == 1 and high_confidence:
            return detected_languages == {'__label__en'}

    #Otherwise check each row and create a new table with the rows in English.
    #The result must be returned so that a table whose rows all turn out to be
    #English is still reported as English to the caller.
    return check_rows_language(column_name, df, file_name)
        

## Detect English Tables

In [None]:
#Output directory (relative path, trailing slash included) for tables filtered down to
#their English rows: check_rows_language() writes to english_tables_path + file_name.
english_tables_path = 'output-data/new-english-tables/'

In [None]:
def find_english_tables(file_name):
    """Check one expanded table and record its name if it is English.

    The table is read from 'output-data/expanded-tables/' as gzip-compressed
    JSON lines. The language check uses the 'description' column when present,
    otherwise 'disambiguatingdescription', otherwise the text of whole rows.
    When the check reports the table as English, ``file_name`` is appended to
    'output-data/english_table_names.txt'. Nothing is returned.
    """
    #Open table as dataframe
    table = pd.read_json('output-data/expanded-tables/' + file_name, compression='gzip', lines=True)

    #First description-like column that exists, in order of preference
    description_column = next(
        (name for name in ('description', 'disambiguatingdescription') if name in table.columns),
        None,
    )

    if description_column is None:
        #No such column: check all rows individually, keeping only the English ones
        english = check_rows_language( None, table, file_name)
    else:
        english = is_english_table_with_column(description_column, table, file_name)

    if not english:
        return

    with open('output-data/english_table_names.txt', 'a') as names_file:
        names_file.write(file_name+'\n')


In [None]:
#All file_names
table_names = os.listdir('output-data/expanded-tables/')
len(table_names)

#Check every table in parallel. pool.map blocks until all tables are processed;
#the context manager then shuts the worker processes down, also when a worker raises.
with multiprocessing.Pool(processes=30) as pool:
    res = pool.map(find_english_tables, table_names)