In [28]:
import inspect
import json
import math
import os
import random
import re
from collections import Counter

import numpy as np
import pandas as pd
import requests
import tqdm
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [10]:
# Hugging Face checkpoint used for both the tokenizer and the model weights.
NER_CHECKPOINT = "dslim/bert-base-NER"

tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Token-classification ("ner") pipeline; ColumnAnalysis.classify_columns looks
# this object up as the module-level name `nlp`.
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'pipeline' is not defined

In [None]:
class ColumnAnalysis:
    """Heuristic, regex-based labelling of table columns.

    `classify_columns` walks every column of a table and assigns it one of the
    labels "URL", "EMAIL", "ADDRESS", "DATETIME", "NUMBER", "NOA", "STRING",
    "None" (missing value) or the Python value ``None`` (undecided).
    """

    # Patterns are compiled once per class instead of once per call.
    _URL_PATTERN = re.compile(r'^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.IGNORECASE)
    _EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', re.IGNORECASE)
    _ADDRESS_PATTERN = re.compile(r'\d+\s+\w+\s+(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|place|pl)\.?\s*\w*', re.IGNORECASE)
    # Every alternative is a complete, self-contained date or time shape.
    # Earlier revisions had two defects here:
    #   * the "31" and "29/30" alternatives were not grouped, so a bare "29"
    #     (and "1x<sep>YYYY") matched on its own and any string starting with
    #     "29" was labelled DATETIME;
    #   * a missing "|" fused the YY/DD/MM alternative onto a duplicated
    #     MM/DD/YY one, so YY/DD/MM could never match.
    # The pattern is applied with .match(), i.e. anchored at the start of the
    # cell only; trailing text is allowed.
    _DATETIME_PATTERN = re.compile(
        r'(?:\d{4}-\d{2}-\d{2})'  # YYYY-MM-DD
        r'|(?:31(?:\/|-|\.)(?:0?[13578]|1[02])(?:\/|-|\.)\d{4})'  # 31st of a 31-day month
        r'|(?:(?:29|30)(?:\/|-|\.)(?:0?[13-9]|1[0-2])(?:\/|-|\.)\d{4})'  # 29th/30th, any month but February
        r'|(?:0?[1-9]|[12]\d|3[01])(?:\/|-|\.)'  # Day
        r'(?:0?[1-9]|1[0-2])(?:\/|-|\.)\d{4}'  # Month and 4-digit year
        r'|(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/(?:\d{2})'  # MM/DD/YY
        r'|\b\d{2}/(?:0?[1-9]|[12]\d|3[01])/(?:0?[1-9]|1[0-2])\b'  # YY/DD/MM
        r'|(?:[01]?\d|2[0-3]):[0-5]\d\.[0-5]\d'  # HH:MM.SS
        r'|(?:[01]?\d|2[0-3]):[0-5]\d'  # HH:MM
        r'|(?:[0-5]?\d):[0-5]\d(?:\.\d{1,2})?'  # H:MM or H:MM.S
        r'|(?:2[0-3]|[01]?\d)h[0-5]?\d(?:m[0-5]?\d(?:\.\d{1,2})?s)?',  # HhMMmSSs
        re.IGNORECASE
    )

    # A cell counts as a number when it has at least one digit and fewer than
    # this many non-digit characters (after dropping '.', ',', '%' and '$').
    _MAX_NON_DIGIT_CHARS = 5
    # Cells with at least this many space-separated words are labelled "NOA".
    _LONG_TEXT_MIN_WORDS = 15
    # Cells of at most this many characters fall back to "STRING".
    _SHORT_STRING_MAX_LEN = 4
    # Once the unlabeled-cell counter exceeds this value the column is given up on.
    _MAX_UNLABELED_CELLS = 5

    def __init__(self):
        # Maps a fine-grained type label to a coarse class, "NE" or "LIT"
        # (presumably "named entity" and "literal"). Not read by any method of
        # this class.
        self.entity_type_dict = {
            "PERSON": "NE",
            "NORP": "NE",
            "FAC": "NE",
            "ORG": "NE",
            "GPE": "NE",
            "LOC": "NE",
            "PRODUCT": "NE",
            "EVENT": "NE",
            "WORK_OF_ART": "NE",
            "LAW": "NE",
            "LANGUAGE": "NE",
            "DATE": "LIT",
            "TIME": "LIT",
            "PERCENT": "LIT",
            "MONEY": "LIT",
            "QUANTITY": "LIT",
            "ORDINAL": "LIT",
            "CARDINAL": "LIT",
            "URL": "LIT",
            "DESC": "LIT",
            "TOKEN": "NE",
            "INTEGER": "LIT",
            "FLOAT": "LIT",
            "DATETIME": "LIT",
            "ADDRESS": "LIT",
            "EMAIL": "LIT"
        }

        # Maps a type label to a storage datatype name ("DATETIME", "STRING"
        # or "NUMBER"). Not read by any method of this class.
        self.LIT_DATATYPE = {
            "DATE": "DATETIME", 
            "TIME": "STRING", 
            "PERCENT": "STRING", 
            "MONEY": "STRING", 
            "QUANTITY": "STRING", 
            "ORDINAL": "NUMBER", 
            "CARDINAL": "NUMBER", 
            "URL": "STRING",
            "DESC": "STRING",
            "TOKEN": "STRING",
            "INTEGER": "NUMBER",
            "FLOAT": "NUMBER",
            "DATETIME": "DATETIME",
            "ADDRESS": "STRING",
            "EMAIL": "STRING",
            "STRING": "STRING"
        }

        # Type labels treated as "NE". Not read by any method of this class.
        self.NE_DATATYPE = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE"]

    def most_frequent_element(self, input_list):
        """Return the most common item of `input_list`, or None if it is empty."""
        counter = Counter(input_list)
        most_common = counter.most_common(1)
        return most_common[0][0] if most_common else None

    def _label_cell(self, cell):
        """Return the heuristic label for one cell, or None if no rule applies.

        Rules, in order: missing value ("None"), URL, EMAIL, ADDRESS,
        DATETIME, then, only for cells that are still unlabeled, the digit
        heuristic ("NUMBER"). Finally, the long-text ("NOA") and short-text
        ("STRING") fallbacks apply to every non-missing cell.
        """
        label = None

        # Missing values: None, float NaN, or the strings "NaN"/"nan".
        if cell is None:
            label = "None"
        else:
            try:
                if math.isnan(cell):
                    label = "None"
            except (TypeError, ValueError, OverflowError):
                # Not a real number (e.g. a string) or too large for a float.
                pass

        if isinstance(cell, str):
            if cell == "NaN" or cell == "nan":
                label = "None"
            elif self._URL_PATTERN.match(cell):
                label = "URL"
            elif self._EMAIL_PATTERN.match(cell):
                label = "EMAIL"
            elif self._ADDRESS_PATTERN.match(cell):
                label = "ADDRESS"
            elif self._DATETIME_PATTERN.match(cell):
                label = "DATETIME"

        # Text form of the cell; lets the length/word heuristics below work on
        # non-string cells (bools, arbitrary objects) instead of raising.
        text = cell if isinstance(cell, str) else str(cell)

        is_number = False
        if label is None:
            stripped = text.replace('.', '').replace(',', '').replace('%', '').replace('$', '')
            digit_count = len(re.findall(r'\d', stripped))
            is_number = (
                digit_count != 0
                and len(stripped) - digit_count < self._MAX_NON_DIGIT_CHARS
            )

        if is_number:
            label = "NUMBER"
        elif label != "None":
            # NOTE(review): these fallbacks also overwrite a label set by the
            # regexes above (e.g. "1:30" ends up "STRING", not "DATETIME").
            # Kept as-is; confirm whether that is intended.
            if len(text.split(" ")) >= self._LONG_TEXT_MIN_WORDS:
                label = "NOA"
            elif len(text) <= self._SHORT_STRING_MAX_LEN:
                label = "STRING"

        return label

    def classify_columns(self, df):
        """Return one type label per column of `df`, in column order.

        Args:
            df: any object whose ``.items()`` yields ``(name, cells)`` pairs
                with iterable ``cells`` (a pandas DataFrame, or a plain dict
                of lists).

        Returns:
            list with one entry per column: a label string, or None when no
            cell could be labelled before the column ran out or the
            unlabeled-cell budget was exhausted.

        Side effects:
            For every unlabeled cell that is inspected, prints the cell and
            the output of the module-level `nlp` pipeline for it. `nlp` must
            therefore exist whenever such a cell is reached.
        """
        col_type = []

        for _col_name, col_data in df.items():
            labels = []
            unlabeled_seen = 0

            for cell in col_data:
                label = self._label_cell(cell)

                if label is not None:
                    # NOTE(review): scanning stops at the first labelled cell,
                    # so `labels` never holds more than one entry and the
                    # majority vote below is a formality. Kept as-is.
                    labels.append(label)
                    break

                if unlabeled_seen > self._MAX_UNLABELED_CELLS:
                    labels.append(None)
                    break

                unlabeled_seen += 1
                # str() keeps the pipeline input textual for non-string cells.
                print(f"{cell} --> {nlp(str(cell))}")

            col_type.append(self.most_frequent_element(labels))

        return col_type




In [None]:
async def process_table(column_analysis, table_path, train_df, columns):
    """Classify the columns of one CSV table and append the rows to `train_df`.

    Args:
        column_analysis: object exposing ``classify_columns(df)``. The method
            may be synchronous or a coroutine, and may return either plain
            type labels (one per table column) or dicts keyed by `columns`.
        table_path: path of the CSV file to read.
        train_df: DataFrame accumulated so far; it is not modified in place.
        columns: column names of `train_df`; every appended row carries
            exactly these keys.

    Returns:
        A DataFrame holding the rows of `train_df` followed by one new row per
        classification result, or `train_df` itself when there are no results.
    """
    df = pd.read_csv(table_path)
    # NOTE(review): iloc[1:10] skips the first data row (the CSV header is
    # already consumed by read_csv) -- confirm that this is intended.
    sample = df.iloc[1:10]

    result = column_analysis.classify_columns(sample)
    # Awaiting unconditionally raised TypeError for a synchronous
    # classify_columns, which returns a plain list.
    if inspect.isawaitable(result):
        result = await result

    sample_columns = list(sample.columns)
    rows = []
    for position, entry in enumerate(result):
        if isinstance(entry, dict):
            rows.append({col: entry.get(col, None) for col in columns})
            continue

        # Plain label: results are positional, one per table column, so pair
        # the label with the column name at the same position.
        row = dict.fromkeys(columns)
        if "column_name" in row and position < len(sample_columns):
            row["column_name"] = sample_columns[position]
        if "column_type" in row:
            row["column_type"] = entry
        rows.append(row)

    if not rows:
        return train_df

    # One concat per table instead of one per row.
    new_rows = pd.DataFrame(rows, columns=columns)
    return pd.concat([train_df, new_rows], ignore_index=True)

async def main(tables_path):
    """Run `process_table` over every non-hidden entry of `tables_path`.

    Args:
        tables_path: directory whose entries are passed, one by one, to
            `process_table`. Entries whose name starts with a dot are skipped.

    Returns:
        The DataFrame returned by the last `process_table` call, or an empty
        DataFrame with the feature columns when the directory has no
        non-hidden entries.
    """
    analyzer = ColumnAnalysis()

    # Schema of the accumulated frame.
    columns = [
        'column_name', 'column_type', 'min_value', 'max_value', 'mean_value', 'std_dev', 'unique_count', 'special_values',
        'average_length', 'min_length', 'max_length', 'all_caps', 'capitalized', 'hyphens', 'periods', 'commas', 'common_prefixes', 'common_suffixes',
        'alphabetic_chars', 'digit_chars', 'special_chars', 'min_date', 'max_date', 'date_range', 'year_counts', 'month_counts',
        'valid_urls', 'address_count', 'valid_emails'
    ]

    train_df = pd.DataFrame(columns=columns)

    # Names beginning with a dot are treated as hidden and left out.
    hidden_name = re.compile(r'^\.')
    table_files = []
    for entry_name in os.listdir(tables_path):
        if hidden_name.match(entry_name):
            continue
        table_files.append(os.path.join(tables_path, entry_name))

    # Each call receives the frame built so far and returns the extended one.
    for table_file in tqdm(table_files):
        train_df = await process_table(analyzer, table_file, train_df, columns)

    return train_df

if __name__ == "__main__":
    tables_path = "./data/Dataset/Dataset/Round1_T2D/tables/"
    #tables_path = "./data/Dataset/Dataset/HardTablesR2/tables/"    
    #tables_path = "./data/Dataset/Dataset/Round3_2019/tables/"
    train_df = await (main(tables_path))