In [None]:
import numpy as np
import tqdm
import requests
import pandas as pd
import os
import numpy as np
import random
import json
from collections import Counter
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
import math

In [None]:
class ColumnAnalysis:
    """Infer a coarse type for each DataFrame column and extract type-specific features.

    A column's type is decided by sampling its cells in order:

    * missing cells (``None``, NaN/NaT, or the strings ``"NaN"``/``"nan"``)
      are skipped;
    * the first cell that can be labelled locally (URL, EMAIL, ADDRESS,
      DATETIME, NUMBER, NOA, STRING) ends the sampling;
    * cells that cannot be labelled locally are sent to a remote
      entity-retrieval service (at most ``_MAX_LOOKUPS`` per column), each
      successful lookup adding one vote.

    The most frequent vote becomes the column type.
    """

    # Remote entity-retrieval endpoint and the fixed query-string values
    # sent with every lookup.
    _LOOKUP_URL = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
    _LOOKUP_TOKEN = 'lamapi_demo_2023'
    _LOOKUP_KG = 'wikidata'
    _LOOKUP_LIMIT = 10
    # Seconds to wait for the service before giving up on one lookup.
    _LOOKUP_TIMEOUT = 10
    # Number of remote lookups allowed per column before sampling stops.
    _MAX_LOOKUPS = 6
    # Minimum combined (jaccard + edit-distance) score to accept an entity type.
    _NE_SCORE_THRESHOLD = 0.7
    # Column types that get named-entity features.
    _NE_COLUMN_TYPES = ('NE_PERS', 'NE_LOC', 'NE_ORG', 'NE_OTHERS')

    # Patterns are compiled once at class creation instead of on every call.
    _URL_PATTERN = re.compile(r'^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.IGNORECASE)
    _EMAIL_PATTERN = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', re.IGNORECASE)
    _ADDRESS_PATTERN = re.compile(r'\d+\s+\w+\s+(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|place|pl)\.?\s*\w*', re.IGNORECASE)
    # The day/month alternatives are explicitly grouped so that "29", "30",
    # "31.1" or "10/2020" on their own no longer count as a date, and the
    # YY/DD/MM alternative is a real alternative (it used to be glued onto
    # the previous one, which made it unmatchable).
    _DATETIME_PATTERN = re.compile(
        r'(?:\d{4}-\d{2}-\d{2})'  # YYYY-MM-DD format
        r'|(?:31(?:\/|-|\.)(?:0?[13578]|1[02])(?:\/|-|\.)\d{4})'  # 31st of a 31-day month
        r'|(?:(?:29|30)(?:\/|-|\.)(?:0?[13-9]|1[0-2])(?:\/|-|\.)\d{4})'  # 29th/30th, any month but February
        r'|(?:0?[1-9]|[12]\d|3[01])(?:\/|-|\.)'  # Day
        r'(?:0?[1-9]|1[0-2])(?:\/|-|\.)\d{4}'  # Month and 4-digit year
        r'|(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/(?:\d{2})'  # MM/DD/YY format
        r'|\b\d{2}/(?:0?[1-9]|[12]\d|3[01])/(?:0?[1-9]|1[0-2])\b'  # YY/DD/MM format
        r'|(?:[01]?\d|2[0-3]):[0-5]\d\.[0-5]\d'  # HH:MM.SS format
        r'|(?:[01]?\d|2[0-3]):[0-5]\d'  # HH:MM format
        r'|(?:[0-5]?\d):[0-5]\d(?:\.\d{1,2})?'  # H:MM or H:MM.S format
        r'|(?:2[0-3]|[01]?\d)h[0-5]?\d(?:m[0-5]?\d(?:\.\d{1,2})?s)?',  # HhMMmSSs format
        re.IGNORECASE
    )
    # Checked in this order; the first pattern that matches wins.
    _STRING_PATTERNS = (
        ("URL", _URL_PATTERN),
        ("EMAIL", _EMAIL_PATTERN),
        ("ADDRESS", _ADDRESS_PATTERN),
        ("DATETIME", _DATETIME_PATTERN),
    )
    _DIGIT_PATTERN = re.compile(r'\d')
    # Characters ignored when deciding whether a cell "looks numeric".
    _NUMERIC_PUNCTUATION = str.maketrans('', '', '.,%$')

    def __init__(self):
        # Maps a fine-grained tag to "NE" (named entity) or "LIT" (literal).
        self.entity_type_dict = {
            "PERSON": "NE",
            "NORP": "NE",
            "FAC": "NE",
            "ORG": "NE",
            "GPE": "NE",
            "LOC": "NE",
            "PRODUCT": "NE",
            "EVENT": "NE",
            "WORK_OF_ART": "NE",
            "LAW": "NE",
            "LANGUAGE": "NE",
            "DATE": "LIT",
            "TIME": "LIT",
            "PERCENT": "LIT",
            "MONEY": "LIT",
            "QUANTITY": "LIT",
            "ORDINAL": "LIT",
            "CARDINAL": "LIT",
            "URL": "LIT",
            "DESC": "LIT",
            "TOKEN": "NE",
            "INTEGER": "LIT",
            "FLOAT": "LIT",
            "DATETIME": "LIT",
            "ADDRESS": "LIT",
            "EMAIL": "LIT"
        }

        # Maps a literal tag to its storage datatype.
        self.LIT_DATATYPE = {
            "DATE": "DATETIME",
            "TIME": "STRING",
            "PERCENT": "STRING",
            "MONEY": "STRING",
            "QUANTITY": "STRING",
            "ORDINAL": "NUMBER",
            "CARDINAL": "NUMBER",
            "URL": "STRING",
            "DESC": "STRING",
            "TOKEN": "STRING",
            "INTEGER": "NUMBER",
            "FLOAT": "NUMBER",
            "DATETIME": "DATETIME",
            "ADDRESS": "STRING",
            "EMAIL": "STRING",
            "STRING": "STRING"
        }

        # Tags treated as named entities.
        self.NE_DATATYPE = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE"]

    def most_frequent_element(self, input_list):
        """Return the most common item of ``input_list``, or ``None`` if it is empty."""
        ranked = Counter(input_list).most_common(1)
        return ranked[0][0] if ranked else None

    def extract_number_features(self, column):
        """Return min/max/mean/std/unique-count statistics for a numeric column.

        Values that cannot be parsed as numbers are coerced to NaN and ignored
        by every statistic. ``std_dev`` is the population standard deviation
        (``ddof=0``). Best effort: on failure the error is printed and an
        empty dict is returned.
        """
        try:
            col = pd.Series(pd.to_numeric(column, errors='coerce'))
            return {
                'min_value': col.min(),
                'max_value': col.max(),
                'mean_value': col.mean(),
                'std_dev': col.std(ddof=0),
                # nunique() ignores NaN; len(set(col)) counted every NaN as
                # its own value because NaN never compares equal to NaN.
                'unique_count': col.nunique()
            }
        except Exception as e:
            print(f"Error extracting number features: {e}")
            return {}

    @staticmethod
    def _length_and_case_features(entries):
        """Return the length and letter-case counts shared by the text extractors."""
        lengths = [len(entry) for entry in entries]
        return {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'all_caps': sum(1 for entry in entries if entry.isupper()),
            'capitalized': sum(1 for entry in entries if entry.istitle()),
        }

    def extract_named_entity_features(self, column):
        """Return length, casing and punctuation counts for a named-entity column."""
        # Stringify once; this also lets ``column`` be a one-shot iterable.
        entries = [str(entry) for entry in column]
        features = self._length_and_case_features(entries)
        features['hyphens'] = sum(entry.count('-') for entry in entries)
        features['periods'] = sum(entry.count('.') for entry in entries)
        features['commas'] = sum(entry.count(',') for entry in entries)
        return features

    def extract_string_features(self, column):
        """Return length, casing and character-class counts for a string column."""
        entries = [str(entry) for entry in column]
        features = self._length_and_case_features(entries)
        features['alphabetic_chars'] = sum(char.isalpha() for entry in entries for char in entry)
        features['digit_chars'] = sum(char.isdigit() for entry in entries for char in entry)
        features['special_chars'] = sum(not char.isalnum() for entry in entries for char in entry)
        return features

    def extract_datetime_features(self, column):
        """Return the date span and per-year / per-month counts of a column.

        Values that cannot be parsed become NaT and are left out of the
        counts. When nothing parses, ``min_date``/``max_date`` are NaT.
        """
        dates = pd.Series(pd.to_datetime(column, errors='coerce'))
        earliest = dates.min()
        latest = dates.max()
        # Drop NaT before counting so the year/month keys stay integers
        # instead of being promoted to floats.
        parsed = dates.dropna()
        features = {
            'min_date': earliest,
            'max_date': latest,
            'date_range': (latest - earliest).days,
            'year_counts': parsed.dt.year.value_counts().to_dict(),
            'month_counts': parsed.dt.month.value_counts().to_dict()
        }
        return features

    @staticmethod
    def _combine_scores(j_score, ed_score, w1=0.5, w2=0.5):
        """Return the weighted sum of the jaccard and edit-distance scores."""
        return w1 * j_score + w2 * ed_score

    @staticmethod
    def _is_missing(cell):
        """Return True for None, NaN/NaT/NA and the strings "NaN" / "nan"."""
        if cell is None:
            return True
        if isinstance(cell, str):
            return cell == "NaN" or cell == "nan"
        try:
            missing = pd.isna(cell)
        except (TypeError, ValueError):
            return False
        # pd.isna returns an array for list-like cells; only a scalar answer
        # tells us the cell itself is missing.
        return isinstance(missing, (bool, np.bool_)) and bool(missing)

    def _classify_cell(self, cell):
        """Label one non-missing cell without network access.

        Returns "NUMBER", "NOA", "STRING", "URL", "EMAIL", "ADDRESS" or
        "DATETIME", or ``None`` when the cell needs a remote lookup.
        """
        label = None
        if isinstance(cell, str):
            text = cell
            for name, pattern in self._STRING_PATTERNS:
                if pattern.match(text):
                    label = name
                    break
        elif isinstance(cell, pd.Timestamp):
            return "DATETIME"
        else:
            # Non-string cells used to crash on ``cell.split``; classify them
            # through their text representation instead.
            text = str(cell)

        if label is None:
            # "Looks numeric": at least one digit and fewer than five other
            # characters once '.', ',', '%' and '$' are ignored.
            stripped = text.translate(self._NUMERIC_PUNCTUATION)
            digit_count = len(self._DIGIT_PATTERN.findall(stripped))
            if digit_count != 0 and len(stripped) - digit_count < 5:
                return "NUMBER"

        # These length rules deliberately take precedence over a pattern
        # match, as in the original implementation.
        if len(text.split(" ")) >= 15:
            return "NOA"
        if len(text) <= 4:
            return "STRING"
        return label

    def _lookup_entity_label(self, cell):
        """Ask the entity-retrieval service for the type of ``cell``.

        Returns ``"NE_<NERtype>"`` when the top candidate has a type and a
        combined score of at least ``_NE_SCORE_THRESHOLD``, ``"STRING"`` when
        the service has no typed candidate, and ``None`` (no vote) when the
        candidate scores too low or the request fails.
        """
        # json.dumps escapes quotes/backslashes in the cell; interpolating
        # the raw cell into the query produced invalid JSON for such values.
        query = json.dumps(
            {"query": {"bool": {"must": [{"match": {"name": {"query": str(cell), "boost": 2.0}}}]}}},
            ensure_ascii=False,
        )
        params = {
            'name': cell,
            'token': self._LOOKUP_TOKEN,
            'kg': self._LOOKUP_KG,
            'limit': self._LOOKUP_LIMIT,
            'query': query,
            'sort': [
                '{"popularity": {"order": "desc"}}'
            ]
        }

        try:
            response = requests.get(self._LOOKUP_URL, params=params, timeout=self._LOOKUP_TIMEOUT)
        except requests.RequestException as e:
            # A network failure costs this cell its vote but must not abort
            # the whole classification.
            print(f"Entity lookup failed for {cell!r}: {e}")
            return None
        if response.status_code != 200:
            return None

        try:
            data = response.json()
        except ValueError as e:
            print(f"Entity lookup returned invalid JSON for {cell!r}: {e}")
            return None
        if not isinstance(data, list):
            return None

        top = data[0] if data else None
        ner_type = top.get('NERtype') if isinstance(top, dict) else None
        if ner_type is None:
            return "STRING"

        j_score = top.get('jaccard_score')
        ed_score = top.get('ed_score')
        if j_score is None or ed_score is None:
            return None
        # Accept the entity type only when the equally weighted mean of the
        # two similarity scores reaches the threshold.
        if self._combine_scores(j_score, ed_score) >= self._NE_SCORE_THRESHOLD:
            return f"NE_{ner_type}"
        return None

    def _infer_column_type(self, col_data):
        """Return the most frequent label voted by the sampled cells of a column."""
        votes = []
        lookups = 0
        saw_missing = False
        saw_value = False

        for cell in col_data:
            if self._is_missing(cell):
                # A missing cell says nothing about the column type, so keep
                # sampling instead of letting it decide the column.
                saw_missing = True
                continue
            saw_value = True

            label = self._classify_cell(cell)
            if label is not None:
                votes.append(label)
                break

            if lookups >= self._MAX_LOOKUPS:
                votes.append(None)
                break
            lookups += 1
            vote = self._lookup_entity_label(cell)
            if vote is not None:
                votes.append(vote)

        if saw_missing and not saw_value:
            # Entirely missing column: keep the historical "None" type.
            return "None"
        return self.most_frequent_element(votes)

    def _extract_features(self, column_type, col_data):
        """Dispatch to the feature extractor that matches ``column_type``."""
        if column_type == "NUMBER":
            return self.extract_number_features(col_data)
        if column_type in self._NE_COLUMN_TYPES:
            return self.extract_named_entity_features(col_data)
        if column_type == "STRING":
            return self.extract_string_features(col_data)
        if column_type == "DATETIME":
            return self.extract_datetime_features(col_data)
        return {}

    def classify_columns(self, df):
        """Classify every column of ``df`` and return one feature dict per column.

        Each dict holds the type-specific features (possibly none) plus
        ``'column_name'`` and ``'column_type'``. Cells that cannot be labelled
        locally trigger HTTP requests to the entity-retrieval service.
        """
        feature_list = []

        for col_name, col_data in df.items():
            column_type = self._infer_column_type(col_data)
            features = self._extract_features(column_type, col_data)
            features['column_name'] = col_name
            features['column_type'] = column_type
            feature_list.append(features)

        return feature_list


