In [1]:
! pip install GLiNER

Collecting GLiNER
  Downloading gliner-0.2.13-py3-none-any.whl.metadata (7.3 kB)
Collecting torch>=2.0.0 (from GLiNER)
  Downloading torch-2.4.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting transformers>=4.38.2 (from GLiNER)
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.21.4 (from GLiNER)
  Downloading huggingface_hub-0.25.1-py3-none-any.whl.metadata (13 kB)
Collecting onnxruntime (from GLiNER)
  Downloading onnxruntime-1.19.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting sentencepiece (from GLiNER)
  Downloading sentencepiece-0.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting filelock (from huggingface-hub>=0.21.4->GLiNER)
  Downloading filelock-3.16.1-py3-none-any.whl.metadata (2.9 kB)
Collecting nvidia-cuda

In [2]:
import pandas as pd
import numpy as np
import re
import math
import asyncio
import aiohttp
from collections import Counter

# GLiNER related imports (assuming 'gliner' is a valid package)
from gliner import GLiNER


In [None]:
class ColumnAnalysis:

    def __init__(self):
        self.entity_type_dict = {
            "PERSON": "NE",
            "NORP": "NE",
            "FAC": "NE",
            "ORG": "NE",
            "GPE": "NE",
            "LOC": "NE",
            "PRODUCT": "NE",
            "EVENT": "NE",
            "WORK_OF_ART": "NE",
            "LAW": "NE",
            "LANGUAGE": "NE",
            "DATE": "LIT",
            "TIME": "LIT",
            "PERCENT": "LIT",
            "MONEY": "LIT",
            "QUANTITY": "LIT",
            "ORDINAL": "LIT",
            "CARDINAL": "LIT",
            "URL": "LIT",
            "DESC": "LIT",
            "TOKEN": "NE",
            "INTEGER": "LIT",
            "FLOAT": "LIT",
            "DATETIME": "LIT",
            "ADDRESS": "LIT",
            "EMAIL": "LIT"
        }

        self.LIT_DATATYPE = {
            "DATE": "DATETIME", 
            "TIME": "STRING", 
            "PERCENT": "STRING", 
            "MONEY": "STRING", 
            "QUANTITY": "STRING", 
            "ORDINAL": "NUMBER", 
            "CARDINAL": "NUMBER", 
            "URL": "STRING",
            "DESC": "STRING",
            "TOKEN": "STRING",
            "INTEGER": "NUMBER",
            "FLOAT": "NUMBER",
            "DATETIME": "DATETIME",
            "ADDRESS": "STRING",
            "EMAIL": "STRING",
            "STRING": "STRING"
        }

        self.NE_DATATYPE = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LAW", "LANGUAGE"]
    
    def most_frequent_element(self, input_list):
        counter = Counter(input_list)
        most_common = counter.most_common(1)
        return most_common[0][0] if most_common else None

    def extract_number_features(self, column):
        try:
            col = pd.to_numeric(column, errors='coerce')
            return {
                'min_value': np.min(col),
                'max_value': np.max(col),
                'mean_value': np.mean(col),
                'std_dev': np.std(col),
                'unique_count': len(set(col))
            }
        except Exception as e:
            print(f"Error extracting number features: {e}")
            return {}

    def extract_named_entity_features(self, column):
        lengths = [len(str(entry)) for entry in column]
        features = {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'all_caps': sum(1 for entry in column if str(entry).isupper()),
            'capitalized': sum(1 for entry in column if str(entry).istitle()),
            'hyphens': sum(str(entry).count('-') for entry in column),
            'periods': sum(str(entry).count('.') for entry in column),
            'commas': sum(str(entry).count(',') for entry in column)
        }
        return features

    def extract_string_features(self, column):
        lengths = [len(str(entry)) for entry in column]
        features = {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'all_caps': sum(1 for entry in column if str(entry).isupper()),
            'capitalized': sum(1 for entry in column if str(entry).istitle()),
            'alphabetic_chars': sum(char.isalpha() for entry in column for char in str(entry)),
            'digit_chars': sum(char.isdigit() for entry in column for char in str(entry)),
            'special_chars': sum(not char.isalnum() for entry in column for char in str(entry))
        }
        return features

    def extract_datetime_features(self, column):
        dates = pd.to_datetime(column, errors='coerce')
        features = {
            'min_date': dates.min(),
            'max_date': dates.max(),
            'year_counts': dates.dt.year.value_counts().to_dict(),
            'month_counts': dates.dt.month.value_counts().to_dict()
        }
        return features

    def extract_url_features(self, column):
        url_pattern = re.compile(r'^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.IGNORECASE)
        lengths = [len(str(entry)) for entry in column]
        features = {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'valid_urls': sum(1 for entry in column if re.match(url_pattern, str(entry)))
        }
        return features

    def extract_address_features(self, column):
        address_pattern = re.compile(r'\d+\s+\w+\s+(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|place|pl)\.?\s*\w*', re.IGNORECASE)
        lengths = [len(str(entry)) for entry in column]
        features = {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'address_count': sum(1 for entry in column if re.match(address_pattern, str(entry)))
        }
        return features

    def extract_email_features(self, column):
        email_pattern = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', re.IGNORECASE)
        lengths = [len(str(entry)) for entry in column]
        features = {
            'average_length': np.mean(lengths) if lengths else 0,
            'min_length': np.min(lengths) if lengths else 0,
            'max_length': np.max(lengths) if lengths else 0,
            'valid_emails': sum(1 for entry in column if re.match(email_pattern, str(entry)))
        }
        return features
    
    async def fetch_entity(self, session, cell):
        if cell is None or pd.isna(cell):
            return None
        cell = str(cell)
        url = 'https://lamapi.hel.sintef.cloud/lookup/entity-retrieval'
        params = {
            'name': cell,
            'token': 'lamapi_demo_2023',
            'kg': 'wikidata',
            'limit': 1000,
            #'query': f'{{"query": {{"bool": {{"must": [{{"match": {{"name": {{"query": "{cell}", "boost": 2.0}}}}}}]}}}}}}',
            #'sort': [
            #    f'''{{"popularity": {{"order": "desc"}}}}'''
            #]
        }
        async with session.get(url, params=params, ssl=False, timeout=100) as response:
            if response.status == 200:
                return await response.json()
            return None

    async def classify_columns_async(self, df):
        def combine_scores(j_score, ed_score, w1=0.5, w2=0.5):
            return w1 * j_score + w2 * ed_score

        url_pattern = re.compile(r'^(https?|ftp)://[^\s/$.?#].[^\s]*$', re.IGNORECASE)
        email_pattern = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$', re.IGNORECASE)
        address_pattern = re.compile(r'\d+\s+\w+\s+(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr|court|ct|circle|cir|place|pl)\.?\s*\w*', re.IGNORECASE)
        datetime_pattern = re.compile(
            r'(?:\d{4}-\d{2}-\d{2})'  # YYYY-MM-DD format
            r'|(?:31(?:\/|-|\.)0?[13578]|1[02](?:\/|-|\.)\d{4})'  # 31 days months
            r'|(?:29|30(?:\/|-|\.)0?[1,3-9]|1[0-2](?:\/|-|\.)\d{4})'  # 29/30 days months
            r'|(?:0?[1-9]|[12]\d|3[01])(?:\/|-|\.)'  # Day
            r'(?:0?[1-9]|1[0-2])(?:\/|-|\.)\d{4}'  # Month
            r'|(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/(?:\d{2})'  # MM/DD/YY format
            r'|(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12]\d|3[01])/\d{2}'  # MM/DD/YY format
            r'\b\d{2}/(?:0?[1-9]|[12]\d|3[01])/(?:0?[1-9]|1[0-2])\b'  # YY/DD/MM format
            r'|(?:[01]?\d|2[0-3]):[0-5]\d\.[0-5]\d'  # HH:MM.SS format
            r'|(?:[01]?\d|2[0-3]):[0-5]\d'  # HH:MM format
            r'|(?:[0-5]?\d):[0-5]\d(?:\.\d{1,2})?'  # H:MM or H:MM.S format
            r'|(?:2[0-3]|[01]?\d)h[0-5]?\d(?:m[0-5]?\d(?:\.\d{1,2})?s)?',  # HhMMmSSs format
            re.IGNORECASE
        )

        col_type = []
        feature_list = []

        async with aiohttp.ClientSession() as session:
            for col_name, col_data in df.items():
                type = []
                count_cell = 0

                for cell in col_data:
                    label = None
                    is_number = False
    
                    try:
                        if math.isnan(cell):
                            label = "None"
                    except:
                        pass
                        
                    if isinstance(cell, str):
                        if cell == "NaN" or cell == "nan":
                            label = "None"
                        elif re.match(url_pattern, cell):
                            label = "URL"
                        elif re.match(email_pattern, cell):
                            label = "EMAIL"
                        elif re.match(address_pattern, cell):
                            label = "ADDRESS"
                        elif re.match(datetime_pattern, cell):
                            label = "DATETIME"
                    
                    if label is None:  # if it's none of the types below
                        try:
                            cell_str = str(cell)
                            if ',' in cell_str or '.' in cell_str or '%' in cell_str or '$' in cell_str:
                                cell_str = cell_str.replace('.', '').replace(',', '').replace('%', '').replace('$', '')
                            if len(cell_str) - len(re.findall(r'\d', cell_str)) < 5 and len(re.findall(r'\d', cell_str)) != 0:
                                is_number = True
                        except:
                            pass
                    
                    if is_number:
                        label = "NUMBER"
                    elif isinstance(cell, bool):
                        label = "STRING"
                    elif label != "None" and len(cell.split(" ")) >= 15:
                        label = "NOA"
                    elif label != "None" and len(cell.split(" ")) >= 1 and len(cell) <= 4:
                        label = "STRING"
                    
                    if label is not None:
                        type.append(label)
                    else:
                        if count_cell > 5:
                            type.append("STRING")
                            break  
                        else:                
                            tasks = [self.fetch_entity(session, cell) for cell in col_data if cell is not None and count_cell <= 5]
                            responses = await asyncio.gather(*tasks)
                            
                            for cell, data in zip(col_data, responses):
                                #print(f"{cell}-->{data[0]}")
                                try:
                                    if data and len(data) > 0 and data[0]['NERtype'] != None:
                                        if combine_scores(data[0]['jaccard_score'], data[0]['ed_score']) >= 0.7:
                                            #type.append(f"NE_{data[0]['NERtype']}")
                                            type.append(f"NE")
                                    else:
                                        # if you didn't find a NER type for this i2tem
                                        type.append("STRING")
                                    count_cell += 1
                                except:
                                    continue

                most_common_type = self.most_frequent_element(type)
                col_type.append(most_common_type)

                if most_common_type == "NUMBER":
                    features = self.extract_number_features(col_data)
                elif most_common_type in ['NE_PERS', 'NE_LOC', 'NE_ORG', 'NE_OTHERS']:
                    features = self.extract_named_entity_features(col_data)
                elif most_common_type == "STRING" or most_common_type == "NOA":
                    features = self.extract_string_features(col_data)
                elif most_common_type == "DATETIME":
                    features = self.extract_datetime_features(col_data)
                elif most_common_type == "URL":
                    features = self.extract_url_features(col_data)
                elif most_common_type == "ADDRESS":
                    features = self.extract_address_features(col_data)
                elif most_common_type == "EMAIL":
                    features = self.extract_email_features(col_data)
                else:
                    features = {}

                features['column_name'] = col_name
                features['column_type'] = most_common_type
                return most_common_type

        return feature_list

    def classify_columns(self, df):
        loop = asyncio.get_event_loop()
        return loop.run_until_complete(self.classify_columns_async(df))


In [3]:
# df1 and df2 are the same

df1 = pd.read_csv("./R1_train_df.csv")
df2 = pd.read_csv("./R3_train_df.csv")
df3 = pd.read_csv("./R4_train_df.csv")
df4 = pd.read_csv("./HT2_train_df.csv")

# filtering because otherwise the model gets values too high
df3 = df3[(df3['max_value'] <= 1.000000e+10) ]

result = pd.concat([df3, df4, df1, df2], axis=0)
result.drop(['date_range', 'year_counts', 'month_counts'], axis=1, inplace=True)


# Convert the target variable to numeric
label_encoder = LabelEncoder()

formats = ['%Y-%m-%d', '%Y-%m-%d %H:%M:%S']

for fmt in formats:
    result['min_date'] = pd.to_datetime(result['min_date'], format=fmt, errors='coerce')
    result['max_date'] = pd.to_datetime(result['max_date'], format=fmt, errors='coerce')


result['min_month'] = result['min_date'].dt.month
result['min_year'] = result['min_date'].dt.year
result['max_month'] = result['max_date'].dt.month
result['max_year'] = result['max_date'].dt.year
result = result.dropna(subset=['column_type'])


result.iloc[:, 2:26] = result.iloc[:, 2:26].fillna(-1) 
result.iloc[:, 26:30] = result.iloc[:, 26:30].fillna(0)  # fill the ['min_month', 'min_year', 'max_month', 'max_year'] 

X = result.drop(['max_date', 'min_date', 'column_name', 'column_type'], axis=1)  # Drop the target column from features
y = label_encoder.fit_transform(result['column_type'].values)

# One-hot encode the target variable for multiclass classification
y = to_categorical(y)


# see how imbalanced is the dataset
result.groupby('column_type').size().reset_index(name='count')


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')  # Output layer with softmax activation for multiclass classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')


NameError: name 'LabelEncoder' is not defined

# NE - LIT model

In [18]:
##############################
##      SOME EXAMPLES       ##
##############################


datetime_strings = [
    "2023-02-15 08:30:00",
    "2023-03-20 18:45:00",
    "2023-04-10 10:00:00",
    "2023-05-05 14:20:00"
]

people = [
    "John Smith",
    "Mary Johnson",
    "James Williams",
    "Patricia Brown",
    "Michael Davis",
    "Jennifer Miller",
    "William Wilson",
    "Linda Moore",
    "David Taylor",
    "Barbara Anderson"
]

cities = [
    "Tokyo lake",
    "New York City lake",
    "Paris lake",
    "London lake",
    "Dubai lake",
    "Singapore",
    "Sydney",
    "Berlin",
    "Hong Kong",
    "Rio de Janeiro"
]

data = [
    "Shadows moving quietly, secrets hidden in the night.",
    "Warm sounds lingering like sunlight.",
    "Quiet thoughts drifting like clouds.",
    "The soft shift between day and night.",
    "Delicate, clear sounds like winter chimes.",
    "Faint murmurs, as soft as moonlight.",
    "A cool breeze carrying distant dreams.",
    "Dreams that echo long after waking.",
    "A calm horizon meeting the earth.",
    "Flames flickering in a graceful rhythm."
]

In [2]:
columns = [
    'min_value', 'max_value', 'mean_value', 'std_dev', 'unique_count', 'special_values',
    'average_length', 'min_length', 'max_length', 'all_caps', 'capitalized', 'hyphens', 'periods', 'commas', 'common_prefixes', 'common_suffixes',
    'alphabetic_chars', 'digit_chars', 'special_chars', 'valid_urls', 'address_count', 'valid_emails', 'min_date',
    'max_date', 'date_range', 'year_counts', 'month_counts'
]

# Initialize a single GLiNER model instance
model = GLiNER.from_pretrained("urchade/gliner_base")


# Define your label mapping
label_mapping = {
    "person": "PERS",
    "organization": "ORG",
    "location": "LOC",
    "film": "OTHERS",
    "others": "OTHERS",
    "videogames": "OTHERS",
    "date": "OTHERS",
    "galaxy": "OTHERS",
    "species": "OTHERS"
}

# Define the labels that the model should predict
new_labels = [
    "person", "organization", "location", "others", "film", "videogames", "species", "date", "galaxy"
]

# Define the prediction function
def predict_ner_types(text_list):
    results = []

    for text in text_list:
        # Predict entities for the text
        entities = model.predict_entities(text, new_labels, threshold=0.3)

        # Map the predicted entities to the target labels
        mapped_entities = []
        for entity in entities:
            mapped_entities.append({
                'text': text,
                'entity_text': entity['text'],
                'start': entity['start'],
                'end': entity['end'],
                'prediction': label_mapping.get(entity['label'], 'UNKNOWN')
            })

        results.extend(mapped_entities)

    return pd.DataFrame(results)
    

NameError: name 'GLiNER' is not defined

In [None]:
##############################
##      THE TABLES          ##
##############################

import pandas as pd
import glob
import os
from tqdm import tqdm

# Define the directory
base_dir = r'.\data\Dataset\Dataset\2T_Round4\tables'

# Use glob to find all CSV files in the directory and subdirectories
csv_files = glob.glob(os.path.join(base_dir, '**', '*.csv'), recursive=True)

# number of items per column to use in the model
len_param = 25

# Loop through the list of files with tqdm progress bar
for file in tqdm(csv_files, desc='Reading CSV files', unit='file'):
    df = pd.read_csv(file, header=None)
    print(f"{file}: ")
    for column in combined_df.columns:
        if len(column) <= len_param:
            c_kind = await column_analysis.classify_columns_async(column)
        else:
            c_kind = await column_analysis.classify_columns_async(column[:len_param])
            
        if c_kind != 'NE':
            print(f"Kind of columns: {c_kind}")
        else:
            # Get predictions
            result_df = predict_ner_types(cities)
            
            # Display the result
            counter = Counter(result_df['prediction'])
            most_common_elements = counter.most_common(1)[0]
            
            print(f"Kind of columns: {c_kind}_{most_common_elements[0]}")

    print("___________________________________")


## Testing single columns

In [34]:

#df = pd.DataFrame({'desc': ["minore di 3 anni" for i in range(0, 10)]})
#df = pd.DataFrame({'desc': datetime_strings})
df = pd.DataFrame({'desc': cities})
column_analysis = ColumnAnalysis()
df_feat = await column_analysis.classify_columns_async(df)

columns = [
    'min_value', 'max_value', 'mean_value', 'std_dev', 'unique_count', 'special_values',
    'average_length', 'min_length', 'max_length', 'all_caps', 'capitalized', 'hyphens', 'periods', 'commas', 'common_prefixes', 'common_suffixes',
    'alphabetic_chars', 'digit_chars', 'special_chars', 'valid_urls', 'address_count', 'valid_emails', 'min_date',
    'max_date', 'date_range', 'year_counts', 'month_counts'
]


In [35]:
df_feat

'NE'

In [None]:
# Initialize a single GLiNER model instance
model = GLiNER.from_pretrained("urchade/gliner_base")


In [39]:
# Define your label mapping
label_mapping = {
    "person": "PERS",
    "organization": "ORG",
    "location": "LOC",
    "film": "OTHERS",
    "others": "OTHERS",
    "videogames": "OTHERS",
    "date": "OTHERS",
    "galaxy": "OTHERS",
    "species": "OTHERS"
}

# Define the labels that the model should predict
new_labels = [
    "person", "organization", "location", "others", "film", "videogames", "species", "date", "galaxy"
]

# Define the prediction function
def predict_ner_types(text_list):
    results = []

    for text in text_list:
        # Predict entities for the text
        entities = model.predict_entities(text, new_labels, threshold=0.3)

        # Map the predicted entities to the target labels
        mapped_entities = []
        for entity in entities:
            mapped_entities.append({
                'text': text,
                'entity_text': entity['text'],
                'start': entity['start'],
                'end': entity['end'],
                'prediction': label_mapping.get(entity['label'], 'UNKNOWN')
            })

        results.extend(mapped_entities)

    return pd.DataFrame(results)

# Get predictions
result_df = predict_ner_types(cities)

# Display the result
counter = Counter(result_df['prediction'])
most_common_elements = counter.most_common(1)[0]

print(most_common_elements[0])

LOC


In [38]:
cities

['Tokyo lake',
 'New York City lake',
 'Paris lake',
 'London lake',
 'Dubai lake',
 'Singapore',
 'Sydney',
 'Berlin',
 'Hong Kong',
 'Rio de Janeiro']