In [5]:
! pip install spacy
! python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [18]:
import pandas as pd
import os
import numpy as np
import random
import json
import re
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import spacy

from collections import defaultdict

In [2]:
median_lengths = []
median_token_counts = []
columns = []
target = []
average_numeric_counts = []

#spaCy functions
nlp = spacy.load("en_core_web_sm")

In [7]:
import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Define the text you want to classify
text = "Apple is looking at buying U.K. startup for $1 billion. John Doe will lead the acquisition in London."

# Process the text with the spaCy model
doc = nlp(text)

# Create a dictionary to store entities by type
entities = {"ORG": [], "PERS": [], "LOC": [], "OTHERS": []}

# Extract entities and classify them
for ent in doc.ents:
    if ent.label_ == "ORG":
        entities["ORG"].append(ent.text)
    elif ent.label_ == "PERSON":
        entities["PERS"].append(ent.text)
    elif ent.label_ == "GPE":  # GPE (Geopolitical Entity) includes locations like cities, countries, etc.
        entities["LOC"].append(ent.text)
    else:
        entities["OTHERS"].append(ent.text)

# Print the classified entities
print("Organizations:", entities["ORG"])
print("Persons:", entities["PERS"])
print("Locations:", entities["LOC"])
print("Other Entities:", entities["OTHERS"])


Organizations: ['Apple']
Persons: ['John Doe']
Locations: ['U.K.', 'London']
Other Entities: ['$1 billion']


## Round1_T2D

In [8]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round1_T2D_f3_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R1_sorted_mentions = json.load(file)

R1_cea = [item[0]for item in R1_sorted_mentions]

In [17]:
categories = [
    "Place",
    "PopulatedPlace",
    "City",
    "Country",
    "Region",
    "Mountain",
    "Island",
    "Lake",
    "River",
    "Park",
    "Building",
    "HistoricPlace",
    "Monument",
    "Bridge",
    "Road",
    "Airport",
    "Person",
    "Artist",
    "Athlete",
    "Politician",
    "Scientist",
    "Writer",
    "Actor",
    "Musician",
    "MilitaryPerson",
    "Religious",
    "Royalty",
    "Criminal",
    "Organisation",
    "Company",
    "EducationalInstitution",
    "PoliticalParty",
    "SportsTeam",
    "Non-ProfitOrganisation",
    "GovernmentAgency",
    "ReligiousOrganisation",
    "Band",
    "Library",
    "Museum",
    "Hospital",
    "University",
    "TradeUnion"
]

# Mapping of subtypes to macro classes
mapping = {
    "Place": ["PopulatedPlace", "City", "Country", "Region", "Mountain", "Island", "Lake", "River", "Park", "Building", "HistoricPlace", "Monument", "Bridge", "Road", "Airport"],
    "Person": ["Artist", "Athlete", "Politician", "Scientist", "Writer", "Actor", "Musician", "MilitaryPerson", "Religious", "Royalty", "Criminal"],
    "Organisation": ["Company", "EducationalInstitution", "PoliticalParty", "SportsTeam", "Non-ProfitOrganisation", "GovernmentAgency", "ReligiousOrganisation", "Band"],
    "Institution": ["Library", "Museum", "Hospital", "University", "TradeUnion"]
}

In [None]:
tables = "./data/Dataset/Dataset/Round1_T2D/tables/"

def count_numbers_in_string(s):
    return len(re.findall(r'\d+', str(s)))

median_lengths = []
median_token_counts = []
average_numeric_counts = []
target = []
columns = []

# Iterate through each table
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    
    for col in df.columns:
        column = df[col].astype(str)
        
        # Calculate median length for the current column
        median_length = column.apply(len).median()
        median_lengths.append(median_length)
        
        # Calculate median token count for the current column
        median_token_count = column.apply(lambda x: len(x.split())).median()
        median_token_counts.append(median_token_count)
        
        # Calculate average count of numeric values in the current column
        total_numeric_count = column.apply(count_numbers_in_string).sum()
        average_numeric_count = total_numeric_count / len(df) if len(df) > 0 else 0
        average_numeric_counts.append(average_numeric_count)

        
        # Check for NE flag
        NE_flag = column.isin(R1_cea).any()
        if NE_flag:
            joined_cells = column.str.cat()        
            doc = nlp(joined_cells)
            entities = {"ORG": [], "PERS": [], "LOC": [], "OTHERS": []}

            # Extract entities and classify them
            for ent in doc.ents:
                if ent.label_ == "ORG":
                    entities["ORG"].append(ent.text)
                elif ent.label_ == "PERSON":
                    entities["PERS"].append(ent.text)
                elif ent.label_ == "GPE":  # GPE (Geopolitical Entity) includes locations like cities, countries, etc.
                    entities["LOC"].append(ent.text)
                else:
                    entities["OTHERS"].append(ent.text)
            
            # Print the classified entities
            print("Organizations:", entities["ORG"])
            print("Persons:", entities["PERS"])
            print("Locations:", entities["LOC"])
            print("Other Entities:", entities["OTHERS"])
                    
            target.append("NE")
        elif median_length - average_numeric_count < 2:
            target.append("lit")
        else:
            target.append("None")
    
    columns.extend(df.columns.tolist())

## Round3_2019

In [8]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round3_2019_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R3_sorted_mentions = json.load(file)

R3_cea = [item[0]for item in R3_sorted_mentions]

In [9]:
tables = "./data/Dataset/Dataset/Round3_2019/tables/"

def count_numbers_in_string(s):
    return len(re.findall(r'\d+', str(s)))

median_lengths = []
median_token_counts = []
average_numeric_counts = []
target = []
columns = []

# Iterate through each table
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    
    for col in df.columns:
        column = df[col].astype(str)
        
        # Calculate median length for the current column
        median_length = column.apply(len).median()
        median_lengths.append(median_length)
        
        # Calculate median token count for the current column
        median_token_count = column.apply(lambda x: len(x.split())).median()
        median_token_counts.append(median_token_count)
        
        # Calculate average count of numeric values in the current column
        total_numeric_count = column.apply(count_numbers_in_string).sum()
        average_numeric_count = total_numeric_count / len(df) if len(df) > 0 else 0
        average_numeric_counts.append(average_numeric_count)
        
        # Check for NE flag
        NE_flag = column.isin(R3_cea).any()
        if NE_flag:
            target.append("NE")
        elif median_length - average_numeric_count < 2:
            target.append("lit")
        else:
            target.append("None")
    
    columns.extend(df.columns.tolist())

100%|██████████| 2162/2162 [15:12<00:00,  2.37it/s]


## 2T_Round4

In [10]:
####################
# READ THE JSON
#####################

json_file_path = "./data/2T_Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

R4_2T_cea = [item[0]for item in R4_sorted_mentions]

In [13]:
tables = "./data/Dataset/Dataset/2T_Round4/tables/"

def count_numbers_in_string(s):
    return len(re.findall(r'\d+', str(s)))

median_lengths = []
median_token_counts = []
average_numeric_counts = []
target = []
columns = []

# Iterate through each table
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    
    for col in df.columns:
        column = df[col].astype(str)
        
        # Calculate median length for the current column
        median_length = column.apply(len).median()
        median_lengths.append(median_length)
        
        # Calculate median token count for the current column
        median_token_count = column.apply(lambda x: len(x.split())).median()
        median_token_counts.append(median_token_count)
        
        # Calculate average count of numeric values in the current column
        total_numeric_count = column.apply(count_numbers_in_string).sum()
        average_numeric_count = total_numeric_count / len(df) if len(df) > 0 else 0
        average_numeric_counts.append(average_numeric_count)
        
        # Check for NE flag
        NE_flag = column.isin(R4_2T_cea).any()
        if NE_flag:
            target.append("NE")
        elif median_length - average_numeric_count < 2:
            target.append("lit")
        else:
            target.append("None")
    
    columns.extend(df.columns.tolist())

100%|██████████| 180/180 [00:39<00:00,  4.61it/s]


## Round4

In [14]:
####################
# READ THE JSON
#####################

json_file_path = "./data/Round4_sorted_mentions.json"

# Load the JSON file
with open(json_file_path, "r") as file:
    R4_sorted_mentions = json.load(file)

R4_cea = [item[0]for item in R4_sorted_mentions]

In [16]:
tables = "./data/Dataset/Dataset/Round4_2020/tables/"

def count_numbers_in_string(s):
    return len(re.findall(r'\d+', str(s)))

median_lengths = []
median_token_counts = []
average_numeric_counts = []
target = []
columns = []

# Iterate through each table
for table in tqdm(os.listdir(tables)):
    table_file = os.path.join(tables, table)
    table_name = table.split(".")[0]
    df = pd.read_csv(table_file)
    
    for col in df.columns:
        column = df[col].astype(str)
        
        # Calculate median length for the current column
        median_length = column.apply(len).median()
        median_lengths.append(median_length)
        
        # Calculate median token count for the current column
        median_token_count = column.apply(lambda x: len(x.split())).median()
        median_token_counts.append(median_token_count)
        
        # Calculate average count of numeric values in the current column
        total_numeric_count = column.apply(count_numbers_in_string).sum()
        average_numeric_count = total_numeric_count / len(df) if len(df) > 0 else 0
        average_numeric_counts.append(average_numeric_count)
        
        # Check for NE flag
        NE_flag = column.isin(R4_cea).any()
        if NE_flag:
            target.append("NE")
        elif median_length - average_numeric_count < 2:
            target.append("lit")
        else:
            target.append("None")
    
    columns.extend(df.columns.tolist())

100%|██████████| 22207/22207 [21:16:10<00:00,  3.45s/it]        


## DF creation

In [17]:
# Create the DataFrame
df_def = pd.DataFrame({
    'column names': columns,
    'median_lengths': median_lengths,
    'median_token_counts': median_token_counts,
    'average_numeric_counts': average_numeric_counts,
    'target': target
})

In [18]:
df_def.to_csv('./data/NE_lit_dataset.csv', index=False)

In [20]:
###################################
#   READ DIRECTLY THE DATASET HERE
###################################

df = pd.read_csv('./data/NE_lit_dataset.csv')
filtered_df = df[df['target'].isin(['lit', 'NE'])]

# Displaying the filtered DataFrame
df[:7]

Unnamed: 0,column names,median_lengths,median_token_counts,average_numeric_counts,target
0,col0,10.0,2.0,1.65,NE
1,col1,5.0,1.0,2.0,
2,col2,6.0,1.0,2.0,
3,col0,12.0,3.0,0.826087,NE
4,col1,16.0,1.0,2.0,
5,col2,6.0,1.0,2.0,
6,col3,5.5,1.0,2.0,


In [21]:
target_counts = df['target'].value_counts()

# Extract counts for specific values
ne_count = target_counts.get("NE", 0)
lit_count = target_counts.get("lit", 0)
none_count = df.shape[0] - (ne_count+lit_count)

print(f"Count of 'NE': {ne_count}")
print(f"Count of 'lit': {lit_count}")
print(f"Count of 'NaN': {none_count}")

Count of 'NE': 51553
Count of 'lit': 1293
Count of 'NaN': 25904


## Model training

In [22]:
# Convert the target variable to numeric
label_encoder = LabelEncoder()
filtered_df['target'] = label_encoder.fit_transform(filtered_df['target'])

# One-hot encode the 'column names' column
one_hot_encoder = OneHotEncoder(sparse=False)
encoded_columns = one_hot_encoder.fit_transform(filtered_df[['column names']])

# Combine the encoded categorical data with the numeric data
numeric_data = filtered_df[['median_lengths', 'median_token_counts', 'average_numeric_counts']].values
X = np.hstack([encoded_columns, numeric_data])
y = filtered_df['target'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the model
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with sigmoid activation for binary classification
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy*100:.2f}%')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['target'] = label_encoder.fit_transform(filtered_df['target'])


Epoch 1/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9039 - loss: 0.1970 - val_accuracy: 0.9907 - val_loss: 0.0334
Epoch 2/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9924 - loss: 0.0271 - val_accuracy: 0.9889 - val_loss: 0.0285
Epoch 3/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.9927 - loss: 0.0240 - val_accuracy: 0.9905 - val_loss: 0.0291
Epoch 4/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9929 - loss: 0.0232 - val_accuracy: 0.9899 - val_loss: 0.0266
Epoch 5/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.9914 - loss: 0.0265 - val_accuracy: 0.9902 - val_loss: 0.0276
Epoch 6/20
[1m1057/1057[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.9918 - loss: 0.0254 - val_accuracy: 0.9908 - val_loss: 0.0272
Epoch 7/20
[1m1

In [23]:

# Make predictions
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype(int)
mapped_predictions = ["lit" if pred == 1 else "NE" for pred in y_pred.ravel()]

# Extract the part of X_test that corresponds to the one-hot encoded columns
encoded_columns_test = X_test[:, :encoded_columns.shape[1]]

# Inverse transform the one-hot encoded columns to get the original categorical labels
original_labels = one_hot_encoder.inverse_transform(encoded_columns_test)

# Print a few examples to check
for i in range(100):
    print(f'Original label: {original_labels[i]}, Predicted: {mapped_predictions[i]}')
    


[1m331/331[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Original label: ['col5'], Predicted: NE
Original label: ['col3'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col4'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col1'], Predicted: NE
Original label: ['col0'], Predicted: NE
Original label: ['col2'], Predicted: NE
Original label: ['col0'], Predicted: NE
Origi

In [24]:
filtered_df[:10]

Unnamed: 0,column names,median_lengths,median_token_counts,average_numeric_counts,target
0,col0,10.0,2.0,1.65,0
3,col0,12.0,3.0,0.826087,0
8,col5,5.0,1.0,1.0,0
9,col0,19.5,2.0,0.0,0
12,col0,16.5,3.0,1.0,0
13,col1,19.0,3.0,0.0,0
14,col2,13.5,2.0,0.0,0
15,col3,7.0,1.0,0.0,0
16,col0,22.0,3.0,0.0,0
17,col1,23.0,3.0,0.0,0
