In [6]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Small demo frame: three spellings of one entity plus an unrelated name
data = dict(entity=['Mistral', 'Mistral AI', 'OtherEntity', 'MistralAI'])
df = pd.DataFrame.from_dict(data)

# Step 3: Define a function to group similar words
def group_similar_words(df, column, threshold=80):
    """Map near-duplicate strings in ``df[column]`` to one representative spelling.

    Distinct values are visited in order of first appearance. A value that has
    not been assigned yet becomes the representative of a new group, and every
    still-unassigned value whose ``fuzz.token_sort_ratio`` score against it is
    at least ``threshold`` joins that group. Once a value belongs to a group it
    is never moved to a later one.

    Args:
        df: DataFrame to annotate. It is modified in place.
        column: Name of the column holding the strings to group.
        threshold: Minimum similarity score (0-100) for two values to be grouped.

    Returns:
        The same ``df`` object with a new ``grouped_entity`` column. Rows whose
        value in ``column`` is missing get NaN there.

    Note:
        The comparison is purely lexical, so distinct numeric strings that
        differ in a single character (e.g. '30000' and '40000') can be merged.
    """
    # Missing values cannot be scored by the fuzzy matcher, so leave them out;
    # Series.map yields NaN for them below.
    unique_words = df[column].dropna().unique().tolist()
    grouped_words = {}

    for word in unique_words:
        if word in grouped_words:
            continue

        # A representative always maps to itself, even if the scorer rates the
        # word against itself below the threshold (e.g. punctuation-only text).
        grouped_words[word] = word

        # limit=None: process.extract otherwise returns only the 5 best matches,
        # which silently truncates any group with more than five spellings.
        matches = process.extract(
            word, unique_words, scorer=fuzz.token_sort_ratio, limit=None
        )
        for match in matches:
            # match[0] is the candidate string, match[1] its score.
            if match[1] >= threshold:
                # setdefault: the first group to claim a word keeps it.
                grouped_words.setdefault(match[0], word)

    df['grouped_entity'] = df[column].map(grouped_words)
    return df


In [7]:
# Annotate the example frame with its fuzzy groups and print the result
df = group_similar_words(df=df, column='entity')
print(df)

        entity grouped_entity
0      Mistral        Mistral
1   Mistral AI        Mistral
2  OtherEntity    OtherEntity
3    MistralAI        Mistral


In [12]:
# Load ner_graph.csv (path relative to the working directory); this rebinds `df`.
df = pd.read_csv("ner_graph.csv")

In [11]:
# Step 1: Install the rapidfuzz library
# Open your terminal and run:
# pip install rapidfuzz

# Step 2: Import the libraries
import pandas as pd
from rapidfuzz import fuzz, process

# Small demo frame: three spellings of one entity plus an unrelated name
data = dict(entity=['Mistral', 'Mistral_AI', 'OtherEntity', 'MistralAI'])
df = pd.DataFrame.from_dict(data)

# Step 3: Define a function to group similar words
def group_similar_words(df, column, threshold=80):
    """Map near-duplicate strings in ``df[column]`` to one representative spelling.

    Distinct values are visited in order of first appearance. A value that has
    not been assigned yet becomes the representative of a new group, and every
    still-unassigned value whose ``fuzz.token_sort_ratio`` score against it is
    at least ``threshold`` joins that group. Once a value belongs to a group it
    is never moved to a later one.

    Args:
        df: DataFrame to annotate. It is modified in place.
        column: Name of the column holding the strings to group.
        threshold: Minimum similarity score (0-100); passed to the matcher as
            ``score_cutoff`` so weaker matches are dropped by the library.

    Returns:
        The same ``df`` object with a new ``grouped_entity`` column. Rows whose
        value in ``column`` is missing get NaN there.

    Note:
        The comparison is purely lexical, so distinct numeric strings that
        differ in a single character (e.g. '30000' and '40000') can be merged.
    """
    # Missing values cannot be scored by the fuzzy matcher, so leave them out;
    # Series.map yields NaN for them below.
    unique_words = df[column].dropna().unique().tolist()
    grouped_words = {}

    for word in unique_words:
        if word in grouped_words:
            continue

        # A representative always maps to itself, whatever the scorer returns
        # for the word against itself.
        grouped_words[word] = word

        # limit=None: process.extract otherwise returns only the 5 best matches,
        # which silently truncates any group with more than five spellings.
        matches = process.extract(
            word,
            unique_words,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=threshold,
            limit=None,
        )
        for match in matches:
            # match[0] is the candidate string; setdefault lets the first
            # group to claim a word keep it.
            grouped_words.setdefault(match[0], word)

    df['grouped_entity'] = df[column].map(grouped_words)
    return df

# Run the grouping on the example frame and print the annotated result
df = group_similar_words(df=df, column='entity')
print(df)

        entity grouped_entity
0      Mistral        Mistral
1   Mistral_AI        Mistral
2  OtherEntity    OtherEntity
3    MistralAI        Mistral


In [19]:
# NOTE: `original_df` is loaded in a cell that appears further down in this
# notebook; that cell has to run before this one.
# Work on a copy: group_similar_words adds its column to the frame it is given
# and returns that same object, so without the copy `df` and `original_df`
# would be one object and later edits to `df` would overwrite the raw data.
df = group_similar_words(original_df.copy(), 'entity')

# The rich display already truncates long frames; no separate print is needed.
df

                                           original_text  \
0      Starbucks violated federal labor law when it i...   
1      Starbucks violated federal labor law when it i...   
2      Starbucks violated federal labor law when it i...   
3      Starbucks violated federal labor law when it i...   
4      Starbucks violated federal labor law when it i...   
...                                                  ...   
21318  "These changes will also strengthen laws gover...   
21319  "These changes will also strengthen laws gover...   
21320  "These changes will also strengthen laws gover...   
21321  "These changes will also strengthen laws gover...   
21322  "These changes will also strengthen laws gover...   

                               entity    label                  grouped_entity  
0      National Labor Relations Board      ORG  National Labor Relations Board  
1                            Thursday     DATE                        Thursday  
2                                NLR

Unnamed: 0,original_text,entity,label,grouped_entity
0,Starbucks violated federal labor law when it i...,National Labor Relations Board,ORG,National Labor Relations Board
1,Starbucks violated federal labor law when it i...,Thursday,DATE,Thursday
2,Starbucks violated federal labor law when it i...,NLRB,ORG,NLRB
3,Starbucks violated federal labor law when it i...,Starbucks,ORG,Starbucks
4,Starbucks violated federal labor law when it i...,Board,ORG,Board
...,...,...,...,...
21318,"""These changes will also strengthen laws gover...",the past weekend,DATE,the last week
21319,"""These changes will also strengthen laws gover...",30000,MONEY,40000
21320,"""These changes will also strengthen laws gover...",first,ORDINAL,first
21321,"""These changes will also strengthen laws gover...",Bloomberg,ORG,Bloomberg


In [17]:
# Read ner_graph.csv into `original_df`; `df` on the next line comes from earlier cells.
original_df = pd.read_csv("ner_graph.csv")
# Replace every entity with its group representative, in place.
# NOTE(review): this also rewrites numeric entities — the output below shows
# 30000 turning into 40000 — and the pre-grouping text is no longer in `df`.
df['entity'] = df['grouped_entity']


df

Unnamed: 0,original_text,entity,label,grouped_entity
0,Starbucks violated federal labor law when it i...,National Labor Relations Board,ORG,National Labor Relations Board
1,Starbucks violated federal labor law when it i...,Thursday,DATE,Thursday
2,Starbucks violated federal labor law when it i...,NLRB,ORG,NLRB
3,Starbucks violated federal labor law when it i...,Starbucks,ORG,Starbucks
4,Starbucks violated federal labor law when it i...,Board,ORG,Board
...,...,...,...,...
21318,"""These changes will also strengthen laws gover...",the last week,DATE,the last week
21319,"""These changes will also strengthen laws gover...",40000,MONEY,40000
21320,"""These changes will also strengthen laws gover...",first,ORDINAL,first
21321,"""These changes will also strengthen laws gover...",Bloomberg,ORG,Bloomberg


In [20]:
def is_number(s):
    """Return True when ``s`` represents a finite number.

    Strings may contain thousands separators (``'30,000'``). Text that cannot
    be parsed, missing values (None / NaN) and the special float spellings
    ``'nan'``, ``'inf'`` and ``'infinity'`` all yield False, so names such as
    'Infinity' are not mistaken for numbers.

    Args:
        s: Value to test; usually a string, but any object is accepted.

    Returns:
        bool: True for finite numeric values, False otherwise.
    """
    import math  # local import keeps this helper self-contained

    if isinstance(s, str):
        s = s.replace(',', '')  # Handle commas in numbers like '30,000'
    try:
        value = float(s)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None or other non-numeric objects; ValueError: ordinary
        # text; OverflowError: integers too large for a float.
        return False
    # float() accepts 'nan' and 'inf'; those are not usable numbers here.
    return math.isfinite(value)


# Keep the original text for numeric entities and use the grouped spelling for
# everything else: fuzzy matching merges different numbers that merely look
# alike, so numbers must not be replaced by their group representative.
# Series.where keeps a value where the mask is True and takes the fallback
# otherwise; this avoids a slow row-by-row DataFrame.apply.
df['entity'] = df['entity'].where(
    df['entity'].map(is_number).astype(bool),
    df['grouped_entity'],
)
    


In [21]:
# Display the frame after the previous cell's update of the entity column.
df

Unnamed: 0,original_text,entity,label,grouped_entity
0,Starbucks violated federal labor law when it i...,National Labor Relations Board,ORG,National Labor Relations Board
1,Starbucks violated federal labor law when it i...,Thursday,DATE,Thursday
2,Starbucks violated federal labor law when it i...,NLRB,ORG,NLRB
3,Starbucks violated federal labor law when it i...,Starbucks,ORG,Starbucks
4,Starbucks violated federal labor law when it i...,Board,ORG,Board
...,...,...,...,...
21318,"""These changes will also strengthen laws gover...",the last week,DATE,the last week
21319,"""These changes will also strengthen laws gover...",30000,MONEY,40000
21320,"""These changes will also strengthen laws gover...",first,ORDINAL,first
21321,"""These changes will also strengthen laws gover...",Bloomberg,ORG,Bloomberg
