# Entity extraction (people, organizations, technologies)

Load packages

In [2]:
import pandas as pd
from collections import Counter
from itertools import chain
import numpy as np
import ast

Connect to google drive

In [3]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Mounted at /content/drive


Load data

In [4]:
df = pd.read_parquet("/content/drive/MyDrive/Colab Notebooks/temp_data_md_entities.parquet", engine="pyarrow")

In [5]:
df.head()

Unnamed: 0,index,url,date,language,title,text,clean_text_with_punct,clean_text,topic_BERT,topic_BERT_prob,Topic,Name,Predicted_Industry,named_entities
0,0,https://pjmedia.com/instapundit/494418/,2022-01-01,en,Instapundit » Blog Archive » HMM: An Advan...,\n\n\nInstapundit » Blog Archive » HMM: An...,Instapundit Blog Archive HMM: An Advanced AI G...,instapundit blog archive hmm advanced give un...,247,0.45575,247,247_instapundit_heather havrilesky_tcs daily_t...,Technology,"{'LOCATION': ['NEW YORK CITY', 'New Y', 'Iraq'..."
1,1,https://www.vikatan.com/amp/story/automobile%2...,2022-01-01,en,Artificial Intelligence Reshaping the Automoti...,Artificial Intelligence Reshaping the Automoti...,Artificial Intelligence Reshaping the Automoti...,reshape automotive industryx vikatan original...,20,0.599253,20,20_automotive industry_automotive_automaker_el...,Sport,"{'LOCATION': [], 'ORG': ['Vikatan Originals'],..."
2,2,https://www.equities.com/news/exchange-listed-...,2022-01-01,en,Exchange Listed Funds - QRAFT AI Enhanced U.S....,\n\nExchange Listed Funds - QRAFT AI Enhanced ...,Exchange Listed Funds - QRAFT AI Enhanced U.S....,exchange list fund qraft enhance large cap et...,-1,0.0,-1,-1_tech_industry_technology_feature,Technology,"{'LOCATION': ['US', 'China', 'Xinjiang'], 'ORG..."
3,3,https://www.financialexpress.com/lifestyle/boo...,2022-01-01,en,Book Review: The Age of AI and Our Human Futur...,\n\nBook Review: The Age of AI and Our Human F...,Book Review: The Age of AI and Our Human Futur...,book review age human future henry kissinger ...,-1,0.0,-1,-1_tech_industry_technology_feature,Technology,"{'LOCATION': ['India', 'US', 'India', 'US', 'U..."
4,4,https://www.newsbreak.com/news/2470570042311/5...,2022-01-01,en,5 Places to Start a Career in Data Science in ...,5 Places to Start a Career in Data Science in ...,5 Places to Start a Career in Data Science in -,place start career datum science,492,1.0,492,492_india magazine_analytic india_datum scient...,Technology,"{'LOCATION': [], 'ORG': [], 'PERSON': [], 'TEC..."


In [14]:
df.loc[0,'named_entities']

{'LOCATION': array(['NEW YORK CITY', 'New Y', 'Iraq', 'Mudville Gazette', 'N.Z.',
        'California', 'California', 'U.S.'], dtype=object),
 'ORG': array(['InstaPundit', 'the Amazon Services LLC Associates Program',
        'Amazon', 'Instapundit Archives', 'Cliopatria Deceiver.com',
        'The Tatler WiFi', 'ALDaily', 'PoliticalWire', 'RealClearPolitics',
        'TCS', 'The Note', 'RealClearPolitics', 'SCSU', 'WordPress',
        'Reg.'], dtype=object),
 'PERSON': array(['ERIC ADAMS', 'Eric Adams', 'Glenn Reynolds',
        'Andrew Malcolm Barone', 'Bluey', 'Chris Anderson Cox', 'Forkum',
        'Dan Gillmor', 'Dave Barry', 'David Corn', 'David Frum',
        'Don Surber', 'Ernie the', 'Flit Fraters', 'Libertas',
        'Gary Farber', 'Howard Kurtz', 'Hugh Hewitt', 'Jennifer Rubin',
        'Jim Geraghty', 'Josh Marshall', 'Jules Crittenden',
        'JWR Kausfiles', 'Kevin Drum', 'Larry Kudlow', 'Mark Steyn',
        'Martin Peretz', 'Michael Silence', 'Michelle Malkin',
     

Normalize entity list

In [6]:
def normalize_entity_list(entity_list):
    if isinstance(entity_list, str):
        try:
            entity_list = ast.literal_eval(entity_list)
        except Exception:
            return []

    # Accept both list and np.ndarray
    if isinstance(entity_list, (list, np.ndarray)):
        return [ent.strip().lower().title() for ent in entity_list if isinstance(ent, str) and ent.strip()]

    return []


In [7]:
# Normalize named entities
df['normalized_entities'] = df['named_entities'].apply(lambda ents: {
    k: normalize_entity_list(v) for k, v in ents.items()
})

In [8]:
df.head()

Unnamed: 0,index,url,date,language,title,text,clean_text_with_punct,clean_text,topic_BERT,topic_BERT_prob,Topic,Name,Predicted_Industry,named_entities,normalized_entities
0,0,https://pjmedia.com/instapundit/494418/,2022-01-01,en,Instapundit » Blog Archive » HMM: An Advan...,\n\n\nInstapundit » Blog Archive » HMM: An...,Instapundit Blog Archive HMM: An Advanced AI G...,instapundit blog archive hmm advanced give un...,247,0.45575,247,247_instapundit_heather havrilesky_tcs daily_t...,Technology,"{'LOCATION': ['NEW YORK CITY', 'New Y', 'Iraq'...","{'LOCATION': ['New York City', 'New Y', 'Iraq'..."
1,1,https://www.vikatan.com/amp/story/automobile%2...,2022-01-01,en,Artificial Intelligence Reshaping the Automoti...,Artificial Intelligence Reshaping the Automoti...,Artificial Intelligence Reshaping the Automoti...,reshape automotive industryx vikatan original...,20,0.599253,20,20_automotive industry_automotive_automaker_el...,Sport,"{'LOCATION': [], 'ORG': ['Vikatan Originals'],...","{'LOCATION': [], 'ORG': ['Vikatan Originals'],..."
2,2,https://www.equities.com/news/exchange-listed-...,2022-01-01,en,Exchange Listed Funds - QRAFT AI Enhanced U.S....,\n\nExchange Listed Funds - QRAFT AI Enhanced ...,Exchange Listed Funds - QRAFT AI Enhanced U.S....,exchange list fund qraft enhance large cap et...,-1,0.0,-1,-1_tech_industry_technology_feature,Technology,"{'LOCATION': ['US', 'China', 'Xinjiang'], 'ORG...","{'LOCATION': ['Us', 'China', 'Xinjiang'], 'ORG..."
3,3,https://www.financialexpress.com/lifestyle/boo...,2022-01-01,en,Book Review: The Age of AI and Our Human Futur...,\n\nBook Review: The Age of AI and Our Human F...,Book Review: The Age of AI and Our Human Futur...,book review age human future henry kissinger ...,-1,0.0,-1,-1_tech_industry_technology_feature,Technology,"{'LOCATION': ['India', 'US', 'India', 'US', 'U...","{'LOCATION': ['India', 'Us', 'India', 'Us', 'U..."
4,4,https://www.newsbreak.com/news/2470570042311/5...,2022-01-01,en,5 Places to Start a Career in Data Science in ...,5 Places to Start a Career in Data Science in ...,5 Places to Start a Career in Data Science in -,place start career datum science,492,1.0,492,492_india magazine_analytic india_datum scient...,Technology,"{'LOCATION': [], 'ORG': [], 'PERSON': [], 'TEC...","{'LOCATION': [], 'ORG': [], 'PERSON': [], 'TEC..."


Extract and count organizations, people, technology, and locations

In [9]:
orgs = Counter(chain.from_iterable(df['normalized_entities'].dropna().apply(lambda x: x['ORG'])))
people = Counter(chain.from_iterable(df['normalized_entities'].dropna().apply(lambda x: x['PERSON'])))
techs = Counter(chain.from_iterable(df['normalized_entities'].dropna().apply(lambda x: x['TECH'])))
locations = Counter(chain.from_iterable(df['normalized_entities'].dropna().apply(lambda x: x['LOCATION'])))

In [10]:
# Show top 100 results
print("\nTop Organizations:\n", orgs.most_common(100))
print("\nTop People:\n", people.most_common(100))
print("\nTop Technologies:\n", techs.most_common(100))
print("\nTop Locations:\n", locations.most_common(100))


Top Organizations:
 [('Openai', 164609), ('Google', 163236), ('Microsoft', 132160), ('Apple', 80942), ('Nvidia', 66114), ('Chatgpt', 57172), ('Amazon', 52961), ('Lg', 46763), ('Fcc', 40113), ('Meta', 38344), ('Facebook', 36808), ('Samsung', 35254), ('Youtube', 33292), ('Linkedin', 28920), ('Twitter', 22338), ('Digi Communications N.V.', 20767), ('Nasdaq', 19840), ('Ap', 18394), ('Intel', 17799), ('Instagram', 17206), ('Amd', 17030), ('Eu', 16932), ('Tesla', 16597), ('Ibm', 15404), ('Tiktok', 14682), ('Bing', 13649), ('The Associated Press', 13276), ('Netflix', 13100), ('Associated Press', 13097), ('Menafn', 12726), ('Congress', 12685), ('Anthropic', 11958), ('Ein', 11634), ('Google Play', 11428), ('Aws', 11393), ('Alphabet', 11010), ('Cnbc', 10542), ('Adobe', 9699), ('Getty Images', 9673), ('Fda', 9449), ('Nexstar', 8998), ('Nexstar Media Inc.', 8638), ('Nfl', 8620), ('Newsfeed', 8497), ('Reddit', 8453), ('Cna', 8136), ('Mint', 8057), ('Whatsapp', 7971), ('Deepseek', 7853), ('The Motl

Group names

In [27]:
# Alias groups for consolidation
alias_groups = {
    'Elon Musk': ['Musk', 'Elon Musk', "Elon Musk'S", 'Elon Musk S'],
    'Donald Trump': ['Trump', 'Donald Trump'],
    'Joe Biden': ['Biden', 'Joe Biden'],
    'Sam Altman': ['Altman', 'Sam Altman'],
    'Narendra Modi': ['Modi', 'Narendra Modi'],
    'Kamala Harris': ['Harris', 'Kamala Harris'],
    'Rishi Sunak': ['Sunak', 'Rishi Sunak'],
    'Prince Harry': ['Harry', 'Prince Harry'],
    'Meghan Markle': ['Meghan', 'Meghan Markle'],
    'Satya Nadella': ['Nadella', 'Satya Nadella'],
    'Sundar Pichai': ['Pichai', 'Sundar Pichai'],
    'Mark Zuckerberg': ['Zuckerberg', 'Mark Zuckerberg'],
    'Jensen Huang': ['Huang', 'Jensen Huang', 'Jensen'],
    'Warren Buffett': ['Buffett', 'Warren Buffett'],
    'Jeff Bezos': ['Bezos', 'Jeff Bezos'],
    'Bill Gates': ['Gates', 'Bill Gates'],
    'Tim Cook': ['Cook', 'Tim Cook'],
    'Geoffrey Hinton': ['Hinton', 'Geoffrey Hinton'],
    'Xi Jinping': ['Xi', 'Xi Jinping'],
    'Barack Obama': ['Obama', 'Barack Obama'],
    'Taylor Swift': ['Taylor Swift', "Taylor Swift'S", 'Swift'],
    'Paul McCartney': ['Paul Mccartney', 'Mccartney', 'Paul'],
    'Scarlett Johansson': ['Scarlett Johansson', 'Johansson'],
    'Chuck Schumer': ['Schumer', 'Chuck Schumer'],
    'Ilya Sutskever': ['Sutskever', 'Ilya Sutskever'],
    'Greg Brockman': ['Greg Brockman', 'Brockman'],
    'Kate Middleton': ['Kate', 'Kate Middleton'],
    'Prince William': ['William', 'Prince William'],
    'Khloe Kardashian': ['Khloe Kardashian', 'Khloe'],
    'Kim Kardashian': ['Kim Kardashian', 'Kim'],
    'Kanye West': ['Kanye West', 'Ye']
}


# Consolidate the counts from original Counter
consolidated_people = {}
for canonical, aliases in alias_groups.items():
    total = sum(people.get(alias, 0) for alias in aliases)
    consolidated_people[canonical] = total

# Add back the rest of people not in any alias group
all_aliases = set(alias for aliases in alias_groups.values() for alias in aliases)
for name, count in people.items():
    if name not in all_aliases:
        consolidated_people[name] = count

# Convert back to Counter and show top 100
people_merged = Counter(consolidated_people)
print("\nTop People (Merged):\n", people_merged.most_common(100))



Top People (Merged):
 [('Elon Musk', 45397), ('Sam Altman', 42933), ('Donald Trump', 38362), ('Joe Biden', 30502), ('Taylor Swift', 10943), ('Narendra Modi', 10050), ('Kamala Harris', 6366), ('Mark Zuckerberg', 6347), ('Satya Nadella', 6295), ('Rishi Sunak', 5806), ('Sundar Pichai', 5545), ('Prince Harry', 5452), ('Kim Kardashian', 5032), ('Jensen Huang', 4852), ('Meghan Markle', 4748), ('Putin', 4420), ('Paul McCartney', 4407), ('Bobby Allyn', 3960), ('Geoffrey Hinton', 3886), ('Bill Gates', 3843), ('Scarlett Johansson', 3677), ('Greg Brockman', 3516), ('Warren Buffett', 3506), ('Kate Middleton', 3497), ('Chuck Schumer', 3402), ('Bard', 3353), ('Xi Jinping', 3325), ('Charles', 3281), ("Matt O'Brien", 2811), ('Ilya Sutskever', 2750), ('Travis Kelce', 2701), ('David', 2612), ('Prince William', 2505), ('Metla Sudha Sekhar', 2450), ('Jennifer Lopez', 2410), ('Drake', 2275), ('Phil Mackintosh', 2147), ('Barbie', 2124), ('Tim Cook', 2040), ('Desantis', 1908), ('Ben Affleck', 1897), ('Kylie

Group organizations

In [13]:
org_alias_groups = {
    'Google': ['Google', 'Google Play', 'Google Cloud'],
    'Microsoft': ['Microsoft', 'Windows'],
    'Meta': ['Meta', 'Facebook', 'Instagram'],
    'Amazon': ['Amazon', 'Aws'],
    'Nvidia': ['Nvidia'],
    'Apple': ['Apple'],
    'OpenAI': ['Openai', 'Chatgpt'],
    'IBM': ['Ibm'],
    'AMD': ['Amd'],
    'Intel': ['Intel'],
    'Tesla': ['Tesla'],
    'Netflix': ['Netflix'],
    'Samsung': ['Samsung'],
    'YouTube': ['Youtube'],
    'LinkedIn': ['Linkedin'],
    'Twitter': ['Twitter', 'X'],
    'CNBC': ['Cnbc'],
    'Reuters': ['Ap', 'The Associated Press', 'Associated Press'],
    'Nexstar': ['Nexstar', 'Nexstar Media Inc.'],
    'Nasdaq': ['Nasdaq', 'Nasdaq.Com', 'Nasdaq, Inc.'],
    'White House': ['White House', 'The White House'],
    'New York Times': ['The New York Times', 'Times'],
    'Digi Communications': ['Digi Communications N.V.', 'Digi Communications Nv']
}

# Merge organization names
merged_orgs = {}
for canonical, aliases in org_alias_groups.items():
    total = sum(orgs.get(alias, 0) for alias in aliases)
    merged_orgs[canonical] = total

# Add orgs not in any alias group
all_org_aliases = set(alias for aliases in org_alias_groups.values() for alias in aliases)
for name, count in orgs.items():
    if name not in all_org_aliases:
        merged_orgs[name] = count

# Final sorted result
merged_orgs_counter = Counter(merged_orgs)
print("\nTop Merged Organizations:\n", merged_orgs_counter.most_common(100))


Top Merged Organizations:
 [('OpenAI', 221781), ('Google', 179615), ('Microsoft', 136731), ('Meta', 92358), ('Apple', 80942), ('Nvidia', 66114), ('Amazon', 64354), ('Lg', 46763), ('Reuters', 44767), ('Fcc', 40113), ('Samsung', 35254), ('YouTube', 33292), ('Nasdaq', 29920), ('LinkedIn', 28920), ('Twitter', 27050), ('Digi Communications N.V.', 20767), ('Intel', 17799), ('Nexstar', 17636), ('AMD', 17030), ('Eu', 16932), ('Tesla', 16597), ('IBM', 15404), ('Tiktok', 14682), ('Bing', 13649), ('Netflix', 13100), ('Menafn', 12726), ('Congress', 12685), ('Anthropic', 11958), ('Ein', 11634), ('Alphabet', 11010), ('CNBC', 10542), ('White House', 10113), ('Adobe', 9699), ('Getty Images', 9673), ('Fda', 9449), ('Nfl', 8620), ('Newsfeed', 8497), ('Reddit', 8453), ('Cna', 8136), ('Mint', 8057), ('Whatsapp', 7971), ('Deepseek', 7853), ('The Motley Fool', 7672), ('Bloomberg', 7669), ('Afp', 7582), ('Sony', 7281), ('Android', 7085), ('Salesforce', 6878), ('Dell', 6397), ('Sec', 6304), ('Fox', 6262), ('

Group technology

In [31]:
tech_alias_groups = {
    'ChatGPT': ['Chatgpt', 'Chat Gpt', 'Chatgpt Plus', 'Chatgpt Enterprise', 'Chatgpt Pro',
                'Chatgpt 4', 'Chatgpt-4', 'Chatgpt Ai', 'Chatgpt Search', 'Chatgpt 4O',
                'Gpt', 'Gptzero', 'Gpts', 'Gptzero', 'Gpt-3', 'Gpt 3.5', 'Gpt-4', 'Gpt-4 Turbo', 'Gpt4', 'Gpt-4O', 'Gpt-4O Mini'],
    'AI Pin': ['Ai Pin', 'The Ai Pin', 'The Humane Ai Pin'],
    'Google Cloud': ['Google Cloud', 'Google Cloud S'],
    'OpenAI': ['Openai', 'Azure Openai', 'Azure Openai Service'],
    'Bard': ['Bard', 'Google Bard', 'Bard Ai'],
    'LLaMA': ['Llama', 'Llama 2', 'Llama 3'],
    'Generative AI': ['Generative Ai', 'Genai', 'Gen Ai', 'Artificial Intelligence', 'Ai'],
    'Meta AI': ['Meta Ai'],
    'Copilot': ['Copilot Ai', 'Copilot'],
    'Stability AI': ['Stability Ai'],
    'Mistral AI': ['Mistral Ai'],
    'Palm AI': ['Palm Ai'],
    'Vertex AI': ['Vertex Ai'],
    'Gemini AI': ['Gemini Ai'],
    'Soundhound AI': ['Soundhound Ai'],
    'Grok AI': ['Grok Ai', 'Grok Chatbot'],
    'Duet AI': ['Duet Ai'],
    'TruthGPT': ['Truthgpt'],
    'Arbdoge AI': ['Arbdoge Ai'],
    'Thinq AI': ['Thinq Ai', 'Ai Thinq'],
    'LangChain': ['Langchain'],
    'Apache Spark': ['Spark', 'Apache Spark'],
    'Smart Devices': ['Smartwatch', 'Smart Tv', 'Smart Home', 'Smart Thermostat'],
    'Voice Assistants': ['Clear Voice Pro']
}

normalized_tech = Counter()
for tech, count in techs.items():
    normalized_key = tech.strip().lower().title()
    normalized_tech[normalized_key] += count

# normalize group
normalized_alias_groups = {
    canonical: [alias.strip().lower().title() for alias in aliases]
    for canonical, aliases in tech_alias_groups.items()
}

# count
merged_tech = {}
for canonical, aliases in normalized_alias_groups.items():
    total = sum(normalized_tech.get(alias, 0) for alias in aliases)
    merged_tech[canonical] = total

# Add tech not in any alias group
all_aliases = set(alias for aliases in normalized_alias_groups.values() for alias in aliases)
for tech, count in normalized_tech.items():
    if tech not in all_aliases:
        merged_tech[tech] = count

# Final sorted result
merged_tech_counter = Counter(merged_tech)
print("\nTop Merged Technologies:\n", merged_tech_counter.most_common(100))


Top Merged Technologies:
 [('ChatGPT', 135871), ('Bard', 14324), ('Generative AI', 6217), ('OpenAI', 3896), ('LLaMA', 3524), ('Ultragear', 2660), ('Meta AI', 2448), ('Galaxy Ai', 2111), ('Gmail', 1863), ('Vision Pro', 1624), ('Metaverse', 1548), ('Smart Devices', 1471), ('Safari', 1325), ('Airpods', 1277), ('Apple Vision Pro', 1258), ('AI Pin', 1074), ('Node Ai', 1043), ('Thinq AI', 1022), ('Arbdoge AI', 981), ('Search', 972), ('Gemini AI', 958), ('Sleepless Ai', 919), ('Vertex AI', 879), ('Quantum Ai', 861), ('Google Cloud', 860), ('Macbook Air', 837), ('Barchart', 819), ('Grok AI', 798), ('My Ai', 750), ('Cybertruck', 726), ('Java', 726), ('Apache Spark', 723), ('Stargate', 723), ('Paal Ai', 663), ('Palm AI', 609), ('Ipad Air', 587), ('Lunar Lake', 569), ('Searchgpt', 560), ('Arm', 552), ('Stability AI', 548), ('Duet AI', 519), ('Tech Radar Pro', 517), ('Ai Dd', 511), ('Xai', 481), ('Edge Ai', 475), ('Starlink', 442), ('Lightchain Ai', 437), ('Conversational Ai', 434), ('Dolby Visio

Group locations

In [24]:
location_alias_groups = {
    'United States': [
        'Us', 'U.S.', 'The United States', 'America', 'Usa',
        'New York', 'Nyc', 'California', 'San Francisco', 'Texas', 'Florida', 'Washington', 'Illinois',
        'Colorado', 'Ohio', 'Pennsylvania', 'Michigan', 'Utah', 'Oklahoma', 'Connecticut', 'New Mexico',
        'Arizona', 'Arkansas', 'Idaho', 'Iowa', 'Minnesota', 'Missouri', 'Indiana', 'North Carolina',
        'Massachusetts', 'Hawaii', 'Tennessee', 'Oregon', 'New Jersey', 'Wisconsin', 'Louisiana',
        'Montana', 'Kansas', 'New Hampshire', 'Alabama', 'Delaware', 'Virginia', 'Maryland',
        'Kentucky', 'South Carolina', 'West Virginia', 'Nevada', 'Mississippi', 'Alaska',
        'North Dakota', 'South Dakota', 'Nebraska', 'Las Vegas', 'Chicago', 'Los Angeles'
    ],
    'United Kingdom': ['Uk', 'United Kingdom', 'London'],
    'Russia': ['Russia'],
    'China': ['China'],
    'India': ['India', 'Delhi', 'Mumbai'],
    'Canada': ['Canada'],
    'Australia': ['Australia'],
    'Germany': ['Germany'],
    'France': ['France'],
    'Japan': ['Japan'],
    'South Korea': ['South Korea'],
    'Middle East': ['Middle East', 'Saudi Arabia', 'Uae', 'Qatar', 'Kuwait', 'Oman', 'Bahrain', 'Jordan'],
    'Israel': ['Israel'],
    'Ukraine': ['Ukraine'],
    'Brazil': ['Brazil'],
    'Mexico': ['Mexico'],
    'North America': ['North America'],
    'Europe': ['Europe'],
    'Asia': ['Asia'],
    'Taiwan': ['Taiwan'],
    'Hong Kong': ['Hong Kong'],
    'Africa': ['South Africa', 'Kenya', 'Morocco', 'Algeria', 'Tunisia'],
}

# Normalize original locations dictionary
normalized_locations = Counter()
for loc, count in locations.items():
    normalized_locations[loc.strip().lower().title()] += count

# Normalize alias group mappings
normalized_alias_groups = {
    canonical: [alias.strip().lower().title() for alias in aliases]
    for canonical, aliases in location_alias_groups.items()
}

# Merge location counts based on alias groups
merged_locations = {}
for canonical, aliases in normalized_alias_groups.items():
    total = sum(normalized_locations.get(alias, 0) for alias in aliases)
    merged_locations[canonical] = total

# Add any locations not captured in alias groups
all_aliases = set(alias for aliases in normalized_alias_groups.values() for alias in aliases)
for loc, count in normalized_locations.items():
    if loc not in all_aliases:
        merged_locations[loc] = count

# Display result
merged_locations_counter = Counter(merged_locations)
print("\nTop Merged Locations:\n", merged_locations_counter.most_common(30))


Top Merged Locations:
 [('India', 128892), ('Middle East', 116765), ('United Kingdom', 82948), ('China', 71526), ('Canada', 35509), ('Europe', 32864), ('Russia', 26779), ('United States', 26465), ('Ukraine', 26311), ('Australia', 25630), ('Israel', 25444), ('France', 23751), ('Japan', 22038), ('Germany', 20593), ('Italy', 18055), ('Singapore', 17860), ('Pakistan', 17625), ('Spain', 16429), ('Lebanon', 16205), ('Georgia', 16183), ('Egypt', 13644), ('Brazil', 13618), ('Maine', 13265), ('Indonesia', 13244), ('Taiwan', 13235), ('Hollywood', 13171), ('Mexico', 13026), ('Portugal', 12793), ('Malaysia', 12579), ('Asia', 12505)]


In [25]:
# --- STEP 1: Normalize the original locations ---
normalized_locations = Counter()
for loc, count in locations.items():
    normalized_loc = loc.strip().lower().title()
    normalized_locations[normalized_loc] += count

# --- STEP 2: Define alias groups (normalized as well) ---
location_alias_groups = {
    'United States': [
        'Us', 'U.S.', 'The United States', 'America', 'Usa',
        'New York', 'Nyc', 'California', 'San Francisco', 'Texas', 'Florida', 'Washington', 'Illinois',
        'Colorado', 'Ohio', 'Pennsylvania', 'Michigan', 'Utah', 'Oklahoma', 'Connecticut', 'New Mexico',
        'Arizona', 'Arkansas', 'Idaho', 'Iowa', 'Minnesota', 'Missouri', 'Indiana', 'North Carolina',
        'Massachusetts', 'Hawaii', 'Tennessee', 'Oregon', 'New Jersey', 'Wisconsin', 'Louisiana',
        'Montana', 'Kansas', 'New Hampshire', 'Alabama', 'Delaware', 'Virginia', 'Maryland',
        'Kentucky', 'South Carolina', 'West Virginia', 'Nevada', 'Mississippi', 'Alaska',
        'North Dakota', 'South Dakota', 'Nebraska', 'Las Vegas', 'Chicago', 'Los Angeles'
    ],
    'United Kingdom': ['Uk', 'United Kingdom', 'London'],
    'Russia': ['Russia'],
    'China': ['China'],
    'India': ['India', 'Delhi', 'Mumbai'],
    'Canada': ['Canada'],
    'Australia': ['Australia'],
    'Germany': ['Germany'],
    'France': ['France'],
    'Japan': ['Japan'],
    'South Korea': ['South Korea'],
    'Middle East': ['Middle East', 'Saudi Arabia', 'Uae', 'Qatar', 'Kuwait', 'Oman', 'Bahrain', 'Jordan'],
    'Israel': ['Israel'],
    'Ukraine': ['Ukraine'],
    'Brazil': ['Brazil'],
    'Mexico': ['Mexico'],
    'North America': ['North America'],
    'Europe': ['Europe'],
    'Asia': ['Asia'],
    'Taiwan': ['Taiwan'],
    'Hong Kong': ['Hong Kong'],
    'Africa': ['South Africa', 'Kenya', 'Morocco', 'Algeria', 'Tunisia'],
}

# Normalize the alias group entries
normalized_alias_groups = {
    canonical: [alias.strip().lower().title() for alias in aliases]
    for canonical, aliases in location_alias_groups.items()
}

# --- STEP 3: Merge counts by alias ---
merged_locations = {}
for canonical, aliases in normalized_alias_groups.items():
    total = sum(normalized_locations.get(alias, 0) for alias in aliases)
    merged_locations[canonical] = total

# --- STEP 4: Add locations not in any alias group ---
all_aliases = set(alias for aliases in normalized_alias_groups.values() for alias in aliases)
for loc, count in normalized_locations.items():
    if loc not in all_aliases:
        merged_locations[loc] = count

# --- STEP 5: Print result ---
merged_locations_counter = Counter(merged_locations)
print("\nTop Merged Locations:\n", merged_locations_counter.most_common(30))


Top Merged Locations:
 [('India', 128892), ('Middle East', 116765), ('United Kingdom', 82948), ('China', 71526), ('Canada', 35509), ('Europe', 32864), ('Russia', 26779), ('United States', 26465), ('Ukraine', 26311), ('Australia', 25630), ('Israel', 25444), ('France', 23751), ('Japan', 22038), ('Germany', 20593), ('Italy', 18055), ('Singapore', 17860), ('Pakistan', 17625), ('Spain', 16429), ('Lebanon', 16205), ('Georgia', 16183), ('Egypt', 13644), ('Brazil', 13618), ('Maine', 13265), ('Indonesia', 13244), ('Taiwan', 13235), ('Hollywood', 13171), ('Mexico', 13026), ('Portugal', 12793), ('Malaysia', 12579), ('Asia', 12505)]
