### 1.6 Intro to NLP and Network Analysis with NER

#### This scripts contains the following:
1. Importing libraries and data
2. Creating NER object
3. Splitting sentence entities from the NER object
4. Filtering list of countries from entities
5. Creating dataframe of relationships
___________________________________________________________________________________________

#### 1. Importing libraries and data

In [1]:
# importing libraries

import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import re
import os

  import pkg_resources


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load English module of NER

NER = spacy.load("en_core_web_sm")

In [4]:
# Load scraped Wikipedia page of 20th century key events:
with open('Key_events_20th_century_Wiki.txt', 'r', errors='ignore') as file:
    data = file.read().replace('\n', '')

#### 2. Creating NER object

In [5]:
# Creating wikipage object to store whole text
wikipage = NER(data)

In [6]:
# Visualize identified entities
displacy.render(wikipage[300:20000], style = "ent", jupyter = True)

#### 3. Splitting sentence entities from NER object

In [7]:
df_sentences = [] # create empty shell to store results

# Loop through sentences and get entity list for each sentence
for sent in wikipage.sents:
    entity_list = [ent.text for ent in sent.ents]
    df_sentences.append({"sentence": sent, "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [8]:
df_sentences.head(15)

Unnamed: 0,sentence,entities
0,"(Key, events, of, the, 20th, century, -, Wikip...","[the 20th century - WikipediaJump, Contribute,..."
1,"(decolonization1.5.2The, Cold, War, (, 1947â€“...","[Cold War, 1947â€“1991)1.5.3War]"
2,"(race1.5.5The, end, of, the, Cold, War1.5.6Inf...","[the Cold War1.5.6Information, 20th]"
3,(linkPage),[]
4,(informationCite),[]
5,"(this, pageGet, shortened, URLDownload, QR, co...","[pageGet, URLDownload, Download, Wikipedia, en..."
6,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race]"
7,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
8,"(Historic, events, in, the, 20th, century[edit...",[the 20th]
9,"(Edwardian, eraThe, new, beginning, of, the, 2...","[Edwardian, the 20th century]"


#### 4. Filtering entities from the wikipage

In [9]:
# Creating country aliases dictionary
country_aliases = {"United States of America": ["USA", "U.S.A.", "America", "U.S.", "US", "United States", "American", "Americans"],
    "United Kingdom": ["Britain", "UK", "U.K.", "British"],
    "China": ["PRC", "People's Republic of China", "Chinese"],
    "Germany":["German", "Germans"],
    "Japan": ["Japanese"],
    "Russia": ["Russian Federation", "Soviet", "Soviet Union", "USSR", "U.S.S.R"],
    "France": ["French"],
    "Italy":["Italian", "Italians"]
}

In [10]:
# Load country base list

with open ("countries.txt", "r", encoding = "utf-8") as f:
    countries = [line.strip() for line in f]

# Merge base list with aliases
country_df = []
for country in countries:
    country_df.append(country)
    if country in country_aliases:
        country_df.extend(country_aliases[country])

print(country_df)

['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada', 'Central African Republic', 'Chad', 'Chile', 'China', 'PRC', "People's Republic of China", 'Chinese', 'Colombia', 'Comoros', 'Congo (Congo-Brazzaville)', 'Costa Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic (Czechia)', 'Democratic Republic of the Congo', 'Denmark', 'Djibouti', 'Dominica', 'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia', 'Fiji', 'Finland', 'France', 'French', 'Gabon', 'Gambia', 'Georgia', 'Germany', 'German', 'Germans', 'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 

In [11]:
# Create dataframe

country_df = pd.DataFrame(country_df, columns = ['country_name'])

print(country_df.shape)
print(country_df.head())

(222, 1)
  country_name
0  Afghanistan
1      Albania
2      Algeria
3      Andorra
4       Angola


In [12]:
# Filtering out entities not of interest

def filter_entity(ent_list, country_df):
    return [ent for ent in ent_list
            if ent in list(country_df['country_name'])]

In [13]:
# Check
filter_entity(["Afghanistan", "CareerFoundry", "z760"], country_df)

['Afghanistan']

In [14]:
# Creating list comprehension of filtered entitites

df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, country_df))

In [15]:
df_sentences['country_entities'].tail(15)

1239          []
1240          []
1241          []
1242          []
1243          []
1244          []
1245          []
1246          []
1247          []
1248          []
1249    [French]
1250          []
1251          []
1252          []
1253          []
Name: country_entities, dtype: object

In [16]:
# Filter out sentences that don't have any country entities

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

In [17]:
df_sentences_filtered.head(15)

Unnamed: 0,sentence,entities,country_entities
17,"(After, a, period, of, diplomatic, and, milita...","[the July Crisis, the end of July 1914, Britis...","[France, German, Russia]"
18,"(The, Bolsheviks, negotiated, the, Treaty, of,...","[the Treaty of Brest-Litovsk, Germany, Russia]","[Germany, Russia]"
19,"(In, the, treaty, ,, Bolshevik, Russia, ceded,...","[Bolshevik Russia, Baltic, Germany, Kars Oblas...",[Germany]
20,"(It, also, recognized, the, independence, of, ...","[Germany, Allied, American]","[Germany, American]"
21,"(The, Germans, introduced, the, machine, gun,[...","[Germans, British]","[Germans, British]"
25,"(First, and, foremost, ,, the, Germans, were, ...","[First, Germans, the Treaty of Versailles, the...",[Germans]
26,"(Many, Germans, felt, these, reparations, were...","[Germans, Germany, Allied, Kaiser, Europe]","[Germans, Germany]"
44,"(The, rise, of, dictatorship[edit]Main, articl...","[DictatorshipNazi, Jewish, Germans]",[Germans]
46,"(Germany, ,, 1933Fascism, first, appeared, in,...","[Germany, first, Italy, Benito Mussolini, 1922...","[Germany, Italy]"
47,"(The, ideology, was, supported, by, a, large, ...","[Adolf Hitler, Germany, 1933, Nazism, Germany,...","[Germany, Germany, German]"


#### 5. Creating dataframe of relationships

In [18]:
# Defining relationships 

# window size = 5 : this defines how many sentences will be looked at simultaneously 
relationships = [] # create an empty list

for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i+5, df_sentences_filtered.index[-1])
    cntr_list = sum((df_sentences_filtered.loc[i: end_i].country_entities), [])
    
    # Remove duplicated characters that are next to each other
    cntr_unique = [cntr_list[i] for i in range(len(cntr_list)) 
                   if (i==0) or cntr_list[i] != cntr_list[i-1]]
    
    if len(cntr_unique) > 1:
        for idx, a in enumerate(cntr_unique[:-1]):
            b = cntr_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [19]:
# Creating relationships dataframe

relationship_df = pd.DataFrame(relationships)

In [20]:
relationship_df

Unnamed: 0,source,target
0,France,German
1,German,Russia
2,France,German
3,German,Russia
4,Russia,Germany
...,...,...
1752,India,Singapore
1753,India,Singapore
1754,India,Singapore
1755,India,Singapore


In [21]:
# Sort the cases with a->b and b->a

relationship_df = pd.DataFrame(np.sort(relationship_df.values, axis = 1), columns = relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,France,German
1,German,Russia
2,France,German
3,German,Russia
4,Germany,Russia


In [22]:
# Creating set of aliases
all_aliases = set()
for aliases in country_aliases.values():
    all_aliases.update(aliases)

In [23]:
# Checking aliases in 'source'
aliases_in_source = set(relationship_df['source']).intersection(all_aliases)

# Checking aliases in 'target'
aliases_in_target = set(relationship_df['target']).intersection(all_aliases)

print("Aliases still in 'source' column:", aliases_in_source)
print("Aliases still in 'target' column:", aliases_in_target)

Aliases still in 'source' column: {'Chinese', 'British', 'Soviet Union', 'Italian', 'Soviet', 'French', 'American', 'Italians', 'Britain', 'Germans', 'Japanese', "People's Republic of China", 'U.S.', 'America', 'German', 'Americans'}
Aliases still in 'target' column: {'Chinese', 'British', 'US', 'Soviet Union', 'Italian', 'Soviet', 'French', 'Italians', 'Britain', 'Germans', 'USSR', 'Japanese', 'United States', 'U.S.', 'German', 'Americans', 'USA'}


In [24]:
# Checking if any aliases remain in either source or target columns of df

any_alias_in_source = any(x in all_aliases for x in relationship_df['source'])
any_alias_in_target = any(x in all_aliases for x in relationship_df['target'])

print(f"Aliases left in source? {any_alias_in_source}")
print(f"Aliases left in target? {any_alias_in_target}")

Aliases left in source? True
Aliases left in target? True


In [25]:
# Inverting alias dictionary

country_aliases = {"United States of America": ["USA", "U.S.A.", "America", "U.S.", "US", "United States", "American", "Americans"],
    "United Kingdom": ["Britain", "UK", "U.K.", "British"],
    "China": ["PRC", "People's Republic of China", "Chinese"],
    "Germany":["German", "Germans"],
    "Japan": ["Japanese"],
    "Russia": ["Russian Federation", "Soviet", "Soviet Union", "USSR", "U.S.S.R"],
    "France": ["French"],
    "Italy":["Italian", "Italians"]
}

alias_to_canonical = {}
for canonical, aliases in country_aliases.items():
    for alias in aliases:
        alias_to_canonical[alias] = canonical


In [26]:
# Defining function to replace aliases with canonical names

def normalize_country(name):
    # If name is an alias, return canonical name
    # Otherwise, return the original name
    return alias_to_canonical.get(name, name)

In [27]:
# Applying function to our relationships df

relationship_df['source'] = relationship_df['source'].apply(normalize_country)
relationship_df['target'] = relationship_df['target'].apply(normalize_country)

In [28]:
relationship_df.tail(20)

Unnamed: 0,source,target
1737,France,United States of America
1738,Philippines,United States of America
1739,France,United States of America
1740,Philippines,United States of America
1741,France,United States of America
1742,Philippines,United States of America
1743,France,United States of America
1744,Romania,Russia
1745,Romania,Russia
1746,Romania,Russia


In [29]:
# Check in 'source'
aliases_in_source = set(relationship_df['source']).intersection(all_aliases)

# Check in 'target'
aliases_in_target = set(relationship_df['target']).intersection(all_aliases)

print("Aliases still in 'source' column:", aliases_in_source)
print("Aliases still in 'target' column:", aliases_in_target)

Aliases still in 'source' column: set()
Aliases still in 'target' column: set()


In [30]:
# Checking if any aliases remain in either source or target columns of df

any_alias_in_source = any(x in all_aliases for x in relationship_df['source'])
any_alias_in_target = any(x in all_aliases for x in relationship_df['target'])

print(f"Aliases left in source? {any_alias_in_source}")
print(f"Aliases left in target? {any_alias_in_target}")

Aliases left in source? False
Aliases left in target? False


In [31]:
# Exporting dataframe

relationship_df.to_csv('country_relationships.csv')