### 1.6-Intro to NLP and Network Analysis

In [4]:
# Importing libraries

import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

In [5]:
# Downloading spaCy's small English model (en_core_web_sm) through a shell command; requires network access

!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m32.9 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Loading the small English spaCy pipeline; `NER` is the callable used below to parse the text

NER = spacy.load("en_core_web_sm")

#### Import 20th century text

In [9]:
# Path to the 20th-century text file. NOTE(review): absolute path on one machine; adjust before running elsewhere
file_path = '/Users/vickyczada/Library/CloudStorage/OneDrive-Personal/Documents/Career Foundry/Data Visualisation with Python/20th-century/20th_Century_Wiki_2.txt'

In [10]:
# Import txt file

# The whole file is read into `data` as a single string with newlines replaced by spaces.
# No encoding is given, so the platform default is used, and errors='ignore' silently
# drops any bytes that cannot be decoded.
try:
    with open(file_path, 'r', errors='ignore') as file:
        data = file.read().replace('\n', ' ')
    print("File successfully read.")
except FileNotFoundError:
    print(f"File not found: {file_path}")
    # Re-raise: without the file `data` is never defined, and continuing would only
    # surface later as an unrelated NameError in the next cell.
    raise

File successfully read.


In [11]:
# Run the spaCy pipeline over the full text; `book` is the parsed document used by the cells below
book = NER(data)

In [18]:
# Listing the named entities found in a slice of the document

# Reuse the pipeline already loaded as `NER` instead of importing spaCy and loading the
# same model a second time; `nlp` is kept as an alias for that pipeline.
nlp = NER

# `book` is expected to be a spaCy Doc already; it is only parsed here if it is not one.
if not isinstance(book, spacy.tokens.doc.Doc):
    book = nlp(book)

# Plain-text alternative to displacy.render: collect the text, character offsets and
# label of every entity found in tokens 273 up to (not including) 20000.
entities = [
    {
        'text': ent.text,
        'start': ent.start_char,
        'end': ent.end_char,
        'label': ent.label_
    }
    for ent in book[273:20000].ents
]
print(entities)



#### Import list of countries

In [22]:
# Load the reference table of country names from CSV (decoded as Latin-1).
# NOTE(review): absolute path on one machine; adjust before running elsewhere.
csv_file_path = (
    '/Users/vickyczada/Library/CloudStorage/OneDrive-Personal/Documents/Career Foundry/Data Visualisation with Python/20th-century/countries_list_20th_century_1.5.csv'
)
countries_df = pd.read_csv(filepath_or_buffer=csv_file_path, encoding='latin1')

#### Checking whether the country names in the list match the names in the text

To compare the country names found in the text (`extracted_countries`) with those in the CSV (`country_list`), both are collected as lists and then converted to sets.

In [23]:
# Country names exactly as stored in the 'country_name' column of the CSV (no clean-up yet)
country_list = countries_df['country_name'].tolist()

# Texts of all entities labelled 'GPE' (geopolitical entity), de-duplicated and sorted
gpe_mentions = (ent.text for ent in book.ents if ent.label_ == 'GPE')
extracted_countries = sorted(set(gpe_mentions))

# Both sides as sets so they can be compared with set differences
country_set = set(country_list)
extracted_country_set = set(extracted_countries)

# CSV names that never appear as a GPE entity, and GPE entities that are not in the CSV
missing_from_text = country_set.difference(extracted_country_set)
extra_in_text = extracted_country_set.difference(country_set)

print("Countries in CSV but not in text:", missing_from_text)
print("Countries in text but not in CSV:", extra_in_text)

Countries in CSV but not in text: {'   Grenada ', '  Uganda ', '  Albania ', '   Mongolia ', '  Angola ', '   South Sudan ', '   Japan ', '  Bolivia ', '  Australia ', '  Congo, Republic of the ', '   Nigeria ', '   Turkmenistan ', '   Mauritania ', '   Singapore ', '  Bahrain ', '  Denmark ', '   Luxembourg', '    Vatican City (Holy See) ', '   Norway', '   Philippines ', '   Ivory Coast', '   Seychelles ', '  Armenia ', '  Burkina Faso ', '  Algeria ', '   Kenya ', '   North Macedonia ', '   Palestine ', '   Saint Vincent and the Grenadines ', '  Yemen', '   Saudi Arabia ', '   Libya ', '  Fiji ', '   Marshall Islands ', '   Sahrawi Arab Democratic Republic ', '  Andorra ', '   France', '  Burundi', '   Latvia ', '   Papua New Guinea ', '   Gambia, The ', '   Kuwait ', '   Sudan ', '   Nicaragua ', '  Tajikistan ', '   Sri Lanka ', '  Brunei ', '  Azerbaijan', '   Transnistria', '  Central African Republic ', '  Gabon ', '   South Ossetia ', '   San Marino ', '   Jordan', '   Iraq ',

#### Observations from comparing the country names extracted from the text with those in the CSV file

- Some country names in the CSV file had leading or trailing spaces that were not present in the text. Examples include "Samoa", "Slovenia", and "Vietnam".
- To ensure a consistent match between entities in the text and those in the CSV, I standardized the naming conventions by removing these leading and trailing spaces from the CSV entries.
- Additionally, some entities tagged as GPE in the text, such as "Tayyab" and "Buchenwald", did not correspond to any country names in the CSV file.
- My aim was to align the names in both the text and the CSV file to facilitate accurate matching of entities.

In [25]:
# Normalising the country names: surrounding whitespace removed, everything lower-cased

normalized_country_list = []
for raw_name in countries_df['country_name']:
    normalized_country_list.append(raw_name.strip().lower())

# Set form of the normalised names, used for membership tests when filtering entities
country_set = set(normalized_country_list)

### Getting named entity list per sentence

In [26]:
# Build one record per sentence of the book: the sentence text and the texts of the
# entities in that sentence that are labelled 'GPE'
df_sentences = [
    {
        "sentence": sent.text,
        "entities": [ent.text for ent in sent.ents if ent.label_ == 'GPE'],
    }
    for sent in book.sents
]

In [27]:
# Turn the list of per-sentence records into a DataFrame with 'sentence' and 'entities' columns
df_sentences = pd.DataFrame(df_sentences)

### Filtering entities from the text

In [28]:
# Function to keep only the entities that are of interest (known countries)

def filter_entity(ent_list, country_set):
    """Return the entities from ``ent_list`` that name a known country.

    Each entity is compared after stripping surrounding whitespace and
    lower-casing it. Matching entities are returned unchanged (original
    casing and whitespace) and in their original order.

    Parameters
    ----------
    ent_list : iterable of str
        Entity texts to check.
    country_set : container of str
        Country names to match against; expected to be stripped and
        lower-cased already, since entities are compared in that form.

    Returns
    -------
    list of str
        The entities whose normalised form is in ``country_set``.
    """
    return [ent for ent in ent_list if ent.strip().lower() in country_set]

In [29]:
# Applying the filter to the 'entities' column.
# Entities are stripped and lower-cased before filtering, so the kept names are lower-case here.

lowered_entities = df_sentences['entities'].apply(lambda names: [name.strip().lower() for name in names])
df_sentences['countries_entities'] = lowered_entities.apply(lambda names: filter_entity(names, country_set))

In [30]:
# Restoring the proper capitalisation of each entity in 'countries_entities'

# str.capitalize() upper-cases the first character and lower-cases all the others, so
# multi-word names came out as "Sri lanka" or "Saudi arabia". Instead, look each name up
# (stripped and lower-cased) in the CSV and use the CSV spelling without its surrounding
# whitespace. Names that are not found keep the previous capitalize() behaviour.
canonical_names = {name.strip().lower(): name.strip() for name in countries_df['country_name']}

df_sentences['countries_entities'] = df_sentences['countries_entities'].apply(
    lambda x: [canonical_names.get(ent.strip().lower(), ent.capitalize()) for ent in x]
)

In [31]:
# Previewing the raw GPE entities of the first 20 sentences

entities_preview = df_sentences['entities'].head(20)
print("Entities in df_sentences:", entities_preview, sep="\n")

Entities in df_sentences:
0             []
1             []
2             []
3             []
4             []
5             []
6             []
7             []
8             []
9             []
10            []
11    [Download]
12            []
13            []
14            []
15            []
16            []
17            []
18            []
19            []
Name: entities, dtype: object


In [32]:
# Cross-check: the filtered country entities of the first 30 sentences

countries_preview = df_sentences['countries_entities'].head(30)
print("Filtered country entities with first letter uppercase:", countries_preview, sep="\n")

Filtered country entities with first letter uppercase:
0                    []
1                    []
2                    []
3                    []
4                    []
5                    []
6                    []
7                    []
8                    []
9                    []
10                   []
11                   []
12                   []
13                   []
14                   []
15                   []
16                   []
17                   []
18                   []
19                   []
20                   []
21                   []
22    [France, Austria]
23             [Russia]
24    [Germany, Russia]
25            [Germany]
26            [Germany]
27                   []
28                   []
29                   []
Name: countries_entities, dtype: object


In [33]:
# Keeping only the sentences that mention at least one country from the list

has_country = df_sentences['countries_entities'].map(len) > 0
df_sentences_filtered = df_sentences[has_country]

In [34]:
# Display the last 10 sentences that contain at least one country entity
df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,countries_entities
1112,"""The forgotten violence that helped India brea...",[India],[India]
1117,"""Indian Independence Day: everything you need ...",[Pakistan],[Pakistan]
1126,"^ ""The Philippines, 1898–1946 | US House of Re...",[Philippines],[Philippines]
1157,"""Colonial Cartographies, Postcolonial Borders,...",[Afghanistan],[Afghanistan]
1195,"The Moldovans: Romania, Russia, and the Politi...","[Romania, Russia]","[Romania, Russia]"
1258,"""Selling 'Operation Passage to Freedom': Dr. T...",[Vietnam],[Vietnam]
1290,"""Stuck in Endless Preliminaries: Vietnam and t...",[Vietnam],[Vietnam]
1563,"""Anti-American Behavior in the Middle East: Ev...",[Lebanon],[Lebanon]
1569,The Rise of China and India: A New Asian Drama.,[India],[India]
1570,Singapore: World Scientific. doi:10.1142/7381.,[Singapore],[Singapore]


#### Create Relationships

In [35]:
# Defining relationships

# Two countries are linked when they are mentioned one after the other inside a sliding
# window of sentences. `.loc` slices by index label and includes both endpoints, so the
# window starting at label i covers the filtered sentences labelled i through i + 5.
relationships = []  # one {"source", "target"} record per adjacent pair of countries

last_label = df_sentences_filtered.index[-1]

for i in range(last_label):
    end_i = min(i + 5, last_label)
    window = df_sentences_filtered.loc[i: end_i].countries_entities

    # Flatten the per-sentence lists of the window into one sequence of mentions
    char_list = [country for mentions in window for country in mentions]

    # Remove duplicated countries that are next to each other
    char_unique = [country for pos, country in enumerate(char_list)
                   if pos == 0 or country != char_list[pos - 1]]

    # Pair every country with the one mentioned right after it (no pairs if fewer than two)
    for a, b in zip(char_unique, char_unique[1:]):
        relationships.append({"source": a, "target": b})

In [36]:
# Turn the list of {"source", "target"} records into a DataFrame (one row per recorded pair)
relationship_df = pd.DataFrame(relationships)

In [37]:
# Display the source/target pairs collected so far (duplicates are still present at this point)
relationship_df

Unnamed: 0,source,target
0,France,Austria
1,France,Austria
2,Austria,Russia
3,France,Austria
4,Austria,Russia
...,...,...
558,India,Singapore
559,India,Singapore
560,India,Singapore
561,India,Singapore


In [38]:
# Treat a->b and b->a as the same pair: sort the two names within every row,
# so the alphabetically smaller one always ends up in 'source'

sorted_pairs = np.sort(relationship_df.values, axis=1)
relationship_df = pd.DataFrame(sorted_pairs, columns=relationship_df.columns)
relationship_df.head(5)

Unnamed: 0,source,target
0,Austria,France
1,Austria,France
2,Austria,Russia
3,Austria,France
4,Austria,Russia


In [39]:
# Summarising the interactions: every row counts as one interaction, and the counts are
# added up per (source, target) pair. sort=False keeps the pairs in order of first appearance.

relationship_df["value"] = 1
pair_groups = relationship_df.groupby(["source","target"], sort=False, as_index=False)
relationship_df = pair_groups.sum()

In [40]:
# Display the first 10 aggregated pairs with their interaction counts
relationship_df.head(10)

Unnamed: 0,source,target,value
0,Austria,France,6
1,Austria,Russia,5
2,Germany,Russia,21
3,Germany,Italy,25
4,Austria,Germany,10
5,Germany,Spain,2
6,France,Poland,15
7,France,Germany,17
8,Germany,Poland,23
9,Estonia,Germany,5


In [41]:
# Save the weighted pairs to a CSV in the current working directory.
# The DataFrame index is written as an extra first column because `index=False` is not passed.
relationship_df.to_csv('20th_century_relationship.csv')