# 20th Century NLP Relationships

## Contents:
1. Library imports
2. Load 20th Century Page
3. Getting country entities
4. Relationships
5. Exports

### Library Imports

In [2]:
import pandas as pd 
import numpy as np
import spacy
from spacy import displacy
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re

!python -m spacy download en_core-web_sm


[38;5;1m[x] No compatible package found for 'en_core-web_sm' (spaCy v3.7.5)[0m



In [3]:
# Load spacy English module

NER = spacy.load("en_core_web_sm")

### Load 20th Century Page

In [6]:
# Load the 20th Century Page

with open('20th_century.txt', 'r', errors='ignore') as file: 
   data = file.read().replace( '\n', ' ')

In [7]:
data



##### Observations:
1. â€“ seems to appear for all dates and words that have a hyphen -

In [18]:
# Reloading 20th Century Page

with open('20th_century.txt', 'r', errors='ignore') as file: 
    data = file.read().replace('â€“', '-').replace('\n', ' ') #fix the hyphens

In [19]:
data



In [20]:
# Saving corrected 20th Century Page

with open('20th_century_cleaned.txt', 'w', encoding='utf-8') as file:
    file.write(data)

In [21]:
book = NER(data)

In [22]:
# Visualize identified entities

displacy.render(book[273:20000], style = "ent", jupyter = True)

### Get entities

In [23]:
df_sentences = []

# Loop through sentences, get entities labeled as GPE (Geopolitical Entity)

for sent in book.sents:
    entity_list = [ent.text for ent in sent.ents if ent.label_ == 'GPE']
    df_sentences.append({"sentence": str(sent), "entities": entity_list})

df_sentences = pd.DataFrame(df_sentences)

In [24]:
df_sentences.head(10)

Unnamed: 0,sentence,entities
0,The 20th century changed the world in unpreced...,[]
1,The World Wars sparked tension between countri...,[]
2,These advancements have played a significant r...,[]
3,Historic events in the 20th century[edit] Worl...,[]
4,The 1900s saw the decade herald a series of in...,[]
5,1914 saw the completion of the Panama Canal.,[]
6,"From 1914 to 1918, the First World War, and it...",[]
7,"""The war to end all wars"": World War I (1914-1...",[Sarajevo]
8,The war was precipitated by the Assassination ...,[Sarajevo]
9,After a period of diplomatic and military esca...,"[the British Empire, France, the Russian Empir..."


In [25]:
df_sentences.tail(10)

Unnamed: 0,sentence,entities
1762,Routledge.,[]
1763,p. 600.,[]
1764,ISBN 978-0-415-09311-8.,[]
1765,External links[edit],[]
1766,Wikimedia Commons has media related to Events ...,[]
1767,The 20th Century Research Project (archived 26...,[]
1768,Slouching Towards Utopia:,[]
1769,The Economic History of the Twentieth Century ...,[]
1770,TIME Archives The greatest writers of the 20th...,[]
1771,show vte Centuries and millennia show vte Hist...,[Centuries]


In [30]:
# Import df_countries

df_countries = pd.read_csv(r'C:\Users\walls\Documents\Coding\Data Analysis\CareerFoundry\Specialization\20th-Century\20th_century_countries.csv')

In [31]:
df_countries.head

<bound method NDFrame.head of      Unnamed: 0                          country_name
0             0                           Afghanistan
1             1                              Albania 
2             2                              Algeria 
3             3                              Andorra 
4             4                               Angola 
..          ...                                   ...
204         204     Sahrawi Arab Democratic Republic 
205         205                           Somaliland 
206         206                        South Ossetia 
207         207                               Taiwan 
208         208                          Transnistria

[209 rows x 2 columns]>

In [35]:
# Remove spacing from country names

df_countries['country_name'] = df_countries['country_name'].str.strip()

# Remove unnamed column 

df_countries.drop('Unnamed: 0', axis=1, inplace=True)

In [37]:
df_countries.head()

Unnamed: 0,country_name
0,Afghanistan
1,Albania
2,Algeria
3,Andorra
4,Angola


In [38]:
# Filtering for country entities

def filter_entity(ent_list, df_countries):
       return [ent for ent in ent_list
                  if ent in list(df_countries['country_name'])]

In [40]:
df_sentences['country_entities'] = df_sentences['entities'].apply(lambda x: filter_entity(x, df_countries))

In [42]:
df_sentences['country_entities'].head(20)

0                            []
1                            []
2                            []
3                            []
4                            []
5                            []
6                            []
7                            []
8                            []
9     [France, Austria, Russia]
10            [Germany, Russia]
11                    [Germany]
12                           []
13                    [Germany]
14                           []
15                           []
16                           []
17                           []
18                           []
19                           []
Name: country_entities, dtype: object

In [43]:
# Filtering entities with no characters 

df_sentences_filtered = df_sentences[df_sentences['country_entities'].map(len) > 0]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,country_entities
1182,"""The division of Germany - The Cold War (1945-...",[Germany],[Germany]
1216,"""The forgotten violence that helped India brea...",[India],[India]
1220,"""Indian Independence Day: everything you need ...","[India, Pakistan]","[India, Pakistan]"
1230,"""The Philippines, 1898-1946 | US House of Repr...",[Philippines],[Philippines]
1305,"The Moldovans: Romania, Russia, and the Politi...","[Romania, Russia]","[Romania, Russia]"
1370,"""Selling 'Operation Passage to Freedom': Dr. T...",[Vietnam],[Vietnam]
1405,"""Stuck in Endless Preliminaries: Vietnam and t...",[Vietnam],[Vietnam]
1702,"""Anti-American Behavior in the Middle East: Ev...",[Lebanon],[Lebanon]
1707,The Rise of China and India: A New Asian Drama.,"[China, India]",[India]
1708,Singapore: World Scientific.,[Singapore],[Singapore]


### Relationships

In [67]:
# Define window size
window_size = 5
relationships = []

# Loop through sentence indices
for i in range(df_sentences_filtered.index[-1]):
    end_i = min(i + window_size, df_sentences_filtered.index[-1])
    
    # Concatenate country entities in the window
    char_list = sum(df_sentences_filtered.loc[i:end_i, 'country_entities'], [])
    
    # Remove consecutive duplicates
    char_unique = [char_list[j] for j in range(len(char_list)) if (j == 0) or (char_list[j] != char_list[j-1])]
    
    # If more than one country, record relationships
    if len(char_unique) > 1:
        for idx, a in enumerate(char_unique[:-1]):
            b = char_unique[idx + 1]
            relationships.append({"source": a, "target": b})

In [68]:
df_relationships = pd.DataFrame(relationships)

In [69]:
df_relationships.head()

Unnamed: 0,source,target
0,France,Austria
1,Austria,Russia
2,France,Austria
3,Austria,Russia
4,Russia,Germany


In [70]:
# Create new df where source and target are in a consistent order

df_relationships = pd.DataFrame(np.sort(df_relationships.values, axis = 1), columns = df_relationships.columns)
df_relationships.head(5)

Unnamed: 0,source,target
0,Austria,France
1,Austria,Russia
2,Austria,France
3,Austria,Russia
4,Germany,Russia


In [71]:
# Check how many times source and target interact

df_relationships["value"] = 1 # Create new column
df_relationships= df_relationships.groupby(["source","target"], sort=False, as_index=False).sum() # group df 

In [74]:
df_relationships.head(10)

Unnamed: 0,source,target,value
0,Austria,France,6
1,Austria,Russia,6
2,Germany,Russia,21
3,Germany,Italy,22
4,Austria,Germany,10
5,Germany,Spain,1
6,France,Poland,11
7,France,Germany,29
8,Germany,Poland,26
9,Estonia,Germany,5


### Exports

In [76]:
# Export relationship dataframe

df_relationships.to_csv('20th_century_relationships.csv')

In [77]:
# Export cleaned countries df

df_countries.to_csv('df_countries_cleaned.csv')