In [48]:
import spacy
from spacy import displacy
#!python -m spacy download en_core_web_sm
import pandas as pd 
import numpy as np
import networkx as nx
import os
import matplotlib.pyplot as plt
import scipy
import re


In [3]:
# Load the spaCy English pipeline; the notebook uses it below to split the text
# into sentences and to extract named entities.
NER = spacy.load("en_core_web_sm")


In [4]:
# Read the whole source text, then flatten it to a single line by turning
# every newline into a space.
# NOTE(review): no encoding is passed, so the platform default is used, and
# errors='ignore' silently drops undecodable bytes -- confirm this is intended.
with open('historic_events_contents.txt', 'r', errors='ignore') as text_file:
    raw_text = text_file.read()

data = raw_text.replace('\n', ' ')



In [7]:
# Run the pipeline over the full text.
book = NER(data)

# Render the recognised entities for a token slice of the document only,
# rather than for the whole parsed text.
preview_span = book[273:20000]
displacy.render(preview_span, style="ent", jupyter=True)

In [8]:
# Evaluate whether the text needs wrangling—are there any special characters used? 
# Are the names of the countries in your list the same as the names in the text? 
# Write down your observations in a markdown cell and take the necessary steps to correct any issues you’ve found. 
# If anything does need correcting, make sure you save your file as a .txt.


In [9]:
# Special characters: the text includes quotation marks (""), which may need cleaning depending on the processing needs.
# Standard punctuation such as commas, periods, and parentheses is also present and may be subject to removal.
# Most country names in the text match the names in the country list (e.g., "Germany", "France", "Russia"). However, there are some differences:
# Serbia: the text refers to "Serbia" as "Serbian", which is tagged as NORP instead of GPE.

In [12]:
# One record per sentence: the sentence span itself plus the text of every
# entity found inside it.
sentence_rows = [
    {
        "sentence": sentence,
        "entities": [entity.text for entity in sentence.ents],
    }
    for sentence in book.sents
]

df_sentences = pd.DataFrame(sentence_rows)


In [13]:
# Preview the first 10 rows (sentence span and its list of entity texts).
df_sentences.head(10)


Unnamed: 0,sentence,entities
0,"(Historic, Key, Events, "", The, 20th, century,...","[Historic Key, The 20th century]"
1,"(The, World, Wars, sparked, tension, between, ...","[the Cold War, the Space Race, the World Wide ..."
2,"(These, advancements, have, played, a, signifi...","[the 21st century, today]"
3,"(Historic, events, in, the, 20th, century[edit...",[20th]
4,"(The, world, at, the, beginning, of, the, cent...",[the 20th century]
5,"(The, 1900s, saw, the, decade, herald, a, seri...","[The 1900s, the decade]"
6,"(From, 1914, to, 1918, ,, the, First, World, W...","[1914 to 1918, the First World War]"
7,"("", "", The, war, to, end, all, wars, "", "", :, ...",[World War I]
8,"(Main, article, :, World, War, I, Arrest, of, ...","[Main, World War I Arrest, Sarajevo, the Assas..."
9,"(The, war, and, by, extension, the, century, a...","[the century, Sarajevo, the Austro-Hungarian E..."


In [25]:
# Load the tab-separated country list; its 'Country' column is what the
# entity filter below matches against.
country_df = pd.read_csv('../documents/workspace/Countries-Occurences.txt', sep='\t')


In [28]:
def filter_entity(ent_list, country_df):
    """Keep only the entities that are listed as countries.

    Parameters
    ----------
    ent_list : list
        Entity texts to check (one sentence's entities in this notebook).
    country_df : pandas.DataFrame
        Frame with a 'Country' column holding the accepted names.

    Returns
    -------
    list
        The items of ``ent_list`` that occur in ``country_df['Country']``,
        in their original order and with duplicates kept.
    """
    # Build the lookup once per call instead of rebuilding a list for every
    # entity that is tested.
    known_countries = set(country_df['Country'])
    return [ent for ent in ent_list if ent in known_countries]
# Sanity check: show the column names of the country list.
print(country_df.columns)


Index(['Country', 'Occurrences'], dtype='object')


In [29]:
# For every sentence, keep only those entities that appear in the country list.
df_sentences['Country'] = df_sentences['entities'].apply(filter_entity, args=(country_df,))


In [30]:
# Keep only the sentences that mention at least one listed country.
has_country_mask = df_sentences['Country'].apply(len).gt(0)
df_sentences_filtered = df_sentences.loc[has_country_mask]

df_sentences_filtered.tail(10)

Unnamed: 0,sentence,entities,Country
1126,"(^, "", "", Major, milestones, of, Iran, 's, nuc...","[Iran, Iran, Al Jazeera]","[Iran, Iran]"
1136,"(^, "", "", The, division, of, Germany, -, The, ...","[Germany, The Cold War, 1945–1989]",[Germany]
1165,"("", "", The, forgotten, violence, that, helped,...",[India],[India]
1169,"("", "", Indian, Independence, Day, :, everythin...","[Indian Independence Day: everything you, Part...","[India, Pakistan]"
1246,"(The, Moldovans, :, Romania, ,, Russia, ,, and...","[Moldovans, Romania, Russia, the Politics of C...","[Romania, Russia]"
1259,"(Now, ,, North, Korea, may, be, the, one, true...","[North Korea, one]",[North Korea]
1309,"("", "", Selling, "", "", Operation, Passage, to, ...","[Operation Passage to Freedom, Thomas Dooley, ...",[Vietnam]
1342,"("", "", Stuck, in, Endless, Preliminaries, :, V...","[Stuck in Endless Preliminaries:, Vietnam, the...",[Vietnam]
1602,"(The, Rise, of, China, and, India, :, A, New, ...","[China, India, New Asian Drama]",[India]
1603,"(Singapore, :, World, Scientific, .)","[Singapore, World Scientific]",[Singapore]


In [45]:
# Defining relationships
#
# Countries mentioned close to each other in the text are linked: within each
# sliding window of sentences, every pair of consecutive (distinct) country
# mentions becomes one source -> target edge.

# Offset added to the window's first sentence label. The .loc slice below is
# label-based and includes both endpoints, so a window covers the labels
# start .. start + WINDOW_SIZE.
WINDOW_SIZE = 5
relationships = []  # one {"source", "target"} dict per edge

last_label = df_sentences_filtered.index[-1]

for start in range(last_label):
    end = min(start + WINDOW_SIZE, last_label)

    # Flatten the per-sentence country lists of this window into one list.
    window_countries = [
        country
        for countries in df_sentences_filtered.loc[start:end, "Country"]
        for country in countries
    ]

    # Collapse runs of the same country so a repeated mention does not
    # produce an edge from a country to itself.
    distinct_run = [
        country
        for pos, country in enumerate(window_countries)
        if pos == 0 or country != window_countries[pos - 1]
    ]

    # Pair each mention with the one that follows it (no pairs if fewer than two).
    for source, target in zip(distinct_run, distinct_run[1:]):
        # Key is "target" with no leading space, so the resulting column name
        # is exactly "target".
        relationships.append({"source": source, "target": target})

In [46]:
# Turn the list of edge dicts into a DataFrame, one row per edge.
relationship_df = pd.DataFrame(relationships)

# Display the resulting edge table.
relationship_df

Unnamed: 0,source,target
0,France,Russia
1,France,Russia
2,Russia,Germany
3,Germany,Bulgaria
4,France,Russia
...,...,...
575,India,Singapore
576,India,Singapore
577,India,Singapore
578,India,Singapore


In [47]:
# Save the edge table as a tab-separated text file, without the row index.
relationship_df.to_csv('../documents/workspace/Relationship_Dataframe.txt', sep='\t', index=False)
