In [1]:
%pip install geonamescache pycountry



In [2]:
#Using Google Collab so we can use the Collab Pro subcription to run faster the calculation
from google.colab import drive
drive.mount('/content/drive')
PATH = "/content/drive/My Drive/Deeplearning/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd

RAW = "Output/all_headlines.csv"
df_combined = pd.read_csv(PATH + RAW)

Clean & parse dates

In [4]:
import re

def clean_text(text: str) -> str:
    text = re.sub(r'\s+', ' ', str(text)).strip()  # collapse multiple spaces, newlines, tabs, etc.
    return text

# Clean headlines and descriptions
df_combined['Headlines'] = df_combined['Headlines'].map(clean_text)
df_combined['Description'] = df_combined['Description'].map(clean_text)

df_combined.to_csv(PATH+'Output/data clean text.csv', index=False)

Please do the following in the terminal

----
--> (re‑install spaCy with its transformer extras, in case they were skipped) \
pip install -U "spacy[transformers]"

--> download the 540 MB English transformer pipeline \
python -m spacy download en_core_web_lg


In [8]:
#!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load spaCy transformer model

In [5]:
import spacy

nlp = spacy.load("en_core_web_lg")          # uses a RoBERTa encoder under the hood
#nlp.max_length = 20000                        # just in case


geo‑resolver + Function: extract unique countries mentioned in a headline

In [8]:
import spacy
import pycountry
import pandas as pd

# Load the worldcities data and filter for primary capitals
worldcities = pd.read_excel(PATH+'simplemaps_worldcities_basicv1.77/worldcities.xlsx')
primary_capitals = worldcities[worldcities['capital'] == 'primary']

# Create the city2country dictionary using primary capitals
city2country = dict(zip(primary_capitals['city_ascii'].str.lower(), primary_capitals['iso3']))

# Leader mapping
leader2country = {
    "donald trump": "USA",
    "trump": "USA",
    "xi jinping": "CHN",
    "xi": "CHN",
    "jinping": "CHN",
    "shinzō abe": "JPN",
    "shinzo abe": "JPN",
    "abe": "JPN",
    "angela merkel": "DEU",
    "merkel": "DEU",
    "emmanuel macron": "FRA",
    "macron": "FRA",
    "theresa may": "GBR",
    "boris johnson": "GBR",
    "narendra modi": "IND",
    "modi": "IND",
    "vladimir putin": "RUS",
    "putin": "RUS",
    "justin trudeau": "CAN",
    "trudeau": "CAN",
    "moon jae-in": "KOR",
    "jae-in": "KOR"
}

# quick helper to convert ISO alpha‑3 to full country name
def iso3name(iso):
    try:
        return pycountry.countries.get(alpha_3=iso).name  # Use alpha_3 instead of alpha_2
    except:
        return None

# Updated get_countries function
def get_countries(text: str) -> list[str]:
    doc = nlp(text)
    out = []

    text_lower = text.lower()
    # Check for political leaders manually
    for name, code in leader2country.items():
        if name in text_lower:
            out.append(code)

    for ent in doc.ents:
        if ent.label_ == "GPE":
            token = ent.text.lower()

            # Special mapping
            if token in ["us", "u.s.", "u.s"]:
                out.append("USA")
                continue
            if token in ["uk"]:
                out.append("GBR")
                continue

            # Prioritize your mapping for known cities
            if token in city2country:
                country_code = city2country[token]
                if iso3name(country_code):
                    out.append(country_code)
            # Defer to spaCy for other entities
            else:
                try:
                    code = pycountry.countries.lookup(token).alpha_3
                    out.append(code)
                except LookupError:
                    pass

    return list(set(out))

# Test cases
print(get_countries("Washington warns Beijing on chip export controls"))
print(get_countries("Paris ask Tokyo to buy planes"))
print(get_countries("Governor of Canberra asked appointment with France Prime Minister"))
print(get_countries("New Delhi  want to buy water from Amsterdam"))
print(get_countries("U.S. imposes new sanctions on Moscow"))
print(get_countries("UK and Germany discuss energy transition"))
print(get_countries("Trump and Macron spoke about China"))
print(get_countries("Putin and Abe agreed on new deals"))


['USA', 'CHN']
['FRA', 'JPN']
['FRA', 'AUS']
['NLD', 'IND']
['USA', 'RUS']
['GBR', 'DEU']
['USA', 'FRA', 'CHN']
['RUS', 'JPN']


Apply extractor

In [10]:
# Charger le dataset nettoyé
dev = pd.read_csv(PATH+'Output/data clean text.csv')

In [11]:
# Nettoyer les descriptions (au cas où)
def clean_text(text: str) -> str:
    return re.sub(r'\s+', ' ', str(text)).strip()

dev['Description'] = dev['Description'].map(clean_text)

In [15]:
from tqdm import tqdm
# Appliquer la détection de pays
tqdm.pandas()
dev["country_list"] = dev["Description"].progress_apply(get_countries)

# Compter combien de pays ont été détectés
dev["n_countries"] = dev["country_list"].str.len()

# Affichage au format demandé
print("\nn_countries")
print(dev["n_countries"].value_counts().sort_index())

# Exporter le DataFrame enrichi pour éviter de rerun les pays à chaque fois
dev.to_csv(PATH+'Output/headlines_with_countries.csv', index=False)

print("✅ Export terminé vers: Data/headlines_with_countries.csv")


100%|██████████| 35850/35850 [05:21<00:00, 111.37it/s]



n_countries
n_countries
0    18478
1    12683
2     3949
3      619
4      104
5       10
6        4
7        2
8        1
Name: count, dtype: int64
✅ Export terminé vers: Data/headlines_with_countries.csv


Sanity check


In [16]:
dev["n_countries"] = dev["country_list"].str.len()
print(dev["n_countries"].value_counts().head())


n_countries
0    18478
1    12683
2     3949
3      619
4      104
Name: count, dtype: int64


**Ligne avec 1 pays**

In [17]:
import pandas as pd

# 1. Charger le fichier enrichi
dev = pd.read_csv(PATH+'Output/headlines_with_countries.csv')

# 2. Convertir country_list en liste réelle si elle est stringifiée
import ast
dev['country_list'] = dev['country_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# 3. Recalculer n_countries au cas où
dev['n_countries'] = dev['country_list'].str.len()

# 4. Filtrer pour n_countries == 1
dev_single = dev[dev['n_countries'] == 1].copy()

# 5. Extraire le pays unique
def extract_single_country(countries):
    return countries[0] if isinstance(countries, list) and len(countries) == 1 else None

dev_single['country'] = dev_single['country_list'].apply(extract_single_country)

# 6. Garder les colonnes importantes
dev_single = dev_single[['date', 'Headlines', 'Description', 'source', 'country']]

# 7. Aperçu
print(dev_single.head())

# 8. Export
dev_single.to_csv(PATH+'Output/articles_with_one_country.csv', index=False)

# 9. Top 5 des pays
top_countries = dev_single['country'].value_counts().head(5)
print("\nTop 5 des pays les plus représentés (n_countries == 1) :")
print(top_countries)


           date                                          Headlines  \
22   2018-01-04  Cramer: The US-China 'trade war' could explode...   
61   2018-01-12  Cramer: 'This time it's different' can actuall...   
71   2018-01-17  Cramer calls Apple's $350 billion investment i...   
117  2018-01-30  Top steel CEO says US 'desperately' needs new ...   
126  2018-02-01  With gasoline exports on the rise, refinery CE...   

                                           Description source country  
22   Jim Cramer made the case for President Donald ...   CNBC     USA  
61   Jim Cramer said the success of Facebook, Amazo...   CNBC     JPN  
71   Jim Cramer interviewed Apple CEO Tim Cook afte...   CNBC     USA  
117  Jim Cramer speaks to Nucor Chairman and CEO Jo...   CNBC     USA  
126  Jim Cramer sits down with Marathon Petroleum C...   CNBC     USA  

Top 5 des pays les plus représentés (n_countries == 1) :
country
USA    6335
CHN    2316
JPN     716
DEU     401
IND     292
Name: count, dtype: i

**Ligne avec 2 pays**

In [18]:
import pandas as pd
from itertools import combinations
import ast

# 1. Charger le fichier enrichi
dev = pd.read_csv(PATH+'Output/headlines_with_countries.csv')

# 2. Convertir country_list en liste réelle si elle est stringifiée
dev['country_list'] = dev['country_list'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# 3. Recalculer n_countries au cas où
dev['n_countries'] = dev['country_list'].str.len()

# 4. Filtrer les lignes avec exactement deux pays
dev_pairs = dev[dev['n_countries'] == 2].copy()

# 5. Générer les paires de pays triées (ex: FRA-USA)
def generate_country_pair(countries: list[str]) -> str:
    return '-'.join(sorted(countries))

dev_pairs['country_pair'] = dev_pairs['country_list'].apply(generate_country_pair)

# 6. Garder les colonnes principales
dev_pairs = dev_pairs[['date', 'Headlines', 'Description', 'source', 'country_pair']]

# 7. Aperçu
print(dev_pairs.head())

# 8. Export vers CSV
dev_pairs.to_csv(PATH+'Output/paired_countries.csv', index=False)

# 9. Top 5 des paires
top_pairs = dev_pairs['country_pair'].value_counts().head(5)
print("\nTop 5 des paires de pays les plus représentées :")
print(top_pairs)


           date                                          Headlines  \
50   2018-01-10  Cramer makes the bull case for China's move to...   
208  2018-03-20  Cambridge Analytica played key Trump campaign ...   
214  2018-03-20  Amazon is now second most valuable U.S.-listed...   
228  2018-03-20  New Zealand court rejects Megaupload founder's...   
230  2018-03-20  France's Le Maire blames 'unfair trade' for gl...   

                                           Description   source country_pair  
50   Jim Cramer laid out the benefits of China pote...     CNBC      CHN-USA  
208  The suspended chief executive of UK-based poli...  Reuters      GBR-USA  
214  Amazon.com became the second most valuable pub...  Reuters      JPN-USA  
228  A New Zealand court rejected on Wednesday inte...  Reuters      NZL-USA  
230  Unfair trade and an overproduction of steel ar...  Reuters      ARG-FRA  

Top 5 des paires de pays les plus représentées :
country_pair
CHN-USA    1767
JPN-USA     192
CHN-MEX   