In [None]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!pip install bertopic -q

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
from pprint import pprint as pp

# DATA:

In [None]:
data_path = 'drive/My Drive/DATA/UN_debates/'

In [None]:
df = pd.read_csv(data_path + 'un-general-debates.csv', encoding = 'utf-8-sig')
df.head()

Unnamed: 0,session,year,country,text
115,44,1989,IND,﻿\nMy delegation warmly welcomes the assumptio...
169,25,1970,IND,"40.\t Mr. President, I offer you our congratul..."
247,68,2013,IND,"Let me first of all congratulate \nyou, Mr. Pr..."
434,40,1985,IND,"Sir, I should like to begin by congratulating ..."
565,63,2008,IND,"May I first congratulate you, \nSir, on your e..."


In [None]:
# Keep only India's statements from 2010 onwards.
# Both conditions are combined into ONE boolean mask: chaining
# `df[mask_a][mask_b]` applies a mask built on the full `df` to an already
# filtered frame, which makes pandas warn that the
# "Boolean Series key will be reindexed to match DataFrame index".
dff = (df
 [(df.year >= 2010) & (df.country == 'IND')]
 .reset_index(drop = True)
 )

dff

  dff = (df


Unnamed: 0,session,year,country,text
0,68,2013,IND,"Let me first of all congratulate \nyou, Mr. Pr..."
1,67,2012,IND,﻿It is indeed a great privilege\nfor me to be ...
2,65,2010,IND,"Allow me to begin by \ncongratulating you, Sir..."
3,69,2014,IND,"At the outset, I would like \nto congratulate ..."
4,66,2011,IND,Allow me at the outset to \ncongratulate Mr. A...
5,70,2015,IND,The United Nations is marking its seventieth a...


### SAMPLE DATA:

In [None]:
def f_make_text_chunks(raw_txt):
    """Split one raw speech into paragraph-like text chunks.

    A chunk boundary is a full stop followed by optional whitespace and a
    line break; the boundary itself (including the full stop) is consumed by
    the split. Line breaks that remain inside a chunk are soft wraps and are
    replaced by a single space, so the words on either side are not glued
    together.

    Parameters
    ----------
    raw_txt : str
        Raw text of one document; any byte-order marks are removed.

    Returns
    -------
    list of str
        Non-empty chunks, in document order, without leading or trailing
        whitespace.
    """
    txt = raw_txt.replace('\ufeff', '')

    # Raw string: '\.' and '\s' are not valid escapes in a normal string
    # literal and trigger an invalid-escape warning on current Python versions.
    pieces = re.split(r'\.\s*\n', txt)

    txt_chunks = []
    for piece in pieces:
        # Collapse each wrapped line break (plus surrounding whitespace) into
        # one space instead of deleting it, which used to merge adjacent words.
        chunk = re.sub(r'\s*\n\s*', ' ', piece).strip()
        # A text ending in ".\n" produces an empty trailing piece; skip it so
        # no empty document is handed to downstream models.
        if chunk:
            txt_chunks.append(chunk)
    return txt_chunks


def f_chunk_docs(docs):
    """Chunk every document in `docs` and return all chunks as one flat list.

    Chunks appear in document order, and within a document in the order
    produced by `f_make_text_chunks`.
    """
    return [chunk
            for doc in docs
            for chunk in f_make_text_chunks(doc)]

In [None]:
# Position (row of `dff`) of the single speech that is chunked below.
idx = 5

raw_txt = dff.text.values[idx]
# NOTE: `txt_chunks` is notebook-level state — it is read as a global inside
# f_get_topic_docs and is the input to topic_model.fit_transform further down.
txt_chunks = f_make_text_chunks(raw_txt)

In [None]:
# Preview the first five chunks, pretty-printed and separated by a blank line.
for i in txt_chunks[:5]:
    pp(i)
    print()

('The United Nations is marking its seventieth anniversary this year, making '
 'this session of the General Assembly a historic one. I hope that this year '
 'will also be historic for the United Nations in terms of outcomes. I would '
 'like to assure the President that he will receive India’s full support in '
 'his efforts')

('Seventy years ago, the foundations of the United Nations were laid at the '
 'San Francisco Conference, in a city on the west coast of this country. India '
 'was one of the signatory countries of the Charter of the United Nations, '
 'although at that time we were not independent. We obtained our independence '
 'two years later. When the United Nations was established, a rather '
 'diminutive-looking man with the powerful weapon of non-violence was writing '
 'the final act in a struggle that would become a symbol of hope for the '
 'colonized and the oppressed everywhere. I am grateful that the United '
 'Nations has proclaimed the birthday of this extrao

## UTILITY FUNCTIONS:

In [None]:
def f_get_all_topics(topic_model):
    """Return each topic's representation in a frame indexed by topic id.

    Reads `topic_model.get_topic_info()`, keeps only the 'Topic' and
    'Representation' columns and moves 'Topic' into the index.
    """
    topic_info = topic_model.get_topic_info()
    representations = topic_info[['Topic', 'Representation']]
    return representations.set_index('Topic')


def f_get_topic_docs(topic_n, topic_model, docs = None):
    """Return the documents assigned to one topic, best examples first.

    Representative documents come first, followed by the remaining documents
    of the topic; each group is sorted by descending probability.

    Parameters
    ----------
    topic_n : int
        Topic id to select.
    topic_model : object
        Model exposing `get_document_info(docs)`, whose result has the columns
        'Document', 'Topic', 'Probability' and 'Representative_document'
        (matched case-insensitively).
    docs : list of str, optional
        The documents to look up. When omitted, the notebook-level
        `txt_chunks` is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.Series
        Document texts of the topic with a fresh 0..n-1 index.
    """
    if docs is None:
        # Backward-compatible default: fall back to the notebook-level chunks.
        docs = txt_chunks

    # rename() returns a new frame; the frame produced by the model is not mutated.
    df_topics = topic_model.get_document_info(docs).rename(columns = str.lower)

    in_topic = df_topics.topic == topic_n

    def _ranked_docs(mask):
        # Documents selected by `mask`, highest probability first.
        return (df_topics
                [mask]
                .sort_values(by = 'probability', ascending = False)
                .document)

    # Each selection uses ONE combined mask. Chaining two boolean selections
    # applied a mask built on the full frame to an already filtered frame and
    # made pandas emit a "Boolean Series key will be reindexed" warning.
    df_representative = _ranked_docs(
        in_topic & df_topics.representative_document.eq(True))
    df_not_representative = _ranked_docs(
        in_topic & df_topics.representative_document.eq(False))

    df_topic_docs = (pd.concat
     ([df_representative, df_not_representative])
    .reset_index(drop = True))

    return df_topic_docs


def f_get_topic_words(topic_model, topic_n):
    """Return the representation words of topic `topic_n` as one ', '-joined string."""
    topics = f_get_all_topics(topic_model)
    # The row for `topic_n` holds a single cell: the sequence of topic words.
    keywords = topics.loc[topic_n].values[0]
    return ', '.join(keywords)


def f_show_topics(topic_model):
    """Print every topic id, a separator line and the topic's keyword string."""
    for topic_id in f_get_all_topics(topic_model).index:
        print(f'TOPIC: {topic_id}')
        print('----------')
        keywords = f_get_topic_words(topic_model, topic_id)
        pp(keywords)
        print('')


def f_show_topic_docs(topic_n, topic_model, n_top_docs = 3):
    """Print a topic's keywords followed by its first `n_top_docs` documents.

    Documents are taken in the order returned by `f_get_topic_docs`; each one
    is pretty-printed and followed by a dashed separator line.
    """
    keywords = f_get_topic_words(topic_model, topic_n)

    # Header: topic id and keywords, then two blank lines.
    pp(f'TOPIC: {topic_n}')
    pp(keywords)
    print()
    print()

    print('SAMPLE DOCS:')
    print()

    sample_docs = f_get_topic_docs(topic_n, topic_model).iloc[:n_top_docs]
    for doc in sample_docs:
        pp(doc)
        print('------------')

# TOPIC MODELLING USING BERTOPIC:

In [None]:
%%time

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

from umap import UMAP
from hdbscan import HDBSCAN

CPU times: user 10.9 s, sys: 691 ms, total: 11.6 s
Wall time: 11.8 s


### TOPIC MODEL:

In [None]:
# Build the BERTopic pipeline from explicitly configured sub-models, so each
# stage is visible and tunable in this one cell.

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
# NOTE(review): n_neighbors=3 is very small — presumably chosen because only the
# chunks of a single speech are modelled; confirm before fitting on more text.
# random_state is pinned, presumably so repeated runs give the same result.
umap_model = UMAP(n_neighbors=3,
                  n_components=10,
                  random_state=42,
                  min_dist=0.0,
                  metric='cosine')

# Step 3 - Cluster reduced embeddings
# NOTE(review): min_cluster_size=3 allows clusters of only three chunks —
# confirm this is intended if the input grows.
hdbscan_model = HDBSCAN(min_cluster_size=3, metric='euclidean',
                        cluster_selection_method='eom',
                        prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tune topic representations with
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=ctfidf_model,
    representation_model=representation_model
)


%time topics, probs = topic_model.fit_transform(txt_chunks)

CPU times: user 11.3 s, sys: 1.7 s, total: 13 s
Wall time: 13 s


```
topic_model.push_to_hf_hub(
    repo_id="wizardofchance/NER_conllpp",
    save_ctfidf=True
)
```

## TOPICS IDENTIFIED:

In [None]:
f_show_topics(topic_model)

TOPIC: -1
----------
('sustainable, nations, revitalization, banyan, efforts, agreement, achieved, '
 'milestone, goals, humankind')

TOPIC: 0
----------
('peacekeeping, peacekeepers, nations, peace, nation, india, security, '
 'conflicts, council, conflict')

TOPIC: 1
----------
('terrorism, terrorists, terrorist, pakistan, menace, threat, disaster, '
 'crisis, india, nations')

TOPIC: 2
----------
('celebrating, solidarity, nations, africa, gandhi, anniversary, india, '
 'birthday, summit, conference')



## EXPLORING EACH TOPIC:

In [None]:
topic_n = -1

f_show_topic_docs(topic_n, topic_model)

'TOPIC: -1'
('sustainable, nations, revitalization, banyan, efforts, agreement, achieved, '
 'milestone, goals, humankind')


SAMPLE DOCS:

('The seventieth year of any human being’s life is a milestone from which to '
 'look back and reflect on what one has achieved and what one has lost. '
 'Similarly, for people associated with an institution, the seventieth year '
 'provides an opportunity to analyse whether the institution has fulfilled its '
 'purpose and achieved the goals set for it at its establishment. Today we '
 'need to ask ourselves whether we have fulfilled the purpose and achieved the '
 'goals for which the United Nations was established 70 years ago. When I ask '
 'myself that question, I receive an affirmative answer for some questions and '
 'a negative one for others. For example, the United Nations has been '
 'successful in preventing a third world war, assisting decolonization and '
 'dismantling apartheid. It has been successful in combating global epidemics, '

In [None]:
topic_n = 0

f_show_topic_docs(topic_n, topic_model)

'TOPIC: 0'
('peacekeeping, peacekeepers, nations, peace, nation, india, security, '
 'conflicts, council, conflict')


SAMPLE DOCS:

('Today, the world is ravaged by war on three continents, with the Security '
 'Council powerless or unwilling to stem the f low of blood. Traditional '
 'solutions that rely on force have only exacerbated problems. We must ask '
 'ourselves if we have the political will to craft alternatives to conflict '
 'and to pursue them with commitment and single-minded dedication. Nowhere is '
 'such a goal more important than in peacekeeping. Under the blue flag, men '
 'and women are constantly working to prevent conflict, protect civilians and '
 'sustain peace processes. With 180,000 peacekeepers deployed so far, India '
 'has been the largest contributor to international security provided by the '
 'United Nations. Even today, about 8,000 Indian military and police personnel '
 'are participating in 10 peacekeeping missions, operating in highly '
 'challengin

In [None]:
topic_n = 1

f_show_topic_docs(topic_n, topic_model)

'TOPIC: 1'
('terrorism, terrorists, terrorist, pakistan, menace, threat, disaster, '
 'crisis, india, nations')


SAMPLE DOCS:

('Equally important, the establishment of an international legal regime, under '
 'a comprehensive convention on international terrorism, can no longer be held '
 'up. Nineteen years ago, in 1996, India proposed such a regime at the United '
 'Nations, but the Member States have been unable to adopt it and have become '
 'mired in the issue of definition. We have to understand that no such '
 'distinction exists as good terrorists and bad terrorists, and that terrorism '
 'cannot be linked15-29658 43/55\x0c'
 'A/70/PV.22 01/10/2015to any religion. A terrorist is a terrorist, and anyone '
 'who commits crimes against humanity cannot have a religion. Therefore, I '
 'appeal to everyone to come together during this seventieth anniversary year '
 'of the United Nations and to pledge to unanimously adopt a comprehensive '
 'convention on international terrorism')
-

In [None]:
topic_n = 2

f_show_topic_docs(topic_n, topic_model)

'TOPIC: 2'
('celebrating, solidarity, nations, africa, gandhi, anniversary, india, '
 'birthday, summit, conference')


SAMPLE DOCS:

('The United Nations is marking its seventieth anniversary this year, making '
 'this session of the General Assembly a historic one. I hope that this year '
 'will also be historic for the United Nations in terms of outcomes. I would '
 'like to assure the President that he will receive India’s full support in '
 'his efforts')
------------
('Seventy years ago, the foundations of the United Nations were laid at the '
 'San Francisco Conference, in a city on the west coast of this country. India '
 'was one of the signatory countries of the Charter of the United Nations, '
 'although at that time we were not independent. We obtained our independence '
 'two years later. When the United Nations was established, a rather '
 'diminutive-looking man with the powerful weapon of non-violence was writing '
 'the final act in a struggle that would become a symbo

# CONCLUSIONS: