In [2]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
!pip install datasets

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [12]:
import numpy as np
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## DATA:

In [6]:
data_path = 'drive/My Drive/DATA/UN_debates/'

In [7]:
df = pd.read_csv(data_path + 'un-general-debates.csv', encoding = 'utf-8-sig')

df.head()

Unnamed: 0,session,year,country,text
0,44,1989,MDV,﻿It is indeed a pleasure for me and the member...
1,44,1989,FIN,"﻿\nMay I begin by congratulating you. Sir, on ..."
2,44,1989,NER,"﻿\nMr. President, it is a particular pleasure ..."
3,44,1989,URY,﻿\nDuring the debate at the fortieth session o...
4,44,1989,ZWE,﻿I should like at the outset to express my del...


In [8]:
dff = (df
 [df.year >= 2010]
 [df.country == 'IND']
 .reset_index(drop = True)
 )

dff

  dff = (df


Unnamed: 0,session,year,country,text
0,68,2013,IND,"Let me first of all congratulate \nyou, Mr. Pr..."
1,67,2012,IND,﻿It is indeed a great privilege\nfor me to be ...
2,65,2010,IND,"Allow me to begin by \ncongratulating you, Sir..."
3,69,2014,IND,"At the outset, I would like \nto congratulate ..."
4,66,2011,IND,Allow me at the outset to \ncongratulate Mr. A...
5,70,2015,IND,The United Nations is marking its seventieth a...


In [10]:
def f_make_text_chunks(raw_txt):

    txt = raw_txt.replace('\ufeff', '')
    txt_chunks = re.split('\.\s*\n', txt)
    txt_chunks = [txt_chunk.replace('\n', '') for txt_chunk in txt_chunks]
    return txt_chunks


def f_chunk_docs(docs):

    txt_chunks = []
    for doc in docs:
        txt_chunks.extend(f_make_text_chunks(doc))
    return txt_chunks

In [14]:
idx = 5

raw_txt = dff.text.values[idx]
txt_chunks = f_make_text_chunks(raw_txt)

In [15]:
for i in txt_chunks[:5]:
    print(i)
    print()

The United Nations is marking its seventieth anniversary this year, making this session of the General Assembly a historic one. I hope that this year will also be historic for the United Nations in terms of outcomes. I would like to assure the President that he will receive India’s full support in his efforts

Seventy years ago, the foundations of the United Nations were laid at the San Francisco Conference, in a city on the west coast of this country. India was one of the signatory countries of the Charter of the United Nations, although at that time we were not independent. We obtained our independence two years later. When the United Nations was established, a rather diminutive-looking man with the powerful weapon of non-violence was writing the final act in a struggle that would become a symbol of hope for the colonized and the oppressed everywhere. I am grateful that the United Nations has proclaimed the birthday of this extraordinary man as the International Day of Non-Violence. 

### CREATING BOW/TFIDF VECTOR REPRESENTATIONS OF DOCS:

In [17]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), max_df=0.8, min_df=2)
bow_vectorizer = CountVectorizer(stop_words=stopwords.words('english'), max_df=0.8, min_df=2)

tfidf_vectors = tfidf_vectorizer.fit_transform(txt_chunks)
bow_vectors = bow_vectorizer.fit_transform(txt_chunks)

tfidf_vectors.shape, bow_vectors.shape

CPU times: user 9.26 ms, sys: 860 µs, total: 10.1 ms
Wall time: 24.1 ms


((26, 216), (26, 216))

In [24]:
def f_display_topics(model, vocab, no_top_words=7, n_components=8, max_iter=1000):

    try:
        text_model = model(n_components=n_components, max_iter=max_iter, random_state=7)
    except:
        text_model = model(n_components=n_components, n_iter=max_iter, random_state=7)
    doc_topic_matrix = text_model.fit_transform(tfidf_vectors)
    topic_word_matrix = text_model.components_

    for topic, topic_vec in enumerate(topic_word_matrix):
        topic_vec = abs(topic_vec)
        topic_vec = topic_vec/(topic_vec.sum())
        sorted_idx = topic_vec.argsort()[::-1]

        print(f'TOPIC {topic+1}:')
        print('-----------')

        words = []
        for i in range(no_top_words):
            word = vocab[sorted_idx[i]]
            words.append(word)
        print(words)
        print()
        print()

    return doc_topic_matrix


def f_display_docs(doc_topic_matrix, docs, topic, n_samples = 3, n_top = 10):

    cols = [i for i in range(1, doc_topic_matrix.shape[1]+ 1)]
    df_topics_docs = pd.DataFrame(doc_topic_matrix, columns = cols)

    print()
    print('----------------')
    top_sorted_idx = df_topics_docs[topic].argsort()[::-1][:n_top]
    idxs = sample(list(top_sorted_idx), n_samples)
    for i in idxs:
        print(docs[i])
        print()
        print('------------------')

### 1. NON NEGATIVE MATRIX FACTORIZATION:

In [19]:
from sklearn.decomposition import NMF

In [21]:
%%time

vocab = tfidf_vectorizer.get_feature_names_out()
nmf_doc_topic_matrix = f_display_topics(NMF, vocab)

TOPIC 1:
-----------
['united', 'nations', 'assembly', 'session', 'historic', 'peacekeepers', 'seventieth']


TOPIC 2:
-----------
['terrorism', 'international', 'terrorists', 'terrorist', 'pakistan', 'border', 'community']


TOPIC 3:
-----------
['east', 'region', 'reached', 'neighbourhood', 'also', 'major', 'summit']


TOPIC 4:
-----------
['development', 'goals', 'sustainable', 'government', 'change', 'prime', 'comprehensive']


TOPIC 5:
-----------
['global', 'efforts', 'action', 'support', 'weapon', 'state', 'non']


TOPIC 6:
-----------
['peacekeeping', 'operations', 'conflict', 'security', 'personnel', 'must', 'even']


TOPIC 7:
-----------
['banyan', 'tree', 'united', 'nations', '70', 'well', 'growing']


TOPIC 8:
-----------
['whether', 'answer', 'ask', 'conflicts', 'institution', 'response', 'peace']


CPU times: user 19.9 ms, sys: 4.02 ms, total: 24 ms
Wall time: 39.8 ms


In [25]:
topic = 2
f_display_docs(nmf_doc_topic_matrix, txt_chunks, topic)


----------------
While on the subject of terrorism, I take this opportunity to share the challenges that we face in our ties with Pakistan. None of us can accept that terrorism is a legitimate instrument of statecraft. The world shared our outrage at the 2008 Mumbai terror attacks, in which citizens of many nations were helplessly butchered. That the mastermind behind the attack is walking free is an affront to the entire international community. Not only have past assurances in this regard not been honoured, but new cross-border terrorist attacks have taken place recently, and two terrorists from across the border have been captured alive. We all know that these attacks are meant to destabilize India and legitimize Pakistan’s illegal occupation of parts of the Indian State of Jammu and Kashmir and its claim to the rest of it

------------------
Equally important, the establishment of an international legal regime, under a comprehensive convention on international terrorism, can no lo

In [27]:
topic = 3
f_display_docs(nmf_doc_topic_matrix, txt_chunks, topic)


----------------
We remain committed to the Middle East peace process, which is key to preventing further radicalization of the region. We continue to hope that asolution can be reached to the Palestinian question. We have also qualitatively upgraded our relations with all the major Powers

------------------
Natural disasters and man-made conflicts also deserve our coordinated response. In recent months, the international community has been found wanting in its response to the refugee crisis caused by the conflict situations in Syria, Iraq and Libya. What is needed is a swift response backed by political will. Our own response to humanitarian crises in our neighbourhood has been quick, responsive and holistic. Whether in Nepal or Yemen, India has emerged as a net security provider, assisting not only our nationals but those of other countries that sought our help. India will be hosting the first Asian Ministerial Conference on Disaster Risk Reduction in November 2016, when the topic 

### 2. SINGULAR VALUE DECOMPOSITION:

In [28]:
from sklearn.decomposition import TruncatedSVD

In [29]:
%%time

svd_doc_topic_matrix = f_display_topics(TruncatedSVD, vocab)

TOPIC 1:
-----------
['nations', 'united', 'terrorism', 'international', 'india', 'one', 'world']


TOPIC 2:
-----------
['united', 'terrorism', 'nations', 'banyan', 'international', 'tree', 'historic']


TOPIC 3:
-----------
['terrorism', 'east', 'region', 'reached', 'also', 'terrorists', 'neighbourhood']


TOPIC 4:
-----------
['peacekeeping', 'security', 'efforts', 'council', 'ask', 'well', 'peace']


TOPIC 5:
-----------
['east', 'global', 'terrorism', 'peacekeepers', 'region', 'ask', 'reached']


TOPIC 6:
-----------
['peacekeeping', 'tree', 'banyan', 'year', 'one', 'terrorism', 'well']


TOPIC 7:
-----------
['efforts', 'global', 'action', 'tree', 'well', 'banyan', 'peacekeepers']


TOPIC 8:
-----------
['peacekeeping', 'whether', 'answer', 'response', 'ask', 'conflicts', 'operations']


CPU times: user 196 ms, sys: 0 ns, total: 196 ms
Wall time: 199 ms


In [32]:
topic = 1
f_display_docs(svd_doc_topic_matrix, txt_chunks, topic)


----------------
At 70, unlike the men and women who might have planted its seed, the banyan is still young. But without constant regeneration and expansion, it withers away. That could be the fate that awaits the United Nations. Either we can take this historic opportunity to renew this critical institution or we can condemn it to irrelevance and tragic withering. We could easily lose ourselves in a labyrinth of our own creation, but if we seize the day, we will see a United Nations growing to its full potential, a United Nations that can reduce the gap between what it does and what it is capable of doing, a mighty banyan tree providing the canopy for a peaceful and prosperous humankind. I end my statement with the hope that the United Nations will grow into such a banyan tree

------------------
The seventieth year of any human being’s life is a milestone from which to look back and reflect on what one has achieved and what one has lost. Similarly, for people associated with an inst

In [35]:
topic = 2
f_display_docs(svd_doc_topic_matrix, txt_chunks, topic)


----------------
As we mark the seventieth anniversary of the United Nations, I take this opportunity to pay tribute to the more than 3,300 peacekeepers, including 161 from India, who have made the ultimate sacrifice. We stand ready to contribute to the memorial wall for United Nations peacekeepers, which was approved by the General Assembly at its sixty-ninth session

------------------
Seventy years ago, the foundations of the United Nations were laid at the San Francisco Conference, in a city on the west coast of this country. India was one of the signatory countries of the Charter of the United Nations, although at that time we were not independent. We obtained our independence two years later. When the United Nations was established, a rather diminutive-looking man with the powerful weapon of non-violence was writing the final act in a struggle that would become a symbol of hope for the colonized and the oppressed everywhere. I am grateful that the United Nations has proclaimed t

### 3. LATENT DIRICHLET ALLOCATION:

In [36]:
from sklearn.decomposition import LatentDirichletAllocation

In [37]:
%%time

lda_doc_topic_matrix = f_display_topics(LatentDirichletAllocation, vocab)

TOPIC 1:
-----------
['united', 'nations', 'banyan', 'international', 'terrorists', 'action', 'must']


TOPIC 2:
-----------
['weapon', 'state', 'non', 'global', 'support', 'conflict', 'peacekeeping']


TOPIC 3:
-----------
['terrorism', 'us', 'pakistan', 'border', 'well', 'regard', 'institution']


TOPIC 4:
-----------
['response', 'peacekeepers', 'india', 'made', 'united', 'nations', 'renewal']


TOPIC 5:
-----------
['nations', 'terrorism', 'security', 'ask', 'united', 'year', 'would']


TOPIC 6:
-----------
['peacekeeping', 'africa', 'leaders', 'operations', 'summit', 'time', 'countries']


TOPIC 7:
-----------
['development', 'neighbourhood', 'east', 'policy', 'banyan', '70', 'sustainable']


TOPIC 8:
-----------
['efforts', 'preventing', 'reached', 'major', 'east', 'also', 'question']


CPU times: user 1.66 s, sys: 4.9 ms, total: 1.66 s
Wall time: 1.67 s


In [39]:
topic = 1
f_display_docs(lda_doc_topic_matrix, txt_chunks, topic)


----------------
We remain committed to the Middle East peace process, which is key to preventing further radicalization of the region. We continue to hope that asolution can be reached to the Palestinian question. We have also qualitatively upgraded our relations with all the major Powers

------------------
Seventy years ago, the foundations of the United Nations were laid at the San Francisco Conference, in a city on the west coast of this country. India was one of the signatory countries of the Charter of the United Nations, although at that time we were not independent. We obtained our independence two years later. When the United Nations was established, a rather diminutive-looking man with the powerful weapon of non-violence was writing the final act in a struggle that would become a symbol of hope for the colonized and the oppressed everywhere. I am grateful that the United Nations has proclaimed the birthday of this extraordinary man as the International Day of Non-Violence. 

In [41]:
topic = 2
f_display_docs(lda_doc_topic_matrix, txt_chunks, topic)


----------------
As a nuclear-weapon State, India is aware of its responsibility, and its support for global, non-discriminatory and verifiable nuclear disarmament is undiminished

------------------
Africa is a region with which we have historical bonds, a solidarity born out of a common struggle against colonialism and a belief in a future of shared prosperity. Later this month we will host the third India- Africa Forum Summit, the first time that the leaders of all 54 African countries have been invited

------------------
Yet, when we ask ourselves whether we have been able to prevent conflicts taking place in various parts of the world, the answer is “no”. If we ask whether we were able to find permanent solutions to those conflicts, the answer is “no”. If we ask whether we were able to show the path of peace to a world which is treading the path of violence, the answer is “no”. According to those parameters, the United Nations appears to be ineffective at maintaining internation

### CONCLUSIONS:

- It was found that NMF technique yielded the best results it terms of   
topic detection and documents associated with each topic.  
It also took the least amout of time to complete the taskv~ 500ms

- SVD & LDA techniques gave less desirable results, with repreations in  
keywords across various topics.    
The documents associated with each topic was not found to be very congruent   
with the topic keywords.  
SVD took ~ 17 secs and LDA took ~ 9 mins to complete the task.