In [9]:
import spacy
import pandas as pd
from spacy import displacy
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from functools import reduce
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

In [11]:
# helper functions
def cluster_visualise_scatter(df, num_clusters, col_name, dfname):
    documents = df[col_name].dropna().tolist()
    vectorizer = TfidfVectorizer(stop_words=stop_word_set)
    idf_vector = vectorizer.fit_transform(documents)
    
    kmeans=KMeans(n_clusters=num_clusters, random_state=42, n_init=10, max_iter=500)
    clusters = kmeans.fit_predict(idf_vector.toarray())
    labels = kmeans.labels_
    
    #dimensionality reduction for visualisation
    tsne = TSNE(n_components=2, random_state=42)
    X_tsne = tsne.fit_transform(idf_vector.toarray())
    
    #Visualisation
    plt.figure(figsize=(8, 6))
    plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=labels, cmap='viridis', marker='o')
    
    #plot and label centroids
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:,0], centroids[:,1], c='red', marker='x', s=200, linewidths=3)
    
    #label centroids
    for i, centroid in enumerate(centroids):
        plt.annotate(f'Cluster {i}', (centroid[0], centroid[1]), xytext=(10, 10),
                     textcoords='offset points',
                     fontweight='bold')
    text = 't-SNE Visualization of similar {dfname} excerpts'
    plt.title(text.format(dfname=dfname))
    plt.show() 

    return clusters

def visualize_wordcloud(df, num_clusters, col_name, dfname):
    documents = df[col_name].dropna().tolist()
    vectorizer = TfidfVectorizer(stop_words=stop_word_set)
    idf_vector = vectorizer.fit_transform(documents)
    
    kmeans=KMeans(n_clusters=num_clusters, random_state=42, n_init=10, max_iter=500)
    clusters = kmeans.fit_predict(idf_vector.toarray())
    labels = kmeans.labels_
    
    cluster_docs = []
    for cluster in range(num_clusters):
        current_docs = []
    
        for i in range(len(df[col_name])):
            if labels[i] == cluster:
                current_docs.append(df.at[i, col_name])
                
        cluster_docs.append(current_docs)

    vectorizers = []
    for docs in cluster_docs:
        vectorizers.append(vectorizer1.fit_transform(docs))

    #plot wordcloud for each cluster

    #create figure
    plt.figure(figsize=(20,15), facecolor = None)
    
    #dynamic subplot layout
    rows = int(np.ceil(num_clusters / 2))
    cols = 2 if num_clusters > 1 else 1
   
    idx = 1
    for v in vectorizers:
        tfidf_scores = v.sum(axis=0).A1
        
        # Create dictionary of words and their TF-IDF scores
        word_freq = dict(zip(feature_names, tfidf_scores))
    
        #visualize tf-idf frquencies using wordcloud
        wordcloud = WordCloud(width = 800, height = 800,
                    background_color ='white',
                    stopwords = stopwords,
                    min_font_size = 10).generate_from_frequencies(word_freq)
        plt.subplot(rows, cols, idx)
        plt.imshow(wordcloud)
        plt.title(f"Cluster {cluster}")
        plt.axis("off")
        plt.tight_layout(pad = 0)
        plt.title('Cluster' + str(idx))
        idx+=1
        
    plt.suptitle('K-Means Clustering - Top Clusters', fontsize=16, y=1.02)

In [4]:
#load datasets 
news_excerpts = pd.read_excel('news_excerpts_parsed.xlsx')
wikileaks = pd.read_excel('wikileaks_parsed.xlsx')

#import stopwords
nltk.download('stopwords')
nltk.download('punkt_tab')
stop_word_set = list(stopwords.words('english'))

#Stem words 
# porter = PorterStemmer()
# wikileaks['Stemmed'] = wikileaks['Text'].map(reduce(lambda x: stem_words(x)))
    
#displays visual representation of entity words
#displacy.render(text1, style='ent',jupyter=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\xavie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Tableau Prep

In [6]:
def get_word_count_df(df, stop_words):
    #get word_count dict for wikileaks excerpts
    count_vect = CountVectorizer(stop_words=stop_words)
    
    count_matrix = count_vect.fit_transform(df['Text'])
    vocab = count_vect.vocabulary_
    word_counts = count_matrix.sum(axis=0).A1
    
    #create dict of word counts
    word_freq = dict(zip(count_vect.get_feature_names_out(), word_counts))
    
    count_df = pd.DataFrame(word_freq.items(), columns=['Term', 'Count'])
    count_df = count_df.sort_values('Count', ascending=False)

    return count_df

def get_tfidf_count_df(df, stop_words):
    #get word_count dict for wikileaks excerpts
    tfidf_vect = TfidfVectorizer(stop_words=stop_words)
    
    tfidf_matrix = tfidf_vect.fit_transform(df['Text'])
    vocab = tfidf_vect.vocabulary_
    tfidf_counts = tfidf_matrix.sum(axis=0).A1
    
    #create dict of word counts
    tfidf_freq = dict(zip(tfidf_vect.get_feature_names_out(), tfidf_counts))
    
    tfidf_df = pd.DataFrame(tfidf_freq.items(), columns=['Term', 'Tfidf Sum'])
    tfidf_df = tfidf_df.sort_values('Tfidf Sum', ascending=False)

    return tfidf_df


wiki_count_df = get_word_count_df(wikileaks, stop_word_set)
news_count_df = get_word_count_df(news_excerpts, stop_word_set)
wiki_tfidf_df = get_tfidf_count_df(wikileaks, stop_word_set)
news_tfidf_df = get_tfidf_count_df(news_excerpts, stop_word_set)
print(news_count_df.head())
print(wiki_count_df.head())
print(news_tfidf_df.head())
print(wiki_tfidf_df.head())

            Term  Count
14312       said   1364
15073  singapore    608
18257       year    552
1077        also    426
17391         us    397
          Term  Count
226    airport    248
1877  official    238
2566     staff    154
2099  pristina    136
2846    vendor    126
            Term  Tfidf Sum
14312       said  39.050834
15073  singapore  27.670553
18257       year  23.424519
17391         us  20.786353
3360       china  20.559720
          Term  Tfidf Sum
226    airport   9.242965
1877  official   7.972161
2846    vendor   6.326134
2566     staff   5.823963
2099  pristina   5.638300


In [7]:
#combine both dataframes
news_term_df = news_count_df.merge(news_tfidf_df, on='Term')
wiki_term_df = wiki_count_df.merge(wiki_tfidf_df, on='Term')
print(news_term_df, wiki_term_df)

                Term  Count  Tfidf Sum
0               said   1364  39.050834
1          singapore    608  27.670553
2               year    552  23.424519
3               also    426  18.493828
4                 us    397  20.786353
...              ...    ...        ...
18411      insolvent      1   0.169491
18412    inspections      1   0.148635
18413  inspirational      1   0.119247
18414       inspires      1   0.162357
18415            蘇姿丰      1   0.110509

[18416 rows x 3 columns]                 Term  Count  Tfidf Sum
0            airport    248   9.242965
1           official    238   7.972161
2              staff    154   5.823963
3           pristina    136   5.638300
4             vendor    126   6.326134
...              ...    ...        ...
2941           abuse      1   0.118810
2942       intervene      1   0.091820
2943  interpretation      1   0.129827
2944         abusing      1   0.111075
2945         zealand      1   0.075207

[2946 rows x 3 columns]


In [8]:
#write both dfs to new excel file
with pd.ExcelWriter("./wikileaks_term_counts.xlsx") as writer:
    wiki_term_df.to_excel(writer)
    
with pd.ExcelWriter("./news_term_counts.xlsx") as writer:
    news_term_df.to_excel(writer)

In [1]:
#remove stopwords
# wikileaks['Words'] = wikileaks['Text'].map(lambda x: word_tokenize(x))
# wikileaks['Filtered'] = wikileaks['Words'].map(lambda x: [w for w in x if not w.lower() in stop_word_set])

def extract_entities(NER, text):
    """extract NERs from selected text and return it as a list of dictionaries"""
    doc = NER(text)
    return [
        {
            'text': ent.text,
            'label': ent.label_,
        }
        for ent in doc.ents
    ]
    
NER = spacy.load("en_core_web_sm")

# process labels for each entry, store as DF col
news_excerpts['Labels'] = news_excerpts['Text'].map(lambda x: extract_entities(NER, x))
wikileaks['Labels'] = wikileaks['Text'].map(lambda x: extract_entities(NER, x))
news_count_df['Labels'] = news_count_df['Term'].map(lambda x: extract_entities(NER, x))
print(news_count_df.head())

NameError: name 'spacy' is not defined

In [10]:
#write counts of labels to new excel file
with pd.ExcelWriter("./news_label_counts.xlsx") as writer:
    news_count_df.to_excel(writer)

In [3]:
def process_labels(df):
    """extract each label and store as new col in dataframe"""
    label_types = ['PERSON', 'ORG', 'GPE', 'DATE', 'MONEY', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART']
    for label in label_types:
        df[label] = df['Labels'].map(lambda excerpt: [x for x in excerpt if x.get('label') == label])

process_labels(wikileaks)
process_labels(news_excerpts)

news_excerpts.head()

NameError: name 'wikileaks' is not defined

In [12]:
#write processed df to a new excel sheet for tableau processing
with pd.ExcelWriter("./news_excerpts_labels.xlsx") as writer:
    news_excerpts.to_excel(writer)

In [13]:
#select only the extracted labels
print(news_excerpts.shape)
news_labels = news_excerpts.loc[:,'Persons':'Artwork']
news_labels.head()

(1509, 24)


Unnamed: 0,Persons,Organisations,Geopolitical Entities,Dates,Money,Location,Product,Events,Artwork
0,"[{'text': 'Mara-Louise Anzalone', 'label': 'PE...","[{'text': 'National Labor Relations Board', 'l...",[],"[{'text': 'Thursday', 'label': 'DATE', 'start'...",[],[],[],[],[]
1,"[{'text': 'Su Wenqiang', 'label': 'PERSON', 's...","[{'text': 'Bukit Timah', 'label': 'ORG', 'star...","[{'text': 'Singapore', 'label': 'GPE', 'start'...","[{'text': ""13 months'"", 'label': 'DATE', 'star...","[{'text': 'More than S$3 billion', 'label': 'M...",[],[],[],[]
2,[],"[{'text': 'Meta', 'label': 'ORG', 'start': 0, ...","[{'text': 'the United States', 'label': 'GPE',...","[{'text': 'Monday', 'label': 'DATE', 'start': ...","[{'text': '€1.2 billion', 'label': 'MONEY', 's...","[{'text': 'Europe', 'label': 'LOC', 'start': 4...",[],[],"[{'text': 'Facebook (FB', 'label': 'WORK_OF_AR..."
3,"[{'text': 'Zhang Ruijin', 'label': 'PERSON', '...",[],"[{'text': 'SINGAPORE', 'label': 'GPE', 'start'...","[{'text': '45-year-old', 'label': 'DATE', 'sta...",[],[],[],[],[]
4,[],"[{'text': 'The Department of Education', 'labe...","[{'text': 'Virginia', 'label': 'GPE', 'start':...","[{'text': 'Tuesday', 'label': 'DATE', 'start':...","[{'text': 'a record $14 million', 'label': 'MO...",[],[],[],[]


# Visualisations

cluster_visualise_scatter(wikileaks, 6, 'Text', 'wikileaks')
cluster_visualise_scatter(news_excerpts, 6, 'Text', 'news')

In [16]:
visualize_wordcloud(wikileaks, 6, 'Text', 'wikileaks')
visualize_wordcloud(news_excerpts, 6, 'Text', 'news')



NameError: name 'vectorizer1' is not defined

# Network Visualizations

In [None]:
import networkx as nx

def build_entity_graph(excerpt_entities):
    G = nx.Graph()

    for i, entities in enumerate(excerpt_entities):
        for entity in entities:
            #Add node w metadata
            G.add_node((entity['text'],
                       entity['label'],
                       i))
            
        #connect entities within the same excerpt
        for other_entity in entities:
            if entity != other_entity:
                G.add_edge(entity['text'], other_entity['text'], weight=1)
                
    return G

def build_relationship_dict(excerpt_entities):
    count_dict = {}
    
    for i, entities in enumerate(excerpt_entities):
        for entity in entities   
            for other_entity in entities:
                if entity['text'] != other_entity['text']:
                    entity_pair = tuple(sorted((entity['text'], other_entity['text'])))
                    occ_count = count_dict.get(entity_pair, 0)
                    occ_count += 1
                    count_dict.update({entity_pair: occ_count})
    return count_dict


    


In [None]:
#create relationship_dict and sort it
from collections import OrderedDict
import numpy as np

relationship_dict = build_relationship_dict(wikileaks['Labels'])

# keys = list(relationship_dict.keys())
# values = list(relationship_dict.values())
# values.sort(reverse=True)
# sorted_dict = {keys[i]: values[i] for i in values}

In [None]:
#get unique terms
unique_terms = []
for i in wikileaks['Labels']:
    for j in i:
        unique_terms.append(j['text'])
unique_terms = set(unique_terms)
print(unique_terms)
            

In [None]:
import networkx as nx

def build_relationship_graph(weighted_dict, key_set):
    G = nx.Graph()
    # create nodes
    for key in key_set:
        G.add_node(key)

    #create weighted edges
    #connect entities within the same excerpt
    for key in weighted_dict.keys():
            G.add_edge(key[0], key[1], weight=weighted_dict.get(key))
    return G

relationship_graph = build_relationship_graph(relationship_dict, unique_terms)


In [None]:
#visualise graph
import matplotlib.pyplot as plt
nx.draw(relationship_graph, with_labels=True)
plt.show()
print("done")

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import Dataloader, Dataset