In [3]:
from sentence_transformers import SentenceTransformer, util
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix
import spacy
import spacy_universal_sentence_encoder

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import check_random_state
from sklearn.metrics.pairwise import distance_metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from itertools import combinations

USE_model = spacy_universal_sentence_encoder.load_model('en_use_lg')
Word2Vec_model = spacy.load('en_core_web_lg')
BERT_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [428]:
# custom dataset
def getCSV():
    df = pd.read_csv('custom_dataset.csv', encoding = "mac-roman")
    return df

# SentenceBERT
def getSentenceBERT(text):
    embeddings = BERT_model.encode(text, show_progress_bar=True, convert_to_numpy=True)
    return embeddings

# Clustering Models
def getClustersByThreshold(embeddings, linkageMethod, linkageMetric, threshold):
    clusters = linkage(embeddings, method = linkageMethod, metric = linkageMetric)
    flatClusters = fcluster(clusters, t=threshold, criterion='distance')
    return flatClusters

def evaluation(predictTags, tags, evalMetric):
    print(str(len(set(predictTags))) + " Clusters found.")

    result = {}
    result['Adjusted Rand Score'] = metrics.adjusted_rand_score(predictTags, tags)
    result['Homogenity'] = metrics.homogeneity_completeness_v_measure(predictTags, tags)[0]
    result['Completeness'] = metrics.homogeneity_completeness_v_measure(predictTags, tags)[1]
    result['V Measure'] = metrics.homogeneity_completeness_v_measure(predictTags, tags)[2]
    result_df = pd.DataFrame(data=result, index=[0])
    display(result_df)
    
    c_matrix = contingency_matrix(tags, predictTags)
    print("vertical length: " + str(len(c_matrix)))
    print("horizontal length: "+ str(len(c_matrix[0])))
    print(c_matrix)
    
    return c_matrix, result_df

# Code to convert result for frontend

In [201]:
# Generate json

def getJSON(source, date, article, url):
    if len(date) == 0:
        return {"clusters": [], "total number of articles": 0, "total number of clusters": 0}
    elif len(date) == 1:
        prediction = [1]
    else:
        embeddings = getSentenceBERT(article)
        prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

    result = {}
    for i in range(len(prediction)):
        target = prediction[i]
        if target in result:
            result[target]['total_number_of_articles'] += 1
            result[target]['sources'].append(source[i])
            result[target]['dates'].append(date[i])
            result[target]['articles'].append(article[i])
            result[target]['urls'].append(url[i])
        else:
            result[target] = {"cluster": target,
                                    "total_number_of_articles": 1,
                                    "keywords": [],
                                    "sources": [source[i]],
                                    "dates": [date[i]],
                                    "articles": [article[i]],
                                    "urls": [url[i]]}

    json = {"clusters": [], "total number of articles": len(prediction), "total number of clusters": max(prediction)}
    for key, value in result.items():
        json['clusters'].append(value)
    json['clusters'] = sorted(json['clusters'], key=lambda x: x['cluster'])
    
    return json

def parseDate(string):
    date = string.split('-')
    year = int(date[0])
    month = int(date[1])
    day = int(date[2])
    return datetime.datetime(year, month, day)


def getAll():
    df = getCSV()
    json = getJSON(list(df['Source']), list(df['Date']), list(df['Title']), list(df['URL']))
    json['cluster_type'] = "all"
    json['date'] = "2021-01-05"
    json['week'] = None
    json['month'] = None
    
    return json
    
def getByMonth(month):
    df = getCSV()
    source = list(df['Source'])
    date = list(df['Date'])
    title = list(df['Title'])
    url = list(df['URL'])
    
    target_source = []
    target_date = []
    target_title = []
    target_url = []
    for i in range(len(date)):
        if int(date[i].split('-')[1]) == month:
            target_source.append(source[i])
            target_date.append(date[i])
            target_title.append(title[i])
            target_url.append(url[i])

    json = getJSON(target_source, target_date, target_title, target_url)
    json['cluster_type'] = "month"
    startDate = parseDate("2021-" + str(month) + "-01")
    json['date'] = startDate.strftime("%Y") + "-" + startDate.strftime("%m") + "-" + startDate.strftime("%d")
    json['week'] = None
    json['month'] = startDate.strftime("%b") + " " + startDate.strftime("%Y")
    
    return json
        
def getByWeek(s):
    startingDay = parseDate(s)
    endingDay = startingDay + datetime.timedelta(days=6)
    
    df = getCSV()
    source = list(df['Source'])
    date = list(df['Date'])
    title = list(df['Title'])
    url = list(df['URL'])
        
    target_source = []
    target_date = []
    target_title = []
    target_url = []
    for i in range(len(date)):
        targetDay = parseDate(date[i])
        if startingDay <= targetDay <= endingDay:
            target_source.append(source[i])
            target_date.append(date[i])
            target_title.append(title[i])
            target_url.append(url[i])
    
    json = getJSON(target_source, target_date, target_title, target_url)
    json['cluster_type'] = "week"
    json['date'] = s
    weekRange = startingDay.strftime("%Y") + "-" + startingDay.strftime("%m") + "-" + startingDay.strftime("%d")
    weekRange += " - "
    weekRange += endingDay.strftime("%Y") + "-" + endingDay.strftime("%m") + "-" + endingDay.strftime("%d")
    json['week'] = weekRange
    json['month'] = None
    
    return json

def getByDay(day):
    df = getCSV()
    source = list(df['Source'])
    date = list(df['Date'])
    title = list(df['Title'])
    url = list(df['URL'])
        
    target_source = []
    target_date = []
    target_title = []
    target_url = []
    for i in range(len(date)):
        if day == date[i]:
            target_source.append(source[i])
            target_date.append(date[i])
            target_title.append(title[i])
            target_url.append(url[i])
    
    json = getJSON(target_source, target_date, target_title, target_url)
    json['cluster_type'] = "day"
    json['date'] = day
    json['week'] = None
    json['month'] = None
    
    return json




In [209]:
import json
from numpyencoder import NumpyEncoder

months = {"January": 1,
         "February": 2,
         "March": 3}
weeks = ['2021-01-03',
        '2021-01-10',
        '2021-01-17',
        '2021-01-24',
        '2021-01-31',
        '2021-02-07',
        '2021-02-14',
        '2021-02-21',
        '2021-02-28',
        '2021-03-07',
         '2021-03-14',
         '2021-03-21',
         '2021-03-28'
        ]

output = []

#output all
out = getAll()
output.append(out)
print("Finished " + targetFile)


#output by month
byMonth = "byMonth/"
for key, value in months.items():
    out = getByMonth(value)
    output.append(out)
    

#output by week
byWeek = "byWeek/"
for startingWeek in weeks:
    out = getByWeek(startingWeek)
    output.append(out)
        
#output by day
byDay = "byDay/"
start = datetime.datetime(2021, 1, 1)
end = datetime.datetime(2021, 4, 1)
while start < end:
    out = getByDay(start.strftime("%Y") + "-" + start.strftime("%m") + "-" + start.strftime("%d"))
    output.append(out)
    start += datetime.timedelta(days=1)
    

with open("outputAsJson.json", 'w') as outfile:
    json.dump(output, outfile, cls=NumpyEncoder)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Finished text.json


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

# Named Entity Recognition

PERSON People, including fictional

NORP Nationalities or religious or political groups

FACILITY Buildings, airports, highways, bridges, etc.

ORGANIZATION Companies, agencies, institutions, etc.

GPE Countries, cities, states

LOCATION Non-GPE locations, mountain ranges, bodies of water

PRODUCT Vehicles, weapons, foods, etc. (Not services)

EVENT Named hurricanes, battles, wars, sports events, etc.

WORK OF ART Titles of books, songs, etc.

LAW Named documents made into laws 

LANGUAGE Any named language

DATE Absolute or relative dates or periods

TIME Times smaller than a day

PERCENT Percentage (including “%”)

MONEY Monetary values, including unit

QUANTITY Measurements, as of weight or distance

ORDINAL “first”, “second”

CARDINAL Numerals that do not fall under another typ


In [538]:
# sentence = "Amazon Needs Electric Vehicles, Too"
sentence = "Shift to Electric Vehicles Spurs Bid to Make More Batteries in U.S."

doc = Word2Vec_model(sentence)

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Electric Vehicles Spurs Bid to Make More Batteries 9 59 ORG
U.S. 63 67 GPE


In [539]:
for token in doc:
    print(token.text, token.lemma_.lower(), token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Shift shift NOUN NN nsubj Xxxxx True False
to to ADP IN prep xx True True
Electric electric PROPN NNP compound Xxxxx True False
Vehicles vehicle NOUN NNS pobj Xxxxx True False
Spurs spur VERB VBZ ROOT Xxxxx True False
Bid bid PROPN NNP dobj Xxx True False
to to PART TO aux xx True True
Make make VERB VB advcl Xxxx True True
More more ADJ JJR amod Xxxx True True
Batteries battery NOUN NNS dobj Xxxxx True False
in in ADP IN prep xx True True
U.S. u.s. PROPN NNP pobj X.X. False False


In [336]:
NER = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "EVENT"]

def getTopNouns(article):
    dic = {}
    for i in range(len(article)):
        doc = Word2Vec_model(article[i])
        for token in doc:
            if token.pos_ == "PROPN" or token.pos_ == "NOUN":
                word = " ".join([t.lemma_ for t in Word2Vec_model(token.text)])
                if word in dic:
                    dic[word] += 1
                else:
                    dic[word] = 1
    dic = sorted(list(dic.items()), key = lambda x: x[1], reverse = True)
    return dic[:10]
    
def getTopNER(article):
    dic = {}
    for i in range(len(article)):
        doc = Word2Vec_model(article[i])
        for token in doc.ents:
            if token.label_ in NER:
                if token.text in dic:
                    dic[token.text] += 1
                else:
                    dic[token.text] = 1
    dic = sorted(list(dic.items()), key = lambda x: x[1], reverse = True)
    return dic[:10]

def getKeyword(source, date, article, url):
    embeddings = getSentenceBERT(article)
    prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

    result = {}
    for i in range(len(prediction)):
        target = prediction[i]
        if target in result:
            result[target]['total_number_of_articles'] += 1
            result[target]['sources'].append(source[i])
            result[target]['dates'].append(date[i])
            result[target]['articles'].append(article[i])
            result[target]['urls'].append(url[i])
        else:
            result[target] = {
                            "total_number_of_articles": 1,
                            "nounKeywords": [],
                            "nerKeywords": [],
                            "sources": [source[i]],
                            "dates": [date[i]],
                            "articles": [article[i]],
                            "urls": [url[i]]}

    json = {"clusters": [], "total number of articles": len(prediction), "total number of clusters": max(prediction)}
    for key, value in result.items():
        value['nounKeywords'] = getTopNouns(value['articles'])
        value['nerKeywords'] = getTopNER(value['articles'])
        json['clusters'].append(value)
    json['clusters'] = sorted(json['clusters'], key=lambda x: x['total_number_of_articles'], reverse = True)
    for i in range(len(json['clusters'])):
        json['clusters'][i]['cluster'] = i + 1
    
    return json

df = getCSV()
json = getKeyword(df['Source'], df['Date'], df['Title'], df['URL'])
for i in range(len(json['clusters'])):
    cluster = json['clusters'][i]
    print("Cluster " + str(cluster['cluster']))
    print("Number of articls: " + str(cluster['total_number_of_articles']))
    print("\n")
    print("Retrieved Nouns:")
    print(cluster['nounKeywords'])
    print("Retrieved NER:")
    print(cluster['nerKeywords'])
    print("\n")

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Cluster 1
Number of articls: 43


Retrieved Nouns:
[('China', 43), ('U.S.', 30), ('Biden', 6), ('trade', 5), ('Hong', 5), ('Kong', 5), ('US', 3), ('move', 3), ('Xinjiang', 3), ('charge', 3)]
Retrieved NER:
[('China', 35), ('U.S.', 23), ('Hong Kong', 5), ('Biden', 4), ('Xinjiang', 3), ('US', 2), ('Beijing', 2), ('Taiwan', 2), ('Grand Chip Ambitions', 1), ('AI', 1)]


Cluster 2
Number of articls: 31


Retrieved Nouns:
[('Hong', 30), ('Kong', 30), ('vaccine', 6), ('quarantine', 5), ('stock', 3), ('leader', 3), ('people', 3), ('China', 3), ('time', 2), ('jab', 2)]
Retrieved NER:
[('Hong Kong', 26), ('China', 3), ('UK', 2), ('Japan', 1), ('Macau', 1), ('Sino-U.S.', 1), ('Hong Kong Vaccine No-Show Rate', 1), ('Hong Kong Stamp Duty Hike', 1), ('Shocks Traders', 1), ('U.S.', 1)]


Cluster 3
Number of articls: 26


Retrieved Nouns:
[('EU', 18), ('UK', 13), ('trade', 7), ('Brexit', 6), ('Frost', 5), ('relation', 4), ('London', 4), ('export', 3), ('business', 3), ('ban', 3)]
Retrieved NER:
[('EU'

# Merging clusters

# Result from BERT -> 61 clusters


[ 0  2  0  0  1  1  0  1  4  1  4  3  2 14 26  0  4 15 10  2  2  6  3  2 13  1  9  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  0  0 0  0  0  0  0  1  0  1  0  1  0  1  2]


[20 12 17  3 13  4  4  0  0  0  0  0  2  0  0  0  0  0  0  0  1  1  0  0 0  0  0  1  0  1  0  0  0  0  0  0  0  0  0  0  2  5  2  1  0  7  6  0 0  0  0  0  0  0  0  0  0  0  1  0  0]

[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  3  1  0  3 11  3 18 18 15  5 15  0  4  1  0  0  2  1  0]

[ 0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  2  0  0  0  1  0  0  1  0 0  1  0  2  3 14  0  2 24 20  3 30  0  0  0  0  0  0  0  0  0  0  0  0 0  0  1  0  0  1  0  0  1  2  1  1  1]

[ 1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1  0  0  0 0  0  0  0  0  0 11 40  0  0  1  1 10  9 11 15  0  0  1  5  2  0  0  0 0  0  0  0  0  0  0  0  0  0  1  0  0]



![image.png](attachment:image.png)

NSTM: Real-Time Query-Driven News Overview Composition at Bloomberg

# Trail 1: Concatenate all articles in a cluster

In [507]:
df = getCSV()
artilce= df['Title']
source = df['Source']
url  = df['URL']
date = df['Date']
tag = getCSV()['Tag']

embeddings = getSentenceBERT(article)
o_prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

result = {}
for i in range(len(o_prediction)):
    if o_prediction[i] in result:
        result[o_prediction[i]].append(article[i])
    else:
        result[o_prediction[i]] = [article[i]]

sentence = []
result = sorted(list(result.items()), key = lambda x : x[0])
for key, value in result:
    print(key)
    sentence.append('. '.join(value))

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [508]:
print(len(sentence))
print(sentence)

61


In [509]:
embeddings = getSentenceBERT(sentence)
prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [510]:
print(len(prediction))
print(max(prediction))
print(prediction)


61
14
[9, 9, 9, 8, 9, 8, 13, 14, 5, 2, 1, 1, 5, 6, 6, 10, 10, 6, 6, 6, 12, 6, 6, 5, 6, 6, 6, 7, 6, 6, 13, 13, 6, 6, 13, 6, 13, 13, 12, 13, 9, 9, 9, 13, 12, 9, 9, 11, 11, 11, 11, 11, 11, 3, 3, 6, 4, 6, 6, 7, 2]


In [511]:
for i in range(len(o_prediction)):
    o_prediction[i] = prediction[o_prediction[i] - 1]
    
matrix_concat, df_concat = evaluation(o_prediction, tag, "cosine")

14 Clusters found.


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.506453,0.549964,0.609561,0.578231


vertical length: 5
horizontal length: 14
[[  7   3   1   0   8 101   1   1   3   4   0   4   1   1]
 [  0   0   0   0   2   3   1   7  84   0   0   1   5   0]
 [  0   0   4   0   0   3   1   0  17   0  74   0   2   0]
 [  0   2   1   1   0  97   3   0   0   2   1   0   5   0]
 [  0   0   0   0   0   2   0   0   2   0   0  14  91   0]]


# Trail 2: Average of BERT

In [512]:
df = getCSV()
artilce= df['Title']
source = df['Source']
url  = df['URL']
date = df['Date']
tag = getCSV()['Tag']

embeddings = getSentenceBERT(article)
o_prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

result = {}
for i in range(len(o_prediction)):
    if o_prediction[i] in result:
        result[o_prediction[i]].append(embeddings[i])
    else:
        result[o_prediction[i]] = [embeddings[i]]

sentence = []
result = sorted(list(result.items()), key = lambda x : x[0])
for key, value in result:
    print(key)
    sentence.append(np.mean(value, axis = 0))

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [513]:
print(len(sentence))
print(sentence)

61
[array([ 1.39479369e-01,  4.69210565e-01,  2.52628699e-02, -6.75766394e-02,
       -1.22344062e-01,  7.44258538e-02, -1.01781696e-01, -1.45941228e-01,
        1.94254190e-01, -1.29281238e-01,  3.97098184e-01,  3.16947132e-01,
        7.17601180e-02,  1.09412400e-02, -1.50706368e-02,  5.03188185e-02,
       -2.63015628e-02,  1.14859797e-01, -8.28527585e-02, -7.38765597e-02,
        4.05904874e-02,  8.88936780e-03,  2.94293493e-01,  1.41793653e-01,
        4.89978231e-02, -1.14497952e-02,  4.88141142e-02,  6.70599863e-02,
       -8.38216394e-02,  4.71614972e-02,  8.61802772e-02, -1.40210271e-01,
        2.26848006e-01,  2.43569642e-01,  1.78511925e-02,  1.85172960e-01,
        2.92999297e-02, -5.82360849e-03, -1.97943181e-01, -1.13705657e-01,
        1.32409573e-01,  2.00368717e-01,  1.69138759e-02,  7.55948126e-02,
        1.66865930e-01, -1.25584662e-01, -1.08448371e-01,  6.85020834e-02,
       -7.57340044e-02, -1.45468891e-01, -5.90794608e-02, -6.30668998e-02,
        7.83176720e-0

In [514]:
prediction = getClustersByThreshold(sentence, "complete","cosine", 0.86).tolist()

In [515]:
print(len(prediction))
print(max(prediction))
print(prediction)


61
10
[9, 9, 9, 9, 9, 8, 9, 4, 7, 1, 7, 1, 7, 6, 6, 3, 7, 6, 6, 6, 6, 7, 10, 7, 6, 10, 6, 8, 6, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 5, 10, 6, 5, 5, 5, 5, 5, 5, 5, 5, 3, 5, 6, 2, 10, 10, 8, 1]


In [516]:
for i in range(len(o_prediction)):
    o_prediction[i] = prediction[o_prediction[i] - 1]
    
matrix_average, df_average = evaluation(o_prediction, tag, "cosine")

10 Clusters found.


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.537448,0.609783,0.613461,0.611616


vertical length: 5
horizontal length: 10
[[  6   0   1   1   0  94  22   2   3   6]
 [  0   0   0   0  15   1   3   5  76   3]
 [  0   0   0   0  95   1   0   1   0   4]
 [  2   1   3   0   1   4   0   3   0  98]
 [  0   0   0   0   1   3   0   0   1 104]]


# Trail 3: Use of retrieved Nouns

In [517]:
df = getCSV()
artilce= df['Title']
source = df['Source']
url  = df['URL']
date = df['Date']
tag = getCSV()['Tag']

embeddings = getSentenceBERT(article)
o_prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

result = {}
for i in range(len(o_prediction)):
    target = o_prediction[i]
    if target in result:
        result[target]['total_number_of_articles'] += 1
        result[target]['sources'].append(source[i])
        result[target]['dates'].append(date[i])
        result[target]['articles'].append(article[i])
        result[target]['urls'].append(url[i])
    else:
        result[target] = {
                        "total_number_of_articles": 1,
                        "nounKeywords": [],
                        "nerKeywords": [],
                        "sources": [source[i]],
                        "dates": [date[i]],
                        "articles": [article[i]],
                        "urls": [url[i]]}

sentences = []
result = sorted(list(result.items()), key = lambda x : x[0])
for key, value in result:
    print(key)
    value['nounKeywords'] = getTopNouns(value['articles'])
    value['nerKeywords'] = getTopNER(value['articles'])
    sentences.append(" ".join(map(lambda x: x[0], value['nounKeywords'])))



Batches:   0%|          | 0/18 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61


In [518]:
sentences

['bitcoin cryptocurrency price US Coinbase boom btc credibility record Dogecoin',
 'crypto cryptocurrencie bitcoin cryptocurrency new york risk Beeple NFT bank',
 'bitcoin money stone Yap Citi report market value Tether Bitfinex',
 'Robinhood collector money slice blockchain basketball action user month cryptocurrency',
 'cryptocurrency bitcoin cryptocurrencie China SEC analysis concern UK competition watchdog',
 'ripple SEC labour Starmer fix Johnson story Coinbase offer doc',
 'currency realm China Coinbase bitcoin market state defence blockchain control',
 'Aviva Blanc',
 'Euronext Brexit job Sweden flag trade cause EU race european',
 'task democracy house order patriot Hong Kong country system',
 'independence Scotland movement letter drive flash pan split campaign moment',
 'politic equivalence tussle UK lesson Brexit power level agenda',
 'WTO nft vaccine dispute role Factbox gaming crypto artist court',
 'Brexit EU UK datum Brussels Ireland protocol approach northern decision',

In [519]:
embeddings = getSentenceBERT(sentences)
prediction = getClustersByThreshold(embeddings, "complete","cosine", 0.86).tolist()

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

In [520]:
print(len(prediction))
print(max(prediction))
print(prediction)

61
10
[7, 7, 7, 9, 7, 1, 7, 10, 1, 3, 4, 4, 9, 4, 4, 3, 4, 4, 4, 3, 2, 3, 1, 1, 4, 3, 4, 1, 3, 3, 8, 8, 3, 3, 3, 3, 3, 8, 8, 8, 7, 7, 8, 8, 8, 1, 1, 6, 6, 6, 6, 6, 2, 3, 2, 9, 5, 1, 9, 1, 3]


In [521]:
for i in range(len(o_prediction)):
    o_prediction[i] = prediction[o_prediction[i] - 1]
    
matrix_noun, df_noun = evaluation(o_prediction, tag, "cosine")

10 Clusters found.


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.613654,0.577791,0.697097,0.631861


vertical length: 5
horizontal length: 10
[[ 12   2  13  98   0   0   3   3   3   1]
 [ 18   1   2   0   0   0  73   3   6   0]
 [ 15  19   0   0   0  59   0   5   3   0]
 [  6   0 101   0   1   1   0   2   1   0]
 [  0   1  12   0   0   0   1  94   1   0]]


In [524]:
print("Concat:")
display(df_concat)
print("Average:")
display(df_average)
print("Noun:")
display(df_noun)


print("Concat:")
print(matrix_concat)
print("Average:")
print(matrix_average)
print("Noun:")
print(matrix_noun)

Concat:


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.506453,0.549964,0.609561,0.578231


Average:


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.537448,0.609783,0.613461,0.611616


Noun:


Unnamed: 0,Adjusted Rand Score,Homogenity,Completeness,V Measure
0,0.613654,0.577791,0.697097,0.631861


Concat:
[[  7   3   1   0   8 101   1   1   3   4   0   4   1   1]
 [  0   0   0   0   2   3   1   7  84   0   0   1   5   0]
 [  0   0   4   0   0   3   1   0  17   0  74   0   2   0]
 [  0   2   1   1   0  97   3   0   0   2   1   0   5   0]
 [  0   0   0   0   0   2   0   0   2   0   0  14  91   0]]
Average:
[[  6   0   1   1   0  94  22   2   3   6]
 [  0   0   0   0  15   1   3   5  76   3]
 [  0   0   0   0  95   1   0   1   0   4]
 [  2   1   3   0   1   4   0   3   0  98]
 [  0   0   0   0   1   3   0   0   1 104]]
Noun:
[[ 12   2  13  98   0   0   3   3   3   1]
 [ 18   1   2   0   0   0  73   3   6   0]
 [ 15  19   0   0   0  59   0   5   3   0]
 [  6   0 101   0   1   1   0   2   1   0]
 [  0   1  12   0   0   0   1  94   1   0]]


# Limitation of Noun:

# 1. Sequence Matters

[(China, 30), (Chartered, 20), (Financial, 20), (Analyst, 20)]

In [536]:
a = ["Chartered Financial Analyst have recently link to a scandal",
    "Is Chartered Financial Analyst a must take for analysts?"]
getTopNouns(a)

[('analyst', 3), ('charter', 2), ('financial', 2), ('scandal', 1)]

# 2. Weight

[(United, 20), (States, 20), (China, 20), (Arabia, 1), (oil, 1)]

target -> US-China Cluster

ended up -> Arabia/Oil Cluster


In [544]:
dateresult  = ""
year, month, day = dateresult.split("-")

ValueError: not enough values to unpack (expected 3, got 1)

In [543]:
day

'23'