In [12]:
from sentence_transformers import SentenceTransformer, util
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.metrics.cluster import contingency_matrix
import spacy
import spacy_universal_sentence_encoder
import datetime

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import check_random_state
from sklearn.metrics.pairwise import distance_metrics
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from itertools import combinations

# Models are loaded once at module level so the helper functions below can share them.
# USE_model is not referenced by any code visible in this notebook section.
USE_model = spacy_universal_sentence_encoder.load_model('en_use_lg')
# spaCy pipeline used by getTopNouns / getTopNER (POS tags, lemmas and named entities).
Word2Vec_model = spacy.load('en_core_web_lg')
# Sentence encoder used by getSentenceBERT to embed article text.
BERT_model = SentenceTransformer('paraphrase-distilroberta-base-v1')

In [7]:
# custom dataset
def getCSV():
    """Read ``custom_dataset.csv`` from the working directory.

    Returns:
        pandas.DataFrame: the parsed file, decoded with the ``mac-roman`` encoding.
    """
    return pd.read_csv('custom_dataset.csv', encoding="mac-roman")

# SentenceBERT
def getSentenceBERT(text):
    """Embed ``text`` with the module-level ``BERT_model``.

    Args:
        text: input forwarded unchanged to ``BERT_model.encode``.

    Returns:
        The value returned by ``BERT_model.encode`` (requested as numpy output,
        with the progress bar enabled).
    """
    return BERT_model.encode(
        text,
        show_progress_bar=True,
        convert_to_numpy=True,
    )

# Clustering Models
def getClustersByThreshold(embeddings, linkageMethod, linkageMetric, threshold):
    """Cluster ``embeddings`` hierarchically and cut the tree at a distance threshold.

    Args:
        embeddings: observations passed to ``scipy.cluster.hierarchy.linkage``.
        linkageMethod: linkage ``method`` argument (e.g. ``"complete"``).
        linkageMetric: linkage ``metric`` argument (e.g. ``"cosine"``).
        threshold: cut-off ``t`` handed to ``fcluster`` with ``criterion='distance'``.

    Returns:
        The flat cluster assignment produced by ``fcluster``, one label per observation.
    """
    linkageMatrix = linkage(embeddings, method=linkageMethod, metric=linkageMetric)
    return fcluster(linkageMatrix, t=threshold, criterion='distance')

def evaluation(predictTags, tags, evalMetric):
    """Score a predicted clustering against reference labels and display the results.

    The scikit-learn metrics used here take ``(labels_true, labels_pred)``.
    ``tags`` is treated as the reference labelling, matching the
    ``contingency_matrix(tags, predictTags)`` call below. Passing the two in the
    opposite order would swap the reported homogeneity and completeness values,
    so the metrics are called with ``tags`` first.

    Args:
        predictTags: cluster label predicted for each sample.
        tags: reference label for each sample.
        evalMetric: unused; kept so existing callers keep working.

    Returns:
        tuple: ``(c_matrix, result_df)`` where ``c_matrix`` is the contingency
        matrix (rows follow ``tags``, columns follow ``predictTags``) and
        ``result_df`` is a one-row DataFrame holding the scores.
    """
    print(str(len(set(predictTags))) + " Clusters found.")

    # One call yields all three values; no need to recompute it per column.
    homogeneity, completeness, vMeasure = metrics.homogeneity_completeness_v_measure(tags, predictTags)

    result = {}
    result['Adjusted Rand Score'] = metrics.adjusted_rand_score(tags, predictTags)
    # The key spelling is kept as-is because downstream code may select this column by name.
    result['Homogenity'] = homogeneity
    result['Completeness'] = completeness
    result['V Measure'] = vMeasure
    result_df = pd.DataFrame(data=result, index=[0])
    display(result_df)

    c_matrix = contingency_matrix(tags, predictTags)
    print("vertical length: " + str(len(c_matrix)))
    print("horizontal length: "+ str(len(c_matrix[0])))
    print(c_matrix)

    return c_matrix, result_df

# Code to convert result for frontend

In [23]:
# Generate json
# Entity labels that getTopNER counts; entities carrying any other label are skipped.
NER = ["PERSON", "NORP", "FAC", "ORG", "GPE", "LOC", "EVENT"]

def getTopNouns(article):
    """Return the ten most frequent noun lemmas across a collection of texts.

    Each text is parsed with ``Word2Vec_model``. Every token tagged ``PROPN`` or
    ``NOUN`` is parsed again on its own, and the lemmas of that second parse
    (joined with spaces) form the key that is counted.

    Args:
        article: sequence of text strings.

    Returns:
        list: up to ten ``(lemma, count)`` tuples, highest count first; ties keep
        first-seen order.
    """
    counts = {}
    for text in article:
        for token in Word2Vec_model(text):
            if token.pos_ not in ("PROPN", "NOUN"):
                continue
            lemma = " ".join([piece.lemma_ for piece in Word2Vec_model(token.text)])
            counts[lemma] = counts.get(lemma, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]

def getTopNER(article):
    """Return the ten most frequent named entities across a collection of texts.

    Each text is parsed with ``Word2Vec_model``; only entities whose label is
    listed in the module-level ``NER`` are counted, keyed by their surface text.

    Args:
        article: sequence of text strings.

    Returns:
        list: up to ten ``(entity_text, count)`` tuples, highest count first; ties
        keep first-seen order.
    """
    counts = {}
    for text in article:
        for entity in Word2Vec_model(text).ents:
            if entity.label_ not in NER:
                continue
            counts[entity.text] = counts.get(entity.text, 0) + 1
    ranked = sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:10]


def getPrediction(article, threshold=0.86, linkageMethod="complete", linkageMetric="cosine"):
    """Assign a cluster label to every text using two clustering passes.

    Pass one embeds the texts with ``getSentenceBERT`` and clusters them with
    ``getClustersByThreshold``. Pass two averages the embeddings of each
    first-pass cluster and clusters those centroids with the same settings, so
    first-pass clusters whose centroids fall under the threshold are merged.

    Args:
        article: sequence of text strings.
        threshold: distance cut-off used in both passes (default 0.86).
        linkageMethod: linkage method used in both passes (default ``"complete"``).
        linkageMetric: distance metric used in both passes (default ``"cosine"``).

    Returns:
        list: one cluster label per input text. Empty input returns ``[]`` and a
        single text returns ``[1]``.
    """
    # A linkage cannot be built from fewer than two observations, so the
    # trivial inputs are answered directly instead of reaching scipy.
    if len(article) == 0:
        return []
    if len(article) == 1:
        return [1]

    firstEmbeddings = getSentenceBERT(article)
    firstPrediction = getClustersByThreshold(
        firstEmbeddings, linkageMethod, linkageMetric, threshold
    ).tolist()

    # Collect the member embeddings of each first-pass cluster.
    members = {}
    for label, embedding in zip(firstPrediction, firstEmbeddings):
        members.setdefault(label, []).append(embedding)

    # Centroids are ordered by ascending first-pass label.
    orderedLabels = sorted(members)
    secondEmbeddings = [np.mean(members[label], axis=0) for label in orderedLabels]

    if len(secondEmbeddings) == 1:
        return firstPrediction

    secondPrediction = getClustersByThreshold(
        secondEmbeddings, linkageMethod, linkageMetric, threshold
    ).tolist()

    # Explicit first-pass label -> merged label lookup; this does not depend on
    # first-pass labels being exactly 1..k.
    mergedLabel = dict(zip(orderedLabels, secondPrediction))
    return [mergedLabel[label] for label in firstPrediction]

def getJSON(source, date, article, url):
    """Cluster the given articles and build the payload dictionary.

    Args:
        source: per-article source names.
        date: per-article dates; its length decides the empty / single-article
            shortcuts.
        article: per-article texts that are clustered and mined for keywords.
        url: per-article URLs.

    Returns:
        dict: ``clusters`` (largest first, each numbered from 1 in its
        ``cluster`` field), ``total_number_of_articles`` and
        ``total_number_of_clusters``.
    """
    articleCount = len(date)
    if articleCount == 0:
        return {"clusters": [], "total_number_of_articles": 0, "total_number_of_clusters": 0}

    # A single article forms its own cluster without running the clustering step.
    prediction = [1] if articleCount == 1 else getPrediction(article)

    grouped = {}
    for idx, label in enumerate(prediction):
        if label not in grouped:
            grouped[label] = {
                "total_number_of_articles": 1,
                "keywords": [],
                "sources": [source[idx]],
                "dates": [date[idx]],
                "articles": [article[idx]],
                "urls": [url[idx]],
            }
            continue
        bucket = grouped[label]
        bucket['total_number_of_articles'] += 1
        bucket['sources'].append(source[idx])
        bucket['dates'].append(date[idx])
        bucket['articles'].append(article[idx])
        bucket['urls'].append(url[idx])

    payload = {
        "clusters": [],
        "total_number_of_articles": len(prediction),
        "total_number_of_clusters": max(prediction),
    }
    for bucket in grouped.values():
        bucket['nounKeywords'] = getTopNouns(bucket['articles'])
        bucket['nerKeywords'] = getTopNER(bucket['articles'])
        payload['clusters'].append(bucket)

    # Stable sort: clusters of equal size stay in first-seen order.
    payload['clusters'] = sorted(
        payload['clusters'],
        key=lambda entry: entry['total_number_of_articles'],
        reverse=True,
    )
    for rank, bucket in enumerate(payload['clusters'], start=1):
        bucket['cluster'] = rank

    return payload

def parseDate(string):
    """Convert a ``YYYY-MM-DD`` style string into a ``datetime.datetime``.

    The string is split on ``-`` and the first three pieces are read as year,
    month and day; any further pieces are ignored.

    Args:
        string: dash-separated date text.

    Returns:
        datetime.datetime: midnight on the given date.
    """
    pieces = string.split('-')
    return datetime.datetime(int(pieces[0]), int(pieces[1]), int(pieces[2]))


def getAll():
    """Build the payload covering every row of the dataset.

    Returns:
        dict: the ``getJSON`` result for all rows, tagged with
        ``cluster_type`` ``"all"``, the fixed date ``"2021-01-05"`` and
        ``None`` for ``week`` and ``month``.
    """
    frame = getCSV()
    columns = [list(frame[name]) for name in ('Source', 'Date', 'Title', 'URL')]
    payload = getJSON(*columns)
    payload.update({
        'cluster_type': "all",
        'date': "2021-01-05",
        'week': None,
        'month': None,
    })
    return payload
    
def getByMonth(month):
    """Build the payload for all rows whose date falls in the given month number.

    Rows are selected by comparing the second dash-separated field of ``Date``
    with ``month``; the year of the row is not examined. The payload's labels
    are always generated for the year 2021.

    Args:
        month: month number (1-12).

    Returns:
        dict: the ``getJSON`` result for the selected rows, tagged with
        ``cluster_type`` ``"month"``, the first day of the month as ``date``,
        ``None`` for ``week`` and e.g. ``"Jan 2021"`` for ``month``.
    """
    frame = getCSV()
    rows = zip(list(frame['Source']), list(frame['Date']), list(frame['Title']), list(frame['URL']))
    picked = [row for row in rows if int(row[1].split('-')[1]) == month]

    payload = getJSON(
        [row[0] for row in picked],
        [row[1] for row in picked],
        [row[2] for row in picked],
        [row[3] for row in picked],
    )
    payload['cluster_type'] = "month"
    firstOfMonth = parseDate("2021-" + str(month) + "-01")
    payload['date'] = firstOfMonth.strftime("%Y-%m-%d")
    payload['week'] = None
    payload['month'] = firstOfMonth.strftime("%b %Y")

    return payload
        
def getByWeek(s):
    """Build the payload for the seven-day window starting at ``s``.

    Args:
        s: first day of the window as a ``YYYY-MM-DD`` string.

    Returns:
        dict: the ``getJSON`` result for rows dated from ``s`` through six days
        later (both inclusive), tagged with ``cluster_type`` ``"week"``, ``s``
        as ``date``, a ``"<start> - <end>"`` string as ``week`` and ``None``
        for ``month``.
    """
    windowStart = parseDate(s)
    windowEnd = windowStart + datetime.timedelta(days=6)

    frame = getCSV()
    rows = zip(list(frame['Source']), list(frame['Date']), list(frame['Title']), list(frame['URL']))
    picked = [row for row in rows if windowStart <= parseDate(row[1]) <= windowEnd]

    payload = getJSON(
        [row[0] for row in picked],
        [row[1] for row in picked],
        [row[2] for row in picked],
        [row[3] for row in picked],
    )
    payload['cluster_type'] = "week"
    payload['date'] = s
    payload['week'] = windowStart.strftime("%Y-%m-%d") + " - " + windowEnd.strftime("%Y-%m-%d")
    payload['month'] = None

    return payload

def getByDay(day):
    """Build the payload for rows whose ``Date`` value equals ``day``.

    Args:
        day: date value compared for equality against each row's ``Date``.

    Returns:
        dict: the ``getJSON`` result for the matching rows, tagged with
        ``cluster_type`` ``"day"``, ``day`` as ``date`` and ``None`` for
        ``week`` and ``month``.
    """
    frame = getCSV()
    rows = zip(list(frame['Source']), list(frame['Date']), list(frame['Title']), list(frame['URL']))
    picked = [row for row in rows if day == row[1]]

    payload = getJSON(
        [row[0] for row in picked],
        [row[1] for row in picked],
        [row[2] for row in picked],
        [row[3] for row in picked],
    )
    payload['cluster_type'] = "day"
    payload['date'] = day
    payload['week'] = None
    payload['month'] = None

    return payload




In [24]:
import json
from numpyencoder import NumpyEncoder

# Export driver: builds one payload for the whole dataset, one per month, one per
# week and one per day, collects them in `output`, and writes the list to a
# single JSON file.

# Month name -> month number; only the numbers are passed on (to getByMonth).
months = {"January": 1,
         "February": 2,
         "March": 3}
# Start date of each weekly window handed to getByWeek (which adds six days).
weeks = ['2021-01-03',
        '2021-01-10',
        '2021-01-17',
        '2021-01-24',
        '2021-01-31',
        '2021-02-07',
        '2021-02-14',
        '2021-02-21',
        '2021-02-28',
        '2021-03-07',
         '2021-03-14',
         '2021-03-21',
         '2021-03-28'
        ]

# Every payload produced below is appended here, in generation order.
output = []

# Payload over all rows.
out = getAll()
output.append(out)


# One payload per month.
# NOTE(review): `byMonth` is assigned but not used anywhere in the visible code.
byMonth = "byMonth/"
for key, value in months.items():
    out = getByMonth(value)
    output.append(out)
    

# One payload per weekly window.
# NOTE(review): `byWeek` is assigned but not used anywhere in the visible code.
byWeek = "byWeek/"
for startingWeek in weeks:
    out = getByWeek(startingWeek)
    output.append(out)
        
# One payload per calendar day from 2021-01-01 up to, but not including, 2021-04-01.
# NOTE(review): `byDay` is assigned but not used anywhere in the visible code.
byDay = "byDay/"
start = datetime.datetime(2021, 1, 1)
end = datetime.datetime(2021, 4, 1)
while start < end:
    # The three strftime pieces concatenate to a YYYY-MM-DD string.
    out = getByDay(start.strftime("%Y") + "-" + start.strftime("%m") + "-" + start.strftime("%d"))
    output.append(out)
    start += datetime.timedelta(days=1)
    

# Serialise everything in one file, with NumpyEncoder supplied as the JSON encoder class.
with open("outputAsJson.json", 'w') as outfile:
    json.dump(output, outfile, cls=NumpyEncoder)

Batches:   0%|          | 0/18 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Display whatever payload is currently bound to `out`.
# NOTE(review): this cell's execution count (10) is lower than the export cell's (24),
# so the output shown below may come from an earlier run — confirm after Restart & Run All.
out

{'clusters': [{'total_number_of_articles': 215,
   'keywords': [],
   'sources': ['Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
    'Financial Times',
  