## Streaming K-Means Prototype

In [6]:
import json
# Load the articles: the file holds one JSON document per line.
# JSON text is UTF-8, so decode explicitly instead of relying on the platform
# default, and skip blank lines (e.g. a trailing newline) that json.loads
# would otherwise reject.
with open("some_data_json.txt", encoding="utf-8") as fh:
    data = [json.loads(line) for line in fh if line.strip()]

In [7]:
import re
import numpy as np

def extract_tags(d, outString = False):
    """Collect and normalise the facet tags of one article record.

    The ``des_facet``, ``org_facet``, ``per_facet`` and ``geo_facet`` entries
    of ``d`` are read in that order. Each tag has any parenthesised text
    removed, is lower-cased, has its spaces stripped and its commas replaced
    by underscores (``"Hart, Julia (1982- )"`` becomes ``"hart_julia"``).

    Args:
        d: Mapping for one article. A facet contributes tags only when its
            value is a list; a string, ``None``, any other type, or a missing
            key contributes nothing.
        outString: When true, return the tags joined by single spaces;
            otherwise return them as a NumPy array.

    Returns:
        ``str`` if ``outString`` is true, else ``numpy.ndarray`` of tags.
    """
    # Compiled once per call rather than once per facet.
    parentheses = re.compile(r"\((.+)\)")

    def clean_tag(t):
        # Only lists carry tags. Returning [] for everything else (the
        # original returned None for non-str/non-list values) keeps the
        # concatenation below from raising TypeError.
        if not isinstance(t, list):
            return []
        return [parentheses.sub("", tag).lower().replace(" ", "").replace(",", "_") for tag in t]

    all_tags = []
    for facet in ('des_facet', 'org_facet', 'per_facet', 'geo_facet'):
        # .get() lets a record without this facet be treated as having no tags.
        all_tags.extend(clean_tag(d.get(facet)))

    if outString:
        return " ".join(all_tags)
    else:
        return np.array(all_tags)

# One space-separated string of cleaned tags per article in `data`.
tags = [extract_tags(article, outString=True) for article in data]

In [8]:
# Preview the tag strings of the first five articles.
tags[:5]

['movies hart_julia',
 'customs_etiquetteandmanners',
 'movies dale_jamesbadge thompson_tessa james_lily dacosta_nia',
 'cookingandcookbooks parties kayne_jenni',
 'illegalimmigration immigrationandemigration borderbarriers booksandliterature unitedstatespoliticsandgovernment borderpatrol trump_donaldj']

In [9]:
from sklearn.cluster import k_means
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Bag-of-words over the tag strings: learn the vocabulary and build the
# document-term count matrix (one row per entry of `tags`) in a single pass.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tags)

In [11]:
# Vocabulary size should equal the number of columns of X.
# `vocabulary_` is used because `get_feature_names()` no longer exists in
# recent scikit-learn releases; the fitted vocabulary has one entry per feature.
len(vectorizer.vocabulary_), X.shape # ok

(703, (243, 703))

In [12]:
# k-means starts from random centroids, so pin the seed: later cells select a
# cluster by its numeric label, which is only stable across runs with a fixed
# random_state.
# NOTE: the third value returned by k_means is the inertia (sum of squared
# distances to the closest centroid); the name `perplex` is kept as-is because
# other cells may refer to it.
centroids, assignments, perplex = k_means(X, 5, random_state=0)

In [14]:
# Cluster label (0-4) assigned to each article, in the same order as `data`.
assignments

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 1, 2,
       0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3,
       0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 3, 0,
       3, 3, 3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 0, 3, 3,
       0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,
       0], dtype=int32)

In [18]:
# Print the headline of every article that was placed in cluster 2.
for row, label in enumerate(assignments):
    if label == 2:
        print(data[row]["title"])

Read Barr’s News Conference Remarks Ahead of the Mueller Report Release
A New Civil-Rights Movement
Who Is William Barr? He Decides What the Public Can See in Mueller’s Report
Comparing Barr’s Excerpts to Mueller’s Report
Mueller Findings Kick Off a Political Tug of War That’s Only Just Beginning
Live Briefing: A Post-Mueller Report America
Comparing Barr’s Excerpts With Mueller’s Report
The Mueller Report Is Released
Mueller Hints at a National-Security Nightmare
Mueller Findings Kick Off a Political Tug of War That’s Only Just Beginning
Mueller Report Updates: Questions Remain After Special Counsel’s Findings
A Nixonian Attorney General
House Democrats Subpoena Full Mueller Report, and the Underlying Evidence
Why Don McGahn Served the White House, Not Trump
‘I Do Not Remember’: Trump Gave a Familiar Reply to the Special Counsel’s Queries
A Reader’s Guide to the Journalism Behind the Mueller Report
Why It Matters That Trump and Michael Cohen Had a Falling Out
How Trump’s Protector Bec

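Almost every article ends up in cluster 0, so plain k-means is a poor fit for this very sparse tag matrix. Background on clustering extremely sparse data: https://stats.stackexchange.com/questions/102601/clustering-algorithms-for-extremely-sparse-data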

## Cosine Similarity

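Cosine similarity typically works quite well for text data, because it compares the direction of two count vectors rather than their magnitude.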

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Pairwise cosine similarity between all rows of the tag-count matrix X.
cosine_sim = cosine_similarity(X)
# Zero the diagonal in place — presumably so that an article is never reported
# as its own best match when the matrix is sorted later.
np.fill_diagonal(cosine_sim, 0)

In [21]:
# Inspect the similarity matrix (NumPy abbreviates large arrays when displayed).
cosine_sim

array([[0.        , 0.        , 0.31622777, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31622777, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.09805807,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.09805807, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [27]:
import pandas as pd
# Wrap the similarity matrix in a DataFrame so its rows and columns can be labelled and sorted.
df = pd.DataFrame(cosine_sim)

In [28]:
# Label both axes with the normalised (lower-cased, stripped) article titles.
# Plain attribute assignment is used instead of set_axis(..., inplace=True):
# the `inplace` argument no longer exists in current pandas, while assigning
# .index / .columns behaves the same on every version.
titles = [d["title"].lower().strip() for d in data]
df.index = titles
df.columns = titles

In [33]:
# Five rows with the highest tag similarity to the given article; the column is looked up by its lower-cased title.
df.sort_values(by='A New Civil-Rights Movement'.lower(), ascending=False).head(5)

Unnamed: 0,‘fast color’ review: can a gifted family save a parched world?,my sister-in-law is messing up our financial plans,‘little woods’ review: life is thicker than blood,"how to throw a casual, but considered, dinner party at home",one of the deadliest places on the southwest border,"north korea’s state-run economy falters under sanctions, testing elite loyalty","france debates how to rebuild notre-dame, weighing history and modernity","in ‘gentleman jack,’ sally wainwright brings a fascinating life from diary to screen",‘ramy’ is a quietly revolutionary comedy,read barr’s news conference remarks ahead of the mueller report release,...,should you be eating eggs?,your monday briefing,"sri lanka, ukraine, ‘game of thrones’: your monday briefing",make america graze again,a c.e.o.’s plea: don’t mess with the census,the emergence of india’s rahul gandhi,care about gaza? blame hamas,how trump’s protector became mueller’s best witness,elizabeth warren wants to make it easier to prosecute executives,subway fares went up (so did tension between cuomo and the transit chief)
see which sections of the mueller report were redacted,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0
a reader’s guide to the journalism behind the mueller report,0.0,0.0,0.0,0.0,0.308607,0.0,0.0,0.0,0.0,0.408248,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.721688,0.113228,0.0
the mueller report is released,0.0,0.0,0.0,0.0,0.308607,0.0,0.0,0.0,0.0,0.612372,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.721688,0.113228,0.0
comparing barr’s excerpts with mueller’s report,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.755929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.668153,0.104828,0.0
comparing barr’s excerpts to mueller’s report,0.0,0.0,0.0,0.0,0.285714,0.0,0.0,0.0,0.0,0.755929,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.668153,0.104828,0.0


## Using Abstracts

In [198]:
# Same bag-of-words approach, this time on the free-text abstracts.
abstracts = [article["abstract"] for article in data]
abstract_vectorizer = CountVectorizer(
    lowercase=True,
    stop_words="english",  # drop common English function words
    max_df=.70,            # ignore terms present in more than 70% of abstracts
)
Xabs = abstract_vectorizer.fit_transform(abstracts)

In [199]:
# Number of distinct terms kept for the abstracts. `vocabulary_` is used
# because `get_feature_names()` no longer exists in recent scikit-learn releases.
len(abstract_vectorizer.vocabulary_)

747

In [200]:
# Only the labels (second return value) are needed here. Index the result
# instead of unpacking into `_` / `__`: IPython uses those names for its
# output history, and assigning to them interferes with that.
# The seed is fixed so the cluster ids used by later cells are reproducible.
assignments_abs = k_means(Xabs, 3, random_state=0)[1]

In [201]:
# Print the headline of every article whose abstract was placed in cluster 1.
for row, label in enumerate(assignments_abs):
    if label == 1:
        print(data[row]["title"])

‘Fast Color’ Review: Can a Gifted Family Save a Parched World?
My Sister-in-Law Is Messing Up Our Financial Plans
How to Throw a Casual, but Considered, Dinner Party at Home
One of the Deadliest Places on the Southwest Border
North Korea’s State-Run Economy Falters Under Sanctions, Testing Elite Loyalty
France Debates How to Rebuild Notre-Dame, Weighing History and Modernity
Merkel ‘Saddened’ as Germany Awaits Answers on Deadly Bus Crash in Madeira
Where Are Adults Living With Their Parents?
Celestial Visions on the Met Roof
A New Civil-Rights Movement
The Instagram Face-Lift
California Today: Think Pot Policy Is Settled? Think Again
Homes for Sale in Brooklyn and Manhattan
On the Market in New York City
News Quiz: Test Your Knowledge of the Week’s Headlines
Northern Ireland Journalist Is Killed in Street Clash
Word + Quiz: proclivity
North Carolina Coach Sylvia Hatchell Resigns After Investigation
Leaning Into His Role as the Villain, Ben Simmons Leads 76ers’ Rout of Nets
With the Bir

In [202]:
# Pairwise cosine similarity between all rows of the abstract count matrix Xabs.
cosine_sim_abs = cosine_similarity(Xabs)
# Zero the diagonal in place — presumably so that an article is never reported
# as its own best match when the matrix is sorted later.
np.fill_diagonal(cosine_sim_abs, 0)

In [203]:
# Inspect the abstract similarity matrix (NumPy abbreviates large arrays when displayed).
cosine_sim_abs

array([[0.        , 0.        , 0.07715167, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.07715167, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [204]:
# Abstract-similarity matrix as a DataFrame labelled on both axes with the
# normalised (lower-cased, stripped) article titles. The labels are passed to
# the constructor instead of set_axis(..., inplace=True), whose `inplace`
# argument no longer exists in current pandas.
titles = [d["title"].lower().strip() for d in data]
dfa = pd.DataFrame(cosine_sim_abs, index=titles, columns=titles)

In [205]:
# Preview the first five rows of the labelled abstract-similarity matrix.
dfa.head()

Unnamed: 0,‘fast color’ review: can a gifted family save a parched world?,my sister-in-law is messing up our financial plans,‘little woods’ review: life is thicker than blood,"how to throw a casual, but considered, dinner party at home",one of the deadliest places on the southwest border,"north korea’s state-run economy falters under sanctions, testing elite loyalty","france debates how to rebuild notre-dame, weighing history and modernity","in ‘gentleman jack,’ sally wainwright brings a fascinating life from diary to screen",‘ramy’ is a quietly revolutionary comedy,read barr’s news conference remarks ahead of the mueller report release,...,learning with: ‘the mueller report is 448 pages long. you need to know these 7 key things.’,‘we don’t want conversations to stop’: why gabby giffords is starting a gun control group for gun owners,pop star’s illicit kiss becomes fodder for government ads in hong kong,"house democrats subpoena full mueller report, and the underlying evidence",brooklyn’s fourth avenue gets in gear,"the week in tech: do you prefer free speech, or a perfectly clean internet?",california today: a desert festival not named coachella,a desert festival not named coachella,a duplex penthouse with park views on the market for $11.25 million,once he was the ‘godfather of british crime.’ now he’s just a grandfather.
‘fast color’ review: can a gifted family save a parched world?,0.0,0.0,0.077152,0.0,0.0,0.0,0.0,0.069007,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
my sister-in-law is messing up our financial plans,0.0,0.0,0.0,0.0,0.0,0.076696,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
‘little woods’ review: life is thicker than blood,0.077152,0.0,0.0,0.0,0.0,0.0,0.0,0.074536,0.074536,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"how to throw a casual, but considered, dinner party at home",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.13484,0.0
one of the deadliest places on the southwest border,0.0,0.0,0.0,0.0,0.0,0.0,0.069007,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [197]:
# Five rows with the highest abstract similarity to the given article.
# NOTE(review): this cell's execution count (197) is lower than that of the
# cells above that build `dfa` (198-204), and the saved output shows only
# zeros, so it was presumably produced from an earlier `dfa` — re-run after
# Restart & Run All before relying on it.
dfa.sort_values("‘fast color’ review: can a gifted family save a parched world?", ascending = False).head()

Unnamed: 0,‘fast color’ review: can a gifted family save a parched world?,my sister-in-law is messing up our financial plans,‘little woods’ review: life is thicker than blood,"how to throw a casual, but considered, dinner party at home",one of the deadliest places on the southwest border,"north korea’s state-run economy falters under sanctions, testing elite loyalty","france debates how to rebuild notre-dame, weighing history and modernity","in ‘gentleman jack,’ sally wainwright brings a fascinating life from diary to screen",‘ramy’ is a quietly revolutionary comedy,read barr’s news conference remarks ahead of the mueller report release,...,learning with: ‘the mueller report is 448 pages long. you need to know these 7 key things.’,‘we don’t want conversations to stop’: why gabby giffords is starting a gun control group for gun owners,pop star’s illicit kiss becomes fodder for government ads in hong kong,"house democrats subpoena full mueller report, and the underlying evidence",brooklyn’s fourth avenue gets in gear,"the week in tech: do you prefer free speech, or a perfectly clean internet?",california today: a desert festival not named coachella,a desert festival not named coachella,a duplex penthouse with park views on the market for $11.25 million,once he was the ‘godfather of british crime.’ now he’s just a grandfather.
‘fast color’ review: can a gifted family save a parched world?,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
why jesus on the cross is no mere symbol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
what france has money for,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
the data ‘bad moms’ need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"robert mueller, londonderry, sylvia hatchell: your friday briefing",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [64]:
from time import sleep
from collections import deque
pars = {"(":")", "{":"}", "[":"]", ")":"(", "}":"{", "]":"["}

def test_parser(totest):
    queue = deque()
    for i in totest:
        
        queue.append(i)

        if len(queue) == 1:
            pass

        elif pars[queue[len(queue)-2]] == i:
            queue.pop()
            queue.pop()
        
    return len(queue)==0

In [65]:
# Sanity check: one input with interleaved brackets and one that is properly nested.
test_parser("[{]}()"), test_parser("{([[]])}")

(False, True)