In [2]:
"""
Inputs a csv, reads the portion of each line after datetime.
These data are mapped to embeddings and then s-bert agglomerative clustering with a threshold is applied.
"""
from sentence_transformers import SentenceTransformer
from sklearn.cluster import AgglomerativeClustering
import numpy as np

In [3]:
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')

coilist = []

#read data from csv
with open('cois.json','r') as f: #insert csv name at statecoi.csv
    lines = f.readlines()
    lines.pop(0)                    #pops header
    for line in lines:
        if line[0 : 1].isdigit():
            coilist.append(line)
        else:
            x= len(coilist)
            coilist[x-1] = coilist[x-1] + ' ' + line
f.close()

In [4]:
corpus = []

#get everything after datetime
for coi in coilist:
    count = 10
    for i in range(0, len(coi)-1):
        if coi[i] == ',' and count == 0:
            corpus.append(coi[i:])
            break
        elif coi[i] == ',':
            count = count-1
        else:
            continue
corpus_embeddings = embedder.encode(corpus)

# Normalize the embeddings to unit length
corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

# Perform kmean clustering – change distance_threshold to change number of clusters
clustering_model = AgglomerativeClustering(n_clusters=None, distance_threshold=2.0)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    for coi in cluster:
        words = coi.split(' ')
    print("")

Cluster  15
[',,"Working class and higher proportion of renters. Many people who work at the airport, hotels, and hospitality jobs. They want public transit. ",\n', ',,"concerned about lack of hospitals, kids\' education, middle class families. About 1/3 renters. A significant  population are concerned about pollution. ",\n', ',,,"i describe 3 parts of town: student (blue), low income (yellow, high income (green).",\n', ',upper-income,highvalue-homes",,"areas with homes worth more than $500,000",\n', ',"smi,poverty,housing,low income",,This community experiences poverty and has substandard housing for low-income people.,\n', ',professional class,middle class,apartments",,These are neighborhoods that are core middle class professional live in. Some parts have a large community of children/school aged adolescents. Many of the people that live in this area are young adults that have recently graduated college or are in the early years of their careers. This area is not composed of mainly 

In [7]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = model.encode(clustered_sentences, show_progress_bar=True)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
import umap


umap_embeddings = umap.UMAP(n_neighbors=15, 
                            n_components=5, 
                            metric='cosine').fit_transform(embeddings)



In [12]:
import hdbscan
cluster = hdbscan.HDBSCAN(min_cluster_size=15,
                          metric='euclidean',                      
                          cluster_selection_method='eom').fit(umap_embeddings)

In [15]:
for i, cluster in clustered_sentences.items():
    print(model.get_topic{cluster})


SyntaxError: invalid syntax (<ipython-input-15-c20fbbe19f57>, line 2)

In [13]:
import matplotlib.pyplot as plt

# Prepare data
umap_data = umap.UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
result = pd.DataFrame(umap_data, columns=['x', 'y'])
result['labels'] = cluster.labels_

# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = result.loc[result.labels == -1, :]
clustered = result.loc[result.labels != -1, :]
plt.scatter(outliers.x, outliers.y, color='#BDBDBD', s=0.05)
plt.scatter(clustered.x, clustered.y, c=clustered.labels, s=0.05, cmap='hsv_r')
plt.colorbar()

NameError: name 'pd' is not defined