In [54]:
from pathlib import Path
import pandas as pd
import numpy as np
import re
from copy import deepcopy
import random

## Pre-Processing
* Drop tweet ID and timestamp
* Change letters to all lowercase
* Drop '#' and '@'
* Drop URLs

In [2]:
#grab the tweets from the various text files. Since we don't need to build our model based on originating news outlet, I combined the tweets into one text file
path = r'C:\Users\teris\ml_f23\HW_3\tweets.txt'
tweets = pd.read_table(path, sep = "|", header = None, on_bad_lines='skip')
tweets.head()

Unnamed: 0,0,1,2
0,585891883953496066,Wed Apr 08 19:48:05 +0000 2015,Are you a member of the network? Sign up here ...
1,585876545266286592,Wed Apr 08 18:47:08 +0000 2015,What is palliative care like in India? One GP ...
2,585861945535791106,Wed Apr 08 17:49:07 +0000 2015,Most viewed this week: I loved being a midwife...
3,585859917350731777,Wed Apr 08 17:41:03 +0000 2015,How can technology improve mental health waiti...
4,585844465199407104,Wed Apr 08 16:39:39 +0000 2015,In case you missed it: Why the #NHS shouldn’t ...


In [25]:
#now delete out the tweet id and timestamp
retweets = tweets.drop(columns = [0,1]).drop_duplicates()
retweets = retweets.rename(columns = {0:'ID', 2:'Tweet'})
retweets.head()

Unnamed: 0,Tweet
0,Are you a member of the network? Sign up here ...
1,What is palliative care like in India? One GP ...
2,Most viewed this week: I loved being a midwife...
3,How can technology improve mental health waiti...
4,In case you missed it: Why the #NHS shouldn’t ...


In [26]:
retweets['Clean'] = retweets.apply(lambda row: str(row['Tweet']).lower(),axis=1)
retweets['Clean'] = retweets.apply(lambda row: re.sub("@[A-Za-z0-9_]+","", str(row['Clean'])),axis=1)
retweets['Clean'] = retweets.apply(lambda row: re.sub("#","", str(row['Clean'])),axis=1) #we still want to keep the tagnames, just not the hashtag
retweets['Clean'] = retweets.apply(lambda row: re.sub(r"http\S+","", row['Clean']),axis=1)
retweets['Clean'] = retweets.apply(lambda row: re.sub(r"www.\S+","", row['Clean']),axis=1)

In [27]:
print(retweets.Tweet[0])
print(retweets.Clean[0])

Are you a member of the network? Sign up here for free: https://register.theguardian.com/healthcare-professionals/ #NHS #healthcare
are you a member of the network? sign up here for free:  nhs healthcare


In [28]:
#to make my life easier, I'm going to go ahead and pre-process the tweets to form their specific bag of words 
retweets['Clean'] = retweets.apply(lambda row: row['Clean'].split(),axis=1)
print(retweets.Clean[0])

['are', 'you', 'a', 'member', 'of', 'the', 'network?', 'sign', 'up', 'here', 'for', 'free:', 'nhs', 'healthcare']


## KMeans Algorithm

In [502]:
retweets_short = retweets.head(25)
retweets_short.head()

Unnamed: 0,Tweet,Clean
0,Are you a member of the network? Sign up here ...,"[are, you, a, member, of, the, network?, sign,..."
1,What is palliative care like in India? One GP ...,"[what, is, palliative, care, like, in, india?,..."
2,Most viewed this week: I loved being a midwife...,"[most, viewed, this, week:, i, loved, being, a..."
3,How can technology improve mental health waiti...,"[how, can, technology, improve, mental, health..."
4,In case you missed it: Why the #NHS shouldn’t ...,"[in, case, you, missed, it:, why, the, nhs, sh..."


In [539]:
#turn it into a class to make it all easier 
class kmeansclusters():
    def __init__(self, tweets, k): 
        self.tweets = tweets
        self.n = len(tweets)
        self.k = k
        self.distMatrix = {}
        
        self.clusters = {}
        self.reverse_cluster = {}
        
        self.init_centers = self.centers_setup() #pick the random centers
        #Set up the Jaccard matrix and the Initial Clusters
        self.cluster_setup()
        self.jaccard_matrix()
        
        self.iteration_threshold = 1000
        
    #f(x): jaccard_dist
    #PURPOSE: find the jaccard distance ( 1 - jaccard similarity) between two sets of words
    #OUTPUT: returns a float value between 0 and 1. The closer to 1, the further apart the sets are
    def jaccard_dist(self, setA, setB): 
        setA = set(setA) #intersection and union won't work on lists, must be set format
        setB = set(setB)
        intersection_AB = len(setA.intersection(setB))
        union_AB = len(setA.union(setB))
        return 1 - (intersection_AB/union_AB)
    
    #f(x): jaccard_matrix
    #PURPOSE: create a distance matrix where each tweet is a row and a column. This matrix will help us keep track of which two clusters are closest.
            # This idea is similar to the matrices we used to show which nodes are connected in graphs.
    #OUTPUT: none - the goal is simply to initialize the matrix
    def jaccard_matrix (self):
        for ptA in self.tweets.Clean.index: 
            self.distMatrix[ptA] = {}
            setA = self.tweets.Clean[ptA]
            for ptB in self.tweets.Clean.index: 
                if ptB not in self.distMatrix: 
                    self.distMatrix[ptB] = {}
                setB = self.tweets.Clean[ptB]
                dist_AB = self.jaccard_dist(setA, setB)
                self.distMatrix[ptA][ptB] = dist_AB
                self.distMatrix[ptB][ptA] = dist_AB
    
    #f(x): centers_setup
    #PURPOSE: Randomly choose k tweets to be the initial centers of the cluster using their indices
    #OUTPUT: The k indices of the centers 
    def centers_setup (self): 
        init_centers = self.tweets.sample(self.k, replace = False, weights = None, axis = 0).index
        return np.array(init_centers)
    
    #f(x): cluster_setup
    #PURPOSE: Once the centers have been assigned, initialize the clusters to that center
    #OUTPUT: none
    def cluster_setup (self): 
        
        for tweet in self.tweets.Clean.index: 
            self.reverse_cluster[tweet] = -1 # initially each tweet has no cluster assigned to it
        
        for val in range(self.k):
            self.clusters[val] = {self.init_centers[val]}

            self.reverse_cluster[self.init_centers[val]] = self.init_centers[val]
    #f(x): cluster_update
    #PURPOSE: update the clusters based on who is closes to the new centers
    #OUTPUT: the new clusters and their reverses 
    def cluster_update(self): 
        new_cluster = {}
        new_rev_cluster = {}

        for val in range(self.k): 
            new_cluster[val] = set()
        for ptA in self.tweets.Clean.index: 
            min_distance =  np.inf
            min_cluster = self.reverse_cluster[ptA] #min distance is current value
            
            #FIX THIS MINIMIZATION
            for cluster in self.clusters:
                counter = 0.00 
                cluster_dist = []
                for ptB in self.clusters: 
                    new_dist = self.distMatrix[ptA][ptB]
                    #if the distance for that point to the specified cluster is lower than the current minimum point 
                    #then update the distance and assign it to that cluster instead of using an average metric
                    counter += 1.00
                    if counter >0.00: 
                        if min_distance > new_dist: 
                            min_distance = new_dist
                            min_cluster = ptB
            new_cluster[min_cluster].add(ptA)
            new_rev_cluster[ptA] = min_cluster
            
        return new_cluster, new_rev_cluster
   
    def iteration(self):
        rounds = 1
        new_cluster, new_rev_cluster = self.cluster_update()
        self.clusters = deepcopy(new_cluster)
        self.reverse_cluster = deepcopy(new_rev_cluster)
        
        while rounds < self.iteration_threshold: 
            new_cluster = self.cluster_update()
            rounds += 1
            
            if self.clusters != new_cluster: 
                self.clusters = deepcopy(new_cluster)
                self.reverse_cluster = deepcopy(new_rev_cluster)
                #self.center_update()
            else: 
                return 

    def sse_calc(self):
        sum_dist = 0.0
        for center in self.clusters: 
            for pt in self.clusters[center]: 
                tmp_dist = self.distMatrix[center][pt]
                sum_dist += tmp_dist**2
        return sum_dist
    
    def cluster_print(self):
        sse = self.sse_calc()
        print("Sum Squared Distance/SSE: ", sse)
        print(self.clusters) 
            
        

In [540]:

k = 3

kmeans = kmeansclusters(retweets_short, k)
kmeans.iteration()
print(kmeans.clusters)
#kmeans.cluster_print()

TypeError: unhashable type: 'dict'

## Works Cited

 * For loop to read all data files - https://stackoverflow.com/questions/20906474/import-multiple-csv-files-into-pandas-and-concatenate-into-one-dataframe
 
 * How to calculate Jaccard similarity - https://www.geeksforgeeks.org/how-to-calculate-jaccard-similarity-in-python/#
 
 * Cleaning and Tokenization of texts - https://www.kaggle.com/code/tariqsays/tweets-cleaning-with-python

path = r'C:\Users\teris\ml_f23\HW_3\Tweets'  # or unix / linux / mac path

# Get the files from the path provided in the OP
files = Path(path).glob('*.txt')  # .rglob to get subdirectories

dfs = list()
for f in files:
    print(f)
    data = pd.read_table(f, sep = "|", header = None)
    print(1)
    # .stem is method for pathlib objects to get the filename w/o the extension
    data['file'] = f.stem
    print(2)
    dfs.append(data)
    print(3)

df = pd.concat(dfs, ignore_index=True)

test_set1 = retweets.Clean[0] 
test_set2 = retweets.Clean[1]

print(test_set1)
print(test_set2)

test_distance = jaccard_dist(test_set1, test_set2)
print (test_distance)

In [None]:
 
        #f(x): cluster_update
    #PURPOSE: update the clusters based on who is closes to the new centers
    #OUTPUT: the new clusters and their reverses 
    def center_update(self): 
        new_cluster = {}
        for group_name in self.clusters[0]:
            distA = np.zeros(len(self.clusters[0][group_name]))
            for ptA in self.clusters[0][group_name]:
                print(cluster)
                b_count = 0
                for ptB in self.clusters[0][group_name]:
                    distA[b_count] += self.distMatrix[ptA][ptB]
                    b_count ++
            #distA = np.empty(len(cluster[center])) 
            #for ptA in cluster[center]():
                #print(ptA)
                #b_count = 0 
                #for ptB in cluster[center]:
                    #distA[b_count] += self.distMatrix[ptA][ptB]
                    #b_count += 1
                #min_center = np.where(distA == min(distA))[0][0]
                #new_cluster[center] = min_center #finds new center 
        #self.cluster = deepcopy(new_cluster)
          
        return
        