In [2]:
import numpy as np
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pylab as plt

class ClusterAnalysis:
    """Relate user-engagement groups and item-price groups via latent vectors.

    Intended call order: ``read_data`` -> ``create_item_clusters`` /
    ``create_user_clusters`` -> ``read_latent_output`` -> one of the
    ``get_*_distances`` reports.  Every report is printed to stdout.
    """

    def __init__(self):
        # id <-> node-name lookups (filled by read_data).
        self.id_user = dict()
        self.user_id = dict()
        self.id_item = dict()
        self.item_id = dict()
        # Raw attributes used for the threshold-based grouping.
        self.item_price = dict()
        self.user_gamingsession = dict()
        # Threshold-based groups (filled by create_*_clusters).
        self.item_low_price = dict()
        self.item_medium_price = dict()
        self.item_high_price = dict()
        self.user_low_engagement = dict()
        self.user_medium_engagement = dict()
        self.user_high_engagement = dict()

    @staticmethod
    def _read_pairs(path):
        """Return cleaned ``(first, second)`` fields of every two-column line.

        Lines that do not split into exactly two tab-separated fields are
        skipped.  Spaces are removed from BOTH fields so the same id or node
        name is spelled identically in every dictionary; previously only the
        values were cleaned, so a row such as ``"7 \\t u7"`` stored the key
        ``" u7"`` and later lookups by ``"u7"`` raised ``KeyError``.
        """
        pairs = []
        with open(path) as handle:
            for line in handle:
                toks = line.strip().split("\t")
                if len(toks) == 2:
                    pairs.append((toks[0].replace(" ", ""), toks[1].replace(" ", "")))
        return pairs

    def read_data(self, dirpath):
        """Load prices, session counts and id/name mappings from *dirpath*.

        Reads ``item_price.txt``, ``user_gaming_sessions.txt``,
        ``id_user.txt`` and ``id_item.txt`` (tab-separated, two columns).

        Raises:
            OSError: if one of the files cannot be opened.
            ValueError: if a price or session count is not an integer.
        """
        for item, price in self._read_pairs(dirpath + "/item_price.txt"):
            self.item_price[item] = int(price)    # {itemid1: price1, itemid2: price2....}
        print("#items", len(self.item_price))

        for user, sessions in self._read_pairs(dirpath + "/user_gaming_sessions.txt"):
            self.user_gamingsession[user] = int(sessions)    # {uid1: CountofGames1....}
        print("#users", len(self.user_gamingsession))

        for uid, name in self._read_pairs(dirpath + "/id_user.txt"):
            self.id_user[uid] = name    # {uid1: User1, uid2: User2....}
            self.user_id[name] = uid    # {User1: uid1, User2: uid2....}

        for iid, name in self._read_pairs(dirpath + "/id_item.txt"):
            self.id_item[iid] = name    # {itemid1: Item1, itemid2: Item2....}
            self.item_id[name] = iid    # {Item1: itemid1, Item2: itemid2....}

    def create_item_clusters(self):
        """Split items into low (<= 50), medium (51..599) and high (>= 600) price groups."""
        for item, price in self.item_price.items():
            if price <= 50:
                self.item_low_price[item] = price
            elif price < 600:
                self.item_medium_price[item] = price
            else:
                self.item_high_price[item] = price
        # NOTE(review): the labels mention $1 / $12 while the thresholds are
        # 50 / 600 -- the price unit is not visible here; confirm before
        # relying on the wording.
        print("#items with low price (<=$1)", len(self.item_low_price))
        print("#items with mid price (between $1 and $12)", len(self.item_medium_price))
        print("#items with high price (>$12)", len(self.item_high_price))

    def create_user_clusters(self):
        """Split users into low (<= 5), medium (6..100) and high (> 100) session groups."""
        for user, sessions in self.user_gamingsession.items():
            if sessions <= 5:
                self.user_low_engagement[user] = sessions
            elif sessions <= 100:
                self.user_medium_engagement[user] = sessions
            else:
                self.user_high_engagement[user] = sessions
        # NOTE(review): the first label says "less than 5" but the test is <= 5.
        print("#users with low engagement (less than 5 gaming sessions)", len(self.user_low_engagement))
        print("#users with medium engagement (between 5 and 100 gaming sessions)", len(self.user_medium_engagement))
        print("#users with high engagement (more than 100 gaming sessions)", len(self.user_high_engagement))

    def read_latent_output(self, outputfilename, dim=128):
        """Load node vectors from a space-separated embedding file.

        Only lines with exactly ``dim + 1`` tokens (node name followed by
        ``dim`` values) are used; anything else is ignored.  Nodes whose name
        starts with ``u`` go to ``user_vec`` and those starting with ``i`` to
        ``item_vec``, keyed by the id found through ``user_id`` / ``item_id``.
        Values are kept as strings and converted when centroids are computed.

        Args:
            outputfilename: path of the embedding file.
            dim: number of vector components per node (default 128).

        Raises:
            KeyError: if a ``u``/``i`` node name is missing from the mappings
                loaded by ``read_data``.
        """
        self.user_vec = dict()
        self.game_vec = dict()
        self.item_vec = dict()
        with open(outputfilename) as nvfile:
            for line in nvfile:
                toks = line.strip().split(" ")
                if len(toks) != dim + 1:
                    continue
                node, values = toks[0], toks[1:]
                if node.startswith('u'):
                    self.user_vec[self.user_id[node]] = values    # {uid1: [user_vector_values], ...}
                elif node.startswith('i'):
                    self.item_vec[self.item_id[node]] = values    # {itemid1: [item_vector_values], ...}
        print("#Items having vectors:", len(self.item_vec))

    def get_centroid_measure(self, cluster_vec):
        """Return the component-wise mean of the vectors in *cluster_vec*."""
        cluster_array = np.array(cluster_vec, dtype=float)
        return np.mean(cluster_array, axis=0, dtype=float)

    def _bucket_centroid(self, members, vectors):
        """Centroid of the vectors of those *members* that have a vector."""
        present = [vectors[key] for key in members if vectors.get(key)]
        return self.get_centroid_measure(present)

    def get_cluster_distances(self):
        """Print Euclidean distances between every user-group and item-group centroid."""
        user_groups = (
            ("Least Engaged", self.user_low_engagement),
            ("Medium Engaged", self.user_medium_engagement),
            ("Highly Engaged", self.user_high_engagement),
        )
        item_groups = (
            ("Low Price", self.item_low_price),
            ("Medium Price", self.item_medium_price),
            ("High Price", self.item_high_price),
        )
        user_centroids = [(label, self._bucket_centroid(group, self.user_vec))
                          for label, group in user_groups]
        item_centroids = [(label, self._bucket_centroid(group, self.item_vec))
                          for label, group in item_groups]

        print("\n Results based on Engagement level of users and Item Prices\n")
        for index, (user_label, user_centroid) in enumerate(user_centroids):
            if index:
                print("\n")    # blank separator between user groups
            for item_label, item_centroid in item_centroids:
                distance = np.linalg.norm(user_centroid - item_centroid)
                print("Distance between {} Users and {} Items:".format(user_label, item_label),
                      round(distance, 3))

    def get_k_means_centroid(self, cluster_vec, n_clusters=3):
        """Fit k-means on *cluster_vec* and return the cluster centres.

        ``n_jobs`` is no longer passed: scikit-learn removed that parameter
        in 1.0 and rejects it with ``TypeError``.  ``n_init=10`` pins the
        number of restarts to the library's historical default.
        """
        cluster_array = np.array(cluster_vec, dtype=float)
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10).fit(cluster_array)
        return kmeans.cluster_centers_

    def get_k_means_cluster_distances(self, n_clusters=3):
        """Cluster users and items separately and print all centroid distances.

        Args:
            n_clusters: number of k-means clusters for each side (default 3).
        """
        centroid_users = self.get_k_means_centroid(list(self.user_vec.values()), n_clusters)
        centroid_items = self.get_k_means_centroid(list(self.item_vec.values()), n_clusters)
        print("\n k-means Cluster Distance Results\n")
        for u, user_centroid in enumerate(centroid_users):
            if u:
                print("\n")    # blank separator between user clusters
            for i, item_centroid in enumerate(centroid_items):
                distance = np.linalg.norm(user_centroid - item_centroid)
                print("Distance between User Cluster {} and Item Cluster {}:".format(u, i),
                      round(distance, 3))

        
# Locations of the tab-separated input files and of the latent-vector file.
dirpath = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Files'
outputfilename = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Latent Vector Representation/Gaming_output_metapath_UGIGU_w50_l20.txt'


def main():
    """Run the whole analysis: load inputs, build groups, print both reports."""
    analysis = ClusterAnalysis()
    analysis.read_data(dirpath)

    # Threshold-based groups are needed by the first distance report.
    analysis.create_item_clusters()
    analysis.create_user_clusters()
    analysis.read_latent_output(outputfilename)

    analysis.get_cluster_distances()
    analysis.get_k_means_cluster_distances()
    print("\nSuccess")


if __name__ == "__main__":
    main()
            

#items 292
#users 198144
#items with low price (<=$1) 24
#items with mid price (between $1 and $12) 193
#items with high price (>$12) 75
#users with low engagement (less than 5 gaming sessions) 42405
#users with medium engagement (between 5 and 100 gaming sessions) 136449
#users with high engagement (more than 100 gaming sessions) 19290
#Items having vectors: 1450

 Results based on Engagement level of users and Item Prices

Distance between Least Engaged Users and Low Price Items: 3.97
Distance between Least Engaged Users and Medium Price Items: 4.001
Distance between Least Engaged Users and High Price Items: 4.035


Distance between Medium Engaged Users and Low Price Items: 3.961
Distance between Medium Engaged Users and Medium Price Items: 3.979
Distance between Medium Engaged Users and High Price Items: 4.007


Distance between Highly Engaged Users and Low Price Items: 4.05
Distance between Highly Engaged Users and Medium Price Items: 4.037
Distance between Highly Engaged Users and

In [3]:
import numpy as np
from sklearn.cluster import KMeans

class ClusterAnalysis:
    """Relate user-engagement groups and item-price groups via latent vectors.

    Intended call order: ``read_data`` -> ``create_item_clusters`` /
    ``create_user_clusters`` -> ``read_latent_output`` -> one of the
    ``get_*_distances`` reports.  Reports are printed to stdout; the k-means
    report additionally writes the cluster memberships to two text files.
    """

    def __init__(self):
        # id <-> node-name lookups (filled by read_data).
        self.id_user = dict()
        self.user_id = dict()
        self.id_item = dict()
        self.item_id = dict()
        # Raw attributes used for the threshold-based grouping.
        self.item_price = dict()
        self.user_gamingsession = dict()
        # Threshold-based groups (filled by create_*_clusters).
        self.item_low_price = dict()
        self.item_medium_price = dict()
        self.item_high_price = dict()
        self.user_low_engagement = dict()
        self.user_medium_engagement = dict()
        self.user_high_engagement = dict()

    @staticmethod
    def _read_pairs(path):
        """Return cleaned ``(first, second)`` fields of every two-column line.

        Lines that do not split into exactly two tab-separated fields are
        skipped.  Spaces are removed from BOTH fields so the same id or node
        name is spelled identically in every dictionary; previously only the
        values were cleaned, so a row such as ``"7 \\t u7"`` stored the key
        ``" u7"`` and later lookups by ``"u7"`` raised ``KeyError``.
        """
        pairs = []
        with open(path) as handle:
            for line in handle:
                toks = line.strip().split("\t")
                if len(toks) == 2:
                    pairs.append((toks[0].replace(" ", ""), toks[1].replace(" ", "")))
        return pairs

    def read_data(self, dirpath):
        """Load prices, session counts and id/name mappings from *dirpath*.

        Reads ``item_price.txt``, ``user_gaming_sessions.txt``,
        ``id_user.txt`` and ``id_item.txt`` (tab-separated, two columns).

        Raises:
            OSError: if one of the files cannot be opened.
            ValueError: if a price or session count is not an integer.
        """
        for item, price in self._read_pairs(dirpath + "/item_price.txt"):
            self.item_price[item] = int(price)    # {itemid1: price1, itemid2: price2....}
        print("#items", len(self.item_price))

        for user, sessions in self._read_pairs(dirpath + "/user_gaming_sessions.txt"):
            self.user_gamingsession[user] = int(sessions)    # {uid1: CountofGames1....}
        print("#users", len(self.user_gamingsession))

        for uid, name in self._read_pairs(dirpath + "/id_user.txt"):
            self.id_user[uid] = name    # {uid1: User1, uid2: User2....}
            self.user_id[name] = uid    # {User1: uid1, User2: uid2....}

        for iid, name in self._read_pairs(dirpath + "/id_item.txt"):
            self.id_item[iid] = name    # {itemid1: Item1, itemid2: Item2....}
            self.item_id[name] = iid    # {Item1: itemid1, Item2: itemid2....}

    def create_item_clusters(self):
        """Split items into low (<= 50), medium (51..599) and high (>= 600) price groups."""
        for item, price in self.item_price.items():
            if price <= 50:
                self.item_low_price[item] = price
            elif price < 600:
                self.item_medium_price[item] = price
            else:
                self.item_high_price[item] = price
        # NOTE(review): the labels mention $1 / $12 while the thresholds are
        # 50 / 600 -- the price unit is not visible here; confirm before
        # relying on the wording.
        print("#items with low price (<=$1)", len(self.item_low_price))
        print("#items with mid price (between $1 and $12)", len(self.item_medium_price))
        print("#items with high price (>$12)", len(self.item_high_price))

    def create_user_clusters(self):
        """Split users into low (<= 5), medium (6..100) and high (> 100) session groups."""
        for user, sessions in self.user_gamingsession.items():
            if sessions <= 5:
                self.user_low_engagement[user] = sessions
            elif sessions <= 100:
                self.user_medium_engagement[user] = sessions
            else:
                self.user_high_engagement[user] = sessions
        # NOTE(review): the first label says "less than 5" but the test is <= 5.
        print("#users with low engagement (less than 5 gaming sessions)", len(self.user_low_engagement))
        print("#users with medium engagement (between 5 and 100 gaming sessions)", len(self.user_medium_engagement))
        print("#users with high engagement (more than 100 gaming sessions)", len(self.user_high_engagement))

    def read_latent_output(self, outputfilename, dim=128):
        """Load node vectors from a space-separated embedding file.

        Only lines with exactly ``dim + 1`` tokens (node name followed by
        ``dim`` values) are used; anything else is ignored.  Nodes whose name
        starts with ``u`` go to ``user_vec`` and those starting with ``i`` to
        ``item_vec``, keyed by the id found through ``user_id`` / ``item_id``.
        Values are kept as strings and converted when centroids are computed.

        Args:
            outputfilename: path of the embedding file.
            dim: number of vector components per node (default 128).

        Raises:
            KeyError: if a ``u``/``i`` node name is missing from the mappings
                loaded by ``read_data``.
        """
        self.user_vec = dict()
        self.game_vec = dict()
        self.item_vec = dict()
        with open(outputfilename) as nvfile:
            for line in nvfile:
                toks = line.strip().split(" ")
                if len(toks) != dim + 1:
                    continue
                node, values = toks[0], toks[1:]
                if node.startswith('u'):
                    self.user_vec[self.user_id[node]] = values    # {uid1: [user_vector_values], ...}
                elif node.startswith('i'):
                    self.item_vec[self.item_id[node]] = values    # {itemid1: [item_vector_values], ...}

    def get_centroid_measure(self, cluster_vec):
        """Return the component-wise mean of the vectors in *cluster_vec*."""
        cluster_array = np.array(cluster_vec, dtype=float)
        return np.mean(cluster_array, axis=0, dtype=float)

    def _bucket_centroid(self, members, vectors):
        """Centroid of the vectors of those *members* that have a vector."""
        present = [vectors[key] for key in members if vectors.get(key)]
        return self.get_centroid_measure(present)

    def get_cluster_distances(self):
        """Print Euclidean distances between every user-group and item-group centroid."""
        user_groups = (
            ("Least Engaged", self.user_low_engagement),
            ("Medium Engaged", self.user_medium_engagement),
            ("Highly Engaged", self.user_high_engagement),
        )
        item_groups = (
            ("Low Price", self.item_low_price),
            ("Medium Price", self.item_medium_price),
            ("High Price", self.item_high_price),
        )
        user_centroids = [(label, self._bucket_centroid(group, self.user_vec))
                          for label, group in user_groups]
        item_centroids = [(label, self._bucket_centroid(group, self.item_vec))
                          for label, group in item_groups]

        print("\n Results based on Engagement level of users and Item Prices\n")
        for index, (user_label, user_centroid) in enumerate(user_centroids):
            if index:
                print("\n")    # blank separator between user groups
            for item_label, item_centroid in item_centroids:
                distance = np.linalg.norm(user_centroid - item_centroid)
                print("Distance between {} Users and {} Items:".format(user_label, item_label),
                      round(distance, 3))

    @staticmethod
    def _fit_k_means(cluster_vec):
        """Fit a 3-cluster k-means model on *cluster_vec* and return it.

        ``n_jobs`` is no longer passed: scikit-learn removed that parameter
        in 1.0 and rejects it with ``TypeError``.  ``n_init=10`` pins the
        number of restarts to the library's historical default.
        """
        cluster_array = np.array(cluster_vec, dtype=float)
        return KMeans(n_clusters=3, init='k-means++', n_init=10).fit(cluster_array)

    @staticmethod
    def _group_by_label(ids, labels):
        """Return ``{label: [ids...]}`` with labels ascending and ids in input order."""
        groups = dict()
        # sorted() is stable, so ids keep their relative order inside a label.
        for label, ident in sorted(zip(labels, ids), key=lambda pair: pair[0]):
            groups.setdefault(int(label), []).append(ident)
        return groups

    @staticmethod
    def _write_clusters(path, groups, attribute):
        """Write ``cluster<TAB>id<TAB>attribute`` lines; missing attributes become 0."""
        with open(path, 'w') as out:
            for cluster, members in groups.items():
                for member in members:
                    out.write(str(cluster) + "\t" + str(member) + "\t" + str(attribute.get(member, 0)) + "\n")

    def get_k_means_centroid(self, cluster_vec):
        """Fit k-means and return ``(centroids, labelled)``.

        ``labelled`` is a list of ``(label, vector)`` pairs sorted by label.
        """
        kmeans = self._fit_k_means(cluster_vec)
        labelled = sorted(zip(kmeans.labels_, cluster_vec), key=lambda pair: pair[0])
        return kmeans.cluster_centers_, labelled

    def get_k_means_cluster_distances(self, user_outfile=None, item_outfile=None):
        """Cluster users and items, print centroid distances, save memberships.

        Each id is paired with its k-means label directly (the label array is
        aligned with the order in which the vectors were passed in).  The
        earlier implementation recovered the id by comparing every vector
        with every other vector, which was quadratic in the number of nodes
        and wrote duplicate rows whenever two ids shared a vector.

        Args:
            user_outfile: destination for ``cluster, user id, sessions`` rows;
                defaults to the original hard-coded location.
            item_outfile: destination for ``cluster, item id, price`` rows;
                defaults to the original hard-coded location.
        """
        if user_outfile is None:
            user_outfile = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Files/cluster_userid_sessions.txt'
        if item_outfile is None:
            item_outfile = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Files/cluster_item_prices.txt'

        user_ids = list(self.user_vec)
        user_model = self._fit_k_means([self.user_vec[uid] for uid in user_ids])
        users_by_cluster = self._group_by_label(user_ids, user_model.labels_)

        item_ids = list(self.item_vec)
        item_model = self._fit_k_means([self.item_vec[iid] for iid in item_ids])
        items_by_cluster = self._group_by_label(item_ids, item_model.labels_)

        print("\n k-means Cluster Distance Results\n")
        for u, user_centroid in enumerate(user_model.cluster_centers_):
            if u:
                print("\n")    # blank separator between user clusters
            for i, item_centroid in enumerate(item_model.cluster_centers_):
                distance = np.linalg.norm(user_centroid - item_centroid)
                # Clusters are reported 1-based in this report.
                print("Distance between User Cluster {} and Item Cluster {}:".format(u + 1, i + 1),
                      round(distance, 3))

        self._write_clusters(user_outfile, users_by_cluster, self.user_gamingsession)
        self._write_clusters(item_outfile, items_by_cluster, self.item_price)
        
# Locations of the tab-separated input files and of the latent-vector file.
dirpath = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Files'
outputfilename = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Latent Vector Representation/Gaming_output_metapath_UGIGU_w50_l20.txt'


def main():
    """Run the whole analysis: load inputs, build groups, print both reports."""
    analysis = ClusterAnalysis()
    analysis.read_data(dirpath)

    # Threshold-based groups are needed by the first distance report.
    analysis.create_item_clusters()
    analysis.create_user_clusters()
    analysis.read_latent_output(outputfilename)

    analysis.get_cluster_distances()
    analysis.get_k_means_cluster_distances()
    print("\nSuccess")


if __name__ == "__main__":
    main()
            

#items 292
#users 198144
#items with low price (<=$1) 24
#items with mid price (between $1 and $12) 193
#items with high price (>$12) 75
#users with low engagement (less than 5 gaming sessions) 42405
#users with medium engagement (between 5 and 100 gaming sessions) 136449
#users with high engagement (more than 100 gaming sessions) 19290

 Results based on Engagement level of users and Item Prices

Distance between Least Engaged Users and Low Price Items: 3.97
Distance between Least Engaged Users and Medium Price Items: 4.001
Distance between Least Engaged Users and High Price Items: 4.035


Distance between Medium Engaged Users and Low Price Items: 3.961
Distance between Medium Engaged Users and Medium Price Items: 3.979
Distance between Medium Engaged Users and High Price Items: 4.007


Distance between Highly Engaged Users and Low Price Items: 4.05
Distance between Highly Engaged Users and Medium Price Items: 4.037
Distance between Highly Engaged Users and High Price Items: 4.043

 k

In [4]:
# Scratch check of zip(): pair the two tuples element by element.
a = "John", "Charles", "Mike"
b = "Jenny", "Christy", "Monica"

x = zip(a, b)
# Unpacking consumes the iterator, exactly as list(x) would.
print([*x])

[('John', 'Jenny'), ('Charles', 'Christy'), ('Mike', 'Monica')]


In [3]:
import numpy as np
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pylab as plt

class ClusterAnalysis:
    """Relate user-engagement groups and item-price groups via latent vectors.

    Intended call order: ``read_data`` -> ``create_item_clusters`` /
    ``create_user_clusters`` -> ``read_latent_output`` -> one of the
    ``get_*_distances`` reports.  Every report is printed to stdout.  This
    variant uses four k-means clusters by default.
    """

    def __init__(self):
        # id <-> node-name lookups (filled by read_data).
        self.id_user = dict()
        self.user_id = dict()
        self.id_item = dict()
        self.item_id = dict()
        # Raw attributes used for the threshold-based grouping.
        self.item_price = dict()
        self.user_gamingsession = dict()
        # Threshold-based groups (filled by create_*_clusters).
        self.item_low_price = dict()
        self.item_medium_price = dict()
        self.item_high_price = dict()
        self.user_low_engagement = dict()
        self.user_medium_engagement = dict()
        self.user_high_engagement = dict()

    @staticmethod
    def _read_pairs(path):
        """Return cleaned ``(first, second)`` fields of every two-column line.

        Lines that do not split into exactly two tab-separated fields are
        skipped.  Spaces are removed from BOTH fields so the same id or node
        name is spelled identically in every dictionary; previously only the
        values were cleaned, so a row such as ``"7 \\t u7"`` stored the key
        ``" u7"`` and later lookups by ``"u7"`` raised ``KeyError``.
        """
        pairs = []
        with open(path) as handle:
            for line in handle:
                toks = line.strip().split("\t")
                if len(toks) == 2:
                    pairs.append((toks[0].replace(" ", ""), toks[1].replace(" ", "")))
        return pairs

    def read_data(self, dirpath):
        """Load prices, session counts and id/name mappings from *dirpath*.

        Reads ``item_price.txt``, ``user_gaming_sessions.txt``,
        ``id_user.txt`` and ``id_item.txt`` (tab-separated, two columns).

        Raises:
            OSError: if one of the files cannot be opened.
            ValueError: if a price or session count is not an integer.
        """
        for item, price in self._read_pairs(dirpath + "/item_price.txt"):
            self.item_price[item] = int(price)    # {itemid1: price1, itemid2: price2....}
        print("#items", len(self.item_price))

        for user, sessions in self._read_pairs(dirpath + "/user_gaming_sessions.txt"):
            self.user_gamingsession[user] = int(sessions)    # {uid1: CountofGames1....}
        print("#users", len(self.user_gamingsession))

        for uid, name in self._read_pairs(dirpath + "/id_user.txt"):
            self.id_user[uid] = name    # {uid1: User1, uid2: User2....}
            self.user_id[name] = uid    # {User1: uid1, User2: uid2....}

        for iid, name in self._read_pairs(dirpath + "/id_item.txt"):
            self.id_item[iid] = name    # {itemid1: Item1, itemid2: Item2....}
            self.item_id[name] = iid    # {Item1: itemid1, Item2: itemid2....}

    def create_item_clusters(self):
        """Split items into low (<= 50), medium (51..599) and high (>= 600) price groups."""
        for item, price in self.item_price.items():
            if price <= 50:
                self.item_low_price[item] = price
            elif price < 600:
                self.item_medium_price[item] = price
            else:
                self.item_high_price[item] = price
        # NOTE(review): the labels mention $1 / $12 while the thresholds are
        # 50 / 600 -- the price unit is not visible here; confirm before
        # relying on the wording.
        print("#items with low price (<=$1)", len(self.item_low_price))
        print("#items with mid price (between $1 and $12)", len(self.item_medium_price))
        print("#items with high price (>$12)", len(self.item_high_price))

    def create_user_clusters(self):
        """Split users into low (<= 5), medium (6..100) and high (> 100) session groups."""
        for user, sessions in self.user_gamingsession.items():
            if sessions <= 5:
                self.user_low_engagement[user] = sessions
            elif sessions <= 100:
                self.user_medium_engagement[user] = sessions
            else:
                self.user_high_engagement[user] = sessions
        # NOTE(review): the first label says "less than 5" but the test is <= 5.
        print("#users with low engagement (less than 5 gaming sessions)", len(self.user_low_engagement))
        print("#users with medium engagement (between 5 and 100 gaming sessions)", len(self.user_medium_engagement))
        print("#users with high engagement (more than 100 gaming sessions)", len(self.user_high_engagement))

    def read_latent_output(self, outputfilename, dim=128):
        """Load node vectors from a space-separated embedding file.

        Only lines with exactly ``dim + 1`` tokens (node name followed by
        ``dim`` values) are used; anything else is ignored.  Nodes whose name
        starts with ``u`` go to ``user_vec`` and those starting with ``i`` to
        ``item_vec``, keyed by the id found through ``user_id`` / ``item_id``.
        Values are kept as strings and converted when centroids are computed.

        Args:
            outputfilename: path of the embedding file.
            dim: number of vector components per node (default 128).

        Raises:
            KeyError: if a ``u``/``i`` node name is missing from the mappings
                loaded by ``read_data``.
        """
        self.user_vec = dict()
        self.game_vec = dict()
        self.item_vec = dict()
        with open(outputfilename) as nvfile:
            for line in nvfile:
                toks = line.strip().split(" ")
                if len(toks) != dim + 1:
                    continue
                node, values = toks[0], toks[1:]
                if node.startswith('u'):
                    self.user_vec[self.user_id[node]] = values    # {uid1: [user_vector_values], ...}
                elif node.startswith('i'):
                    self.item_vec[self.item_id[node]] = values    # {itemid1: [item_vector_values], ...}
        print("#Items having vectors:", len(self.item_vec))

    def get_centroid_measure(self, cluster_vec):
        """Return the component-wise mean of the vectors in *cluster_vec*."""
        cluster_array = np.array(cluster_vec, dtype=float)
        return np.mean(cluster_array, axis=0, dtype=float)

    def _bucket_centroid(self, members, vectors):
        """Centroid of the vectors of those *members* that have a vector."""
        present = [vectors[key] for key in members if vectors.get(key)]
        return self.get_centroid_measure(present)

    def get_cluster_distances(self):
        """Print Euclidean distances between every user-group and item-group centroid."""
        user_groups = (
            ("Least Engaged", self.user_low_engagement),
            ("Medium Engaged", self.user_medium_engagement),
            ("Highly Engaged", self.user_high_engagement),
        )
        item_groups = (
            ("Low Price", self.item_low_price),
            ("Medium Price", self.item_medium_price),
            ("High Price", self.item_high_price),
        )
        user_centroids = [(label, self._bucket_centroid(group, self.user_vec))
                          for label, group in user_groups]
        item_centroids = [(label, self._bucket_centroid(group, self.item_vec))
                          for label, group in item_groups]

        print("\n Results based on Engagement level of users and Item Prices\n")
        for index, (user_label, user_centroid) in enumerate(user_centroids):
            if index:
                print("\n")    # blank separator between user groups
            for item_label, item_centroid in item_centroids:
                distance = np.linalg.norm(user_centroid - item_centroid)
                print("Distance between {} Users and {} Items:".format(user_label, item_label),
                      round(distance, 3))

    def get_k_means_centroid(self, cluster_vec, n_clusters=4):
        """Fit k-means on *cluster_vec* and return the cluster centres.

        ``n_jobs`` is no longer passed: scikit-learn removed that parameter
        in 1.0 and rejects it with ``TypeError``.  ``n_init=10`` pins the
        number of restarts to the library's historical default.
        """
        cluster_array = np.array(cluster_vec, dtype=float)
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10).fit(cluster_array)
        return kmeans.cluster_centers_

    def get_k_means_cluster_distances(self, n_clusters=4):
        """Cluster users and items separately and print all centroid distances.

        Args:
            n_clusters: number of k-means clusters for each side (default 4).
        """
        centroid_users = self.get_k_means_centroid(list(self.user_vec.values()), n_clusters)
        centroid_items = self.get_k_means_centroid(list(self.item_vec.values()), n_clusters)
        print("\n k-means Cluster Distance Results\n")
        for u, user_centroid in enumerate(centroid_users):
            if u:
                print("\n")    # blank separator between user clusters
            for i, item_centroid in enumerate(centroid_items):
                distance = np.linalg.norm(user_centroid - item_centroid)
                print("Distance between User Cluster {} and Item Cluster {}:".format(u, i),
                      round(distance, 3))

        
# Locations of the tab-separated input files and of the latent-vector file.
dirpath = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Files'
outputfilename = 'C:/Vidit/PhD/RA Work/KZ - RA/Gaming Project/data/Latent Vector Representation/Gaming_output_metapath_UGIGU_w50_l20.txt'


def main():
    """Run the whole analysis: load inputs, build groups, print both reports."""
    analysis = ClusterAnalysis()
    analysis.read_data(dirpath)

    # Threshold-based groups are needed by the first distance report.
    analysis.create_item_clusters()
    analysis.create_user_clusters()
    analysis.read_latent_output(outputfilename)

    analysis.get_cluster_distances()
    analysis.get_k_means_cluster_distances()
    print("\nSuccess")


if __name__ == "__main__":
    main()
            

#items 292
#users 198144
#items with low price (<=$1) 24
#items with mid price (between $1 and $12) 193
#items with high price (>$12) 75
#users with low engagement (less than 5 gaming sessions) 42405
#users with medium engagement (between 5 and 100 gaming sessions) 136449
#users with high engagement (more than 100 gaming sessions) 19290
#Items having vectors: 1450

 Results based on Engagement level of users and Item Prices

Distance between Least Engaged Users and Low Price Items: 3.97
Distance between Least Engaged Users and Medium Price Items: 4.001
Distance between Least Engaged Users and High Price Items: 4.035


Distance between Medium Engaged Users and Low Price Items: 3.961
Distance between Medium Engaged Users and Medium Price Items: 3.979
Distance between Medium Engaged Users and High Price Items: 4.007


Distance between Highly Engaged Users and Low Price Items: 4.05
Distance between Highly Engaged Users and Medium Price Items: 4.037
Distance between Highly Engaged Users and