<a href="https://colab.research.google.com/github/vagabondboffin/topicModeling4UserStories/blob/main/clusteringUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [1]:
from gensim.models import KeyedVectors
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import gensim.downloader as api
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import collections
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
import pandas as pd # Dataframe for data manipulation
pd.set_option('display.colheader_justify', 'center')
pd.options.display.max_colwidth = 100

import nltk # Natural langage toolkit
nltk.download('punkt', quiet=True)

import numpy as np # Matrix manipulation
import subprocess # to launch bash commands
import re # For Regexs
import gensim.downloader as api # Topic modeling library. Provide pre-trained models (e.g. Word2Vec)
import json # Json manipulation

# Functions

In [2]:

def document_vector(word2vec_model, doc):
    """
    Compute the mean word vector of a tokenized feature.

    Tokens that are missing from the model's vocabulary are ignored. When no
    token is known to the model, a zero vector with the model's dimensionality
    is returned instead of failing on an empty lookup.

    :param word2vec_model: object exposing ``key_to_index``, ``vector_size``
        and indexing by a list of words (e.g. gensim ``KeyedVectors``)
    :param doc: iterable of tokens
    :return: 1-D numpy array holding the averaged vector
    """
    # Drop the words that are not in the embedding model's vocabulary.
    known_words = [word for word in doc if word in word2vec_model.key_to_index]
    if not known_words:
        # Nothing to average: return a neutral vector so callers still get
        # one fixed-size row per feature.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return np.mean(word2vec_model[known_words], axis=0)


def getOptimalClusterNumber(features, feature_vectors):
    """
    Return the cluster count with the lowest K-Means inertia.

    Every k from 1 to ``len(features) - 1`` is fitted once and the k whose
    fit has the smallest ``inertia_`` is kept (the smallest such k on ties).

    NOTE(review): inertia generally shrinks as k grows, so this tends to pick
    the largest k tried -- confirm whether an elbow-style criterion was
    intended. No ``random_state`` is passed to KMeans, so results may vary
    between runs.

    :param features: the features being clustered; only its length is used
    :param feature_vectors: one vector per feature, passed to ``KMeans.fit``
    :return: the selected number of clusters, or 0 when ``features`` has
        fewer than two elements (no k is tried)
    """
    best_k = 0
    # An infinite sentinel guarantees the first fit is always accepted,
    # whatever the magnitude of the inertia values.
    best_inertia = float("inf")
    for k in range(1, len(features)):
        km = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1, verbose=0)
        km.fit(feature_vectors)
        # Strict comparison: an equal inertia at a larger k does not replace
        # the earlier, smaller k.
        if km.inertia_ < best_inertia:
            best_k = k
            best_inertia = km.inertia_
    return best_k

def clustering(feature_vectors, features, n_clusters=5):
    """
    Fit K-Means on the feature vectors and group the features by label.

    Prints a "Cluster  <i>" line for each cluster index.

    :param feature_vectors: one vector per feature, passed to ``KMeans.fit``
    :param features: tokenized features, aligned with ``feature_vectors``
    :param n_clusters: number of clusters to fit
    :return: a list with one entry per cluster; each entry is the list of
        features assigned to it, with tokens joined by single spaces
    """
    model = KMeans(n_clusters)
    model.fit(feature_vectors)
    assignments = model.labels_

    grouped = []
    for cluster_id in range(n_clusters):
        print("Cluster ", cluster_id)
        # Collect, in input order, the features labelled with this cluster.
        grouped.append([
            " ".join(tokens)
            for position, tokens in enumerate(features)
            if assignments[position] == cluster_id
        ])
    return grouped

def generate_cluster_names(clusters, num_keywords=3):
    """
    Name each cluster after the most frequent keywords found in it.

    :param clusters: a list of clusters, where each cluster is a list of documents (strings)
    :param num_keywords: the number of keywords to include in each cluster name
    :return: a list of ``(cluster_name, cluster)`` tuples, one per input cluster,
        where ``cluster_name`` is the space-joined top keywords
    """
    if not clusters:
        return []

    # Load the stop-word list once; a set makes each membership test O(1)
    # instead of re-reading and scanning the list for every token.
    english_stopwords = set(stopwords.words('english'))

    cluster_names = []
    for cluster in clusters:
        # Combine all documents in the cluster into a single string
        cluster_text = ' '.join(cluster)
        # Tokenize the cluster text into words
        tokens = nltk.word_tokenize(cluster_text)
        # Keep lower-cased alphabetic tokens longer than one character that are not stop words
        keywords = []
        for token in tokens:
            lowered = token.lower()
            if len(token) > 1 and token.isalpha() and lowered not in english_stopwords:
                keywords.append(lowered)

        # Count the frequency of each word and keep the most frequent ones
        word_counts = collections.Counter(keywords)
        top_keywords = [keyword for keyword, _ in word_counts.most_common(num_keywords)]
        # The cluster name is the top keywords joined by spaces
        cluster_names.append((' '.join(top_keywords), cluster))
    return cluster_names

def loadModel(modelName='word2vec-google-news-300'):
    """
    Load a model by name through ``gensim.downloader`` (imported as ``api``).

    :param modelName: name passed to ``api.load``
    :return: whatever ``api.load`` returns for that name
    """
    loaded = api.load(modelName)
    return loaded

def getClusterNamed(model, features, n_clusters=5):
    """
    Vectorize, cluster and name a list of tokenized features.

    :param model: embedding model passed to ``document_vector``
    :param features: list of tokenized features
    :param n_clusters: number of clusters passed to ``clustering``
    :return: the result of ``generate_cluster_names`` on the clusters
    """
    # Vectorize each feature.
    vectors = []
    for tokens in features:
        vectors.append(document_vector(model, tokens))

    grouped = clustering(vectors, features, n_clusters)
    return generate_cluster_names(grouped)



In [3]:
def ctxRelPerCtxForms(df_ctxFormX, data):
    """
    Build an "x"-marked incidence table between data rows and context values.

    For every row of ``data`` a list is produced with one cell per row of
    ``df_ctxFormX``: the cell is "x" when the first column of the context row
    is contained (``in``) in the first column of the data row, "" otherwise.

    :param df_ctxFormX: DataFrame whose first column holds the values to look for
    :param data: DataFrame whose first column holds the containers searched
    :return: list of lists of "x" / "" with shape (rows of data, rows of df_ctxFormX)
    """
    # Read the first column by position (.iloc) rather than with row[0]:
    # integer keys on a label-indexed Series are a deprecated positional
    # fallback in pandas. The values are extracted once, not per data row.
    needles = [ctx_row.iloc[0] for _, ctx_row in df_ctxFormX.iterrows()]

    table = []
    for _, data_row in data.iterrows():
        haystack = data_row.iloc[0]
        table.append(["x" if needle in haystack else "" for needle in needles])
    return table

def clusterFeatures(model,features):
    """
    Cluster one group of tokenized features and name the resulting clusters.

    The cluster count comes from ``getOptimalClusterNumber``; it is capped at
    ``int(len(features) / 4) + 2`` and raised to 1 when it comes back as 0.

    :param model: embedding model passed to ``document_vector``
    :param features: list of tokenized features
    :return: the result of ``getClusterNamed`` for the chosen cluster count
    """
    vectors = [document_vector(model, tokens) for tokens in features]
    cluster_count = getOptimalClusterNumber(features, vectors)

    # Cap the cluster count relative to the number of features.
    ceiling = (len(features)/4)+2
    if cluster_count > ceiling:
        cluster_count = int(len(features)/4)+2
    # A count of 0 cannot be clustered; fall back to a single cluster.
    cluster_count = cluster_count or 1
    return getClusterNamed(model, features, cluster_count)


In [4]:
# Union of two lists
def union(lst1, lst2):
    """
    Return the distinct elements found in either list.

    The result is built from a set, so duplicates are removed and the order
    of the returned list is not defined.
    """
    return list(set(lst1) | set(lst2))

# role uniformisation
def uniformRoles(roles):
    """
    Expand the abbreviation 'admin' to 'administrator' in every role.

    Occurrences that are already spelled 'administrator' are left untouched,
    so applying the function twice gives the same result as applying it once.

    :param roles: iterable of role strings
    :return: a new list of role strings
    """
    # The negative lookahead skips 'admin' when it is already the start of
    # 'administrator'; a plain str.replace would yield 'administratoristrator'.
    return [re.sub(r'admin(?!istrator)', 'administrator', role) for role in roles]

# Cleaning role
def cleanRole(sentence):
    """
    Normalize a role phrase such as "As a test manager" into a compact key.

    Steps, in order:
      1. lower-case the text;
      2. map several role phrases ("test manager", "test engineer",
         "team member", "deactivated user", "test developer") to "tester";
      3. remove the filler words "as", "a" and "an" when they stand alone as
         a word followed by a space;
      4. remove "( " and ") " (only when followed by a space), apostrophes
         and commas;
      5. remove all remaining spaces.

    :param sentence: raw role text
    :return: the normalized role string, without spaces
    """
    sentence = sentence.lower()

    for alias in ("test manager", "test engineer", "team member",
                  "deactivated user", "test developer"):
        sentence = sentence.replace(alias, "tester")

    # Match the filler words on a word boundary: a plain substring replace of
    # "a " / "as " / "an " also eats the end of words such as "data " or "qa ".
    sentence = re.sub(r"\b(?:as|an|a) ", "", sentence)

    for fragment in ("( ", ") ", "'", ","):
        sentence = sentence.replace(fragment, "")
    return sentence.replace(" ", "")

# Cleaning Feature
def cleanFeature(sentence):
    """
    Strip noise from a feature description and drop English stop words.

    The text is sentence-tokenized and re-joined with spaces, then a fixed
    sequence of substring substitutions is applied (every occurrence of
    "want", apostrophes and commas are removed; parentheses become spaces;
    double spaces are collapsed once). Stop words are then removed.

    NOTE(review): the first character of the result is dropped. The call site
    in this notebook prepends a placeholder character to the text, which this
    slice appears intended to strip -- confirm before reusing elsewhere.

    :param sentence: raw feature text
    :return: the cleaned text, minus its first character
    """
    text = ' '.join(nltk.sent_tokenize(sentence))

    # Applied strictly in this order.
    substitutions = (
        ("want", ""),
        ("'", ""),
        (",", ""),
        ("(", " "),
        (")", " "),
        ("  ", " "),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    stop_words = set(stopwords.words('english'))
    kept = []
    for word in text.split():
        if word not in stop_words:
            kept.append(word)
    return " ".join(kept)[1:]

# Preprocessing

In [None]:
# Load the user stories, discard the CSV's "index" column and cut each title
# at its ", so I ..." clause. (A stemming pass on US_title was disabled here.)
df_complet = (
    pd.read_csv("oneLevelUS4Article.csv")
    .drop(columns="index")
    .assign(US_title=lambda frame: frame["US_title"].str.replace(r', so I.*', '', regex=True))
)
df_complet.head()

In [None]:
# Save the comma-free user stories as a one-column CSV.
df_US = pd.DataFrame(
    data={"US": df_complet["US_title"].apply(lambda story: story.replace(",", ""))}
)
df_US.to_csv("CSVs/ctxF_" + "userStories" + "_Example.csv")

# The feature is everything after "I want". The "JJJ"[2:] prefix leaves a
# single placeholder character in front of the text handed to cleanFeature,
# which drops the first character of its result.
featureData = [
    cleanFeature(("JJJ" + "".join(us.split("I want")[1:]))[2:])
    for us in df_complet["US_title"]
]
df_features = pd.DataFrame(data={"features": featureData})
df_features

In [None]:
# Embedding model used to vectorize the features, loaded through the
# notebook's loadModel helper. KeyedVectors is already imported in the
# imports cell, so it is not re-imported here.
# Alternatives previously tried:
#   model = KeyedVectors.load("/content/LM.kv").wv
#   model = loadModel('glove-wiki-gigaword-300')
model = loadModel('glove-wiki-gigaword-200')

In [None]:


# Disabled experiment: remove words that occur more than 20 times across all features.
# word_counts = df_features['features'].str.split(expand=True).stack().value_counts()
# words_to_remove = word_counts[word_counts > 20].index.tolist()
# print(words_to_remove)
# df_features['features'] = df_features['features'].apply(lambda row: ' '.join([word for word in row.split() if word not in words_to_remove]))

# NOTE(review): `df_roles` and `df_rolesPerUS` are not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel unless they are created elsewhere -- confirm
# and restore the cell that builds them.
# Build the "x"-marked incidence matrix (one row per entry of df_rolesPerUS, one column per
# entry of df_roles), then label its rows with the features and its columns with the roles.
features2Roles = my_array = np.array(ctxRelPerCtxForms(df_roles, df_rolesPerUS))
df_features2Roles = pd.DataFrame(my_array, columns=df_roles["roles"], index=df_features["features"])
#df_features2Roles


In [None]:
def _shortest_marked_role(row):
    """Return the shortest column label whose cell is 'x', or '' when the row has none."""
    if 'x' not in row.values:
        return ''
    marked = [col for col, val in row.items() if val == 'x']
    return min(marked, key=len)


# Assign each feature to a single role: the shortest role name it is marked with.
df_features2Roles['roleCluster'] = df_features2Roles.apply(_shortest_marked_role, axis=1)

# Group the tokenized features by role, skipping roles without any feature.
featuresRoles = []
for role in df_roles["roles"].tolist():
    role_mask = df_features2Roles['roleCluster'] == role
    role_features = df_features2Roles.loc[role_mask].index.tolist()
    if len(role_features) > 0:
        featuresRoles.append([feature.split(" ") for feature in role_features])

# Cluster and name the features of each role separately.
featuresPerClusters = [clusterFeatures(model, group) for group in featuresRoles]

Cluster  0
Cluster  1
Cluster  2
Cluster  3
Cluster  4
Cluster  5


