# Clustering using TFIDF

In [1]:
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import ward, dendrogram, single, complete
import pandas as pd

### Load Data and their titles for the use cases 3 & 4

In [2]:
# with open('C:/Users/anast/Desktop/Thesis/MachineLearning/datasetTitles.txt') as t:
#     titles = t.read().splitlines()
    
# # Use Case 3
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Ontology/DatasetOntology/all.txt") as f:
#     data = f.read().splitlines()

# # Use Case 4
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Data/datasetProjects.txt") as f:
#     data = f.read().splitlines()  

### Load Data and their titles for the use cases 7 & 8

In [3]:
# Project titles: one title per line of the titles file.
with open('C:/Users/anast/Desktop/Thesis/MachineLearning/Th-Ur-Titles.txt') as titles_file:
    titles_text = titles_file.read()
titles = titles_text.splitlines()

# # Use Case 7
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Ontology/DatasetOntology/Th-Ur-all.txt") as f:
#     data = f.read().splitlines()

# Use Case 8: one document per line of the projects file.
with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Data/Th-Ur-Projects.txt") as projects_file:
    data_text = projects_file.read()
data = data_text.splitlines()

### Number of clusters

In [4]:
# Number of clusters requested from the hierarchical (agglomerative) clustering model.
n_clusters = 10

### Preprocessing of data
- Exclude the words that describe common functionality, according to use cases 3, 4, 7 and 8
- Remove numbers, punctuation and stop words
- Lemmatize the words

In [5]:
# spaCy pipeline used below to tokenise, tag and lemmatise every document.
nlp = spacy.load('en_core_web_lg')

# Terms to drop: every distinct value found on either side of the rules whose
# Support exceeds 0.2 (first-occurrence order is kept), followed by a few
# hand-picked generic words.
rules = pd.read_csv('C:/Users/anast/Desktop/Results/results-all1.csv')
rules = rules.loc[rules['Support'] > 0.2, ['Left Hand Side', 'Right Hand Side']]
rule_terms = rules['Left Hand Side'].tolist() + rules['Right Hand Side'].tolist()
exclude = list(dict.fromkeys(rule_terms))
exclude += ['datum', 'administrator', 'log', 'know', 'able', 'ability']

# Turn each input line into one cleaned document: keep the lemma of every token
# that is alphabetic, is not a stop word, is not tagged as punctuation, and
# whose surface form and lemma are both absent from `exclude`.
all_docs = []
for line in data:
    doc = nlp(line)
    cleanData = [
        token.lemma_
        for token in doc
        if token.is_alpha
        and not token.is_stop
        and token.pos_ != "PUNCT"
        and token.text not in exclude
        and token.lemma_ not in exclude
    ]
    all_docs.append(" ".join(cleanData))

### Load the testing project (Stereo)

In [6]:
# Test data: the lines of the testing project's text file.
with open("C:/Users/anast/Desktop/testDataLDA.txt") as test_file:
    test_text = test_file.read()
testdata = test_text.splitlines()

### Apply the same preprocessing steps as the training data

In [7]:
# Clean the testing project with the same rules as the training data: keep the
# lemma of every token that is alphabetic, is not a stop word, is not tagged as
# punctuation, and whose surface form and lemma are both absent from `exclude`.
#
# The testing project is a single document, so the lemmas of *every* line of
# the test file are gathered into one list. (Resetting the list for each line
# would keep only the last line of the file, and an empty file would silently
# reuse whatever `cleanData` was left over from the training loop.)
clean_corpus_test = []
for line in testdata:
    doc = nlp(line)
    for token in doc:
        if not token.is_alpha:
            continue
        if token.is_stop or token.pos_ == "PUNCT":
            continue
        if token.text in exclude or token.lemma_ in exclude:
            continue
        clean_corpus_test.append(token.lemma_)

cleanData = " ".join(clean_corpus_test)

# NOTE: the two appends below mutate `all_docs` and `titles` in place, so
# running this cell twice adds the testing project twice. Re-run the notebook
# from the data-loading cell before executing this cell again.
all_docs.append(cleanData)
titles.append('Stereo')
cleanData

'email password song playlist personal info profile playlist follow keyword'

### Use the TF-IDF algorithm to vectorize the data

In [8]:
# Tf-Idf Vectorizer
# max_df / min_df are floats, i.e. document-frequency proportions: terms that
# appear in more than 80% or in fewer than 14% of the documents are ignored.
# norm=None leaves the rows un-normalised; cosine_similarity below normalises
# the vectors itself.
vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.14,  stop_words='english',
                             use_idf=True, norm=None)
tfidf_matrix = vectorizer.fit_transform(all_docs)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(). Use the new method when it exists and fall
# back to the old one otherwise; .tolist() keeps `feature_names` a plain list
# of strings on every version.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

dense = tfidf_matrix.todense()
denselist = dense.tolist()
print(feature_names)
# print(denselist)
print(tfidf_matrix.shape)

# Similarity: pairwise cosine distance (1 - cosine similarity) between documents.
dist = 1 - cosine_similarity(tfidf_matrix)
# print(dist)

['comment', 'list', 'manage', 'message', 'notify', 'order', 'profile', 'rate', 'register', 'send', 'upload']
(101, 11)


### Train a hierarchical clustering model

In [9]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with Ward linkage into `n_clusters` groups.
# The `affinity` keyword is intentionally not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed to `metric`). Ward linkage only
# supports Euclidean distance, which is the default on every version, so
# omitting the argument gives the same model everywhere.
# NOTE(review): `dist` is a pairwise cosine-distance matrix, but it is passed
# here as an ordinary feature matrix, so each row of distances is treated as a
# feature vector and rows are compared with Euclidean distance. Confirm this is
# the intended setup.
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
y_hc = hc.fit_predict(dist)

### Organize the results in a data frame

In [10]:
# Pair every project title with the cluster label assigned to it.
# Both inputs must have exactly one entry per clustered document: pd.concat
# with axis=1 aligns on the index and would otherwise pad the shorter column
# with NaN without raising, and the "last row" lookup below would then no
# longer describe the testing project.
assert len(titles) == len(y_hc), (
    f"{len(titles)} titles but {len(y_hc)} cluster labels; "
    "re-run the notebook from the data-loading cell"
)
titlesDF = pd.DataFrame(titles, columns=['Project'])
clusterDF = pd.DataFrame(y_hc, columns=['Cluster'])
results = pd.concat([titlesDF, clusterDF], axis=1)

# Find which projects belong to the cluster of the last one (testing project).
test_cluster = results['Cluster'].iloc[-1]
results[results['Cluster'] == test_cluster]

Unnamed: 0,Project,Cluster
1,BuySafe,3
14,ProjectPlay,3
41,Project mob (team 20),3
50,eSoula,3
82,SE 2021 Project - 7,3
88,SE 2021 Project - 14,3
89,SE 2021 Project - 15,3
91,SE 2021 Project - 17,3
94,Rapix,3
100,Stereo,3


### Cluster of the testing project

In [11]:
# Names of the projects that share a cluster with the last row (the testing project).
results.loc[results['Cluster'] == results['Cluster'].iloc[-1], 'Project']

1                    BuySafe
14               ProjectPlay
41     Project mob (team 20)
50                    eSoula
82       SE 2021 Project - 7
88      SE 2021 Project - 14
89      SE 2021 Project - 15
91      SE 2021 Project - 17
94                     Rapix
100                   Stereo
Name: Project, dtype: object