# LDA models for different topics

In this notebook, LDA models are trained on the input that each use case specifies, using the numbers of topics of the best-performing models from lda_select_model.ipynb (ranked by topic coherence score). The testing project is then passed through each trained LDA model and assigned to topics, so that we can find its dominant one. Following that, for each model we list the other documents that share the same dominant topic. In the best-performing LDA models, the testing project should be grouped with projects of the same functionality (Mob, Spotify), and that group should not contain many other projects.

In [1]:
import re, numpy as np, pandas as pd
from pprint import pprint
# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import codecs
import os

# Silence DeprecationWarning messages and only emit log records of level ERROR or above,
# formatted as "<time> : <level> : <message>".
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

### Load Data and their titles for the use cases 1 - 4

In [2]:
# Project titles, one per line. They are looked up by document position
# when the documents sharing the test project's topic are listed.
titles_path = 'C:/Users/anast/Desktop/Thesis/MachineLearning/datasetTitles.txt'
with open(titles_path) as titles_file:
    titles = titles_file.read().splitlines()

# Use cases 1 & 3: one document per line.
data_path = "C:/Users/anast/Desktop/Thesis/MachineLearning/Ontology/DatasetOntology/all.txt"
with open(data_path) as data_file:
    data = data_file.read().splitlines()

# Use cases 2 & 4 (alternative input; swap with the block above to use it)
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Data/datasetProjects.txt") as f:
#     data = f.read().splitlines()

### Load Data and their titles for the use cases 5 - 8

In [3]:
# with open('C:/Users/anast/Desktop/Thesis/MachineLearning/Th-Ur-Titles.txt') as t:
#     titles = t.read().splitlines()

# # Use Cases 5 & 7
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Ontology/DatasetOntology/Th-Ur-all.txt") as f:
#     data = f.read().splitlines()

# # Use Cases 6 & 8
# with open("C:/Users/anast/Desktop/Thesis/MachineLearning/Data/Th-Ur-Projects.txt") as f:
#     data = f.read().splitlines()

### Preprocessing of data
- Exclude the words of common functionality, as required by use cases 3, 4, 7 and 8
- Clean from numbers, punctuation and stop words
- Lemmatize the words

In [4]:
outputPath = "C:/Users/anast/Desktop/Thesis/LDA/ResultsAll/"

nlp = spacy.load('en_core_web_lg')

# Build the list of words that are removed from every document:
#   1. both sides of every rule in the results CSV whose 'Support' is above 0.2
#      (de-duplicated, first occurrence kept), then
#   2. a handful of hand-picked words.
# The two commented-out `exclude = []` lines are manual switches: enabling the
# first one leaves only 'able' / 'ability' excluded, enabling the second one
# disables the exclusion altogether.
exclude = []
rules = pd.read_csv('C:/Users/anast/Desktop/Results/results-all1.csv')
rules = rules[(rules['Support']>0.2)][['Left Hand Side', 'Right Hand Side']]
exclude.extend(rules['Left Hand Side'].tolist())
exclude.extend(rules['Right Hand Side'].tolist())
exclude = list(dict.fromkeys(exclude))
exclude.extend(['datum', 'administrator', 'log', 'know', 'able', 'ability'])
# exclude = []
exclude.extend(['able', 'ability'])
# exclude = []

# Set copy of `exclude` for constant-time membership tests; the list itself is
# left untouched because later cells read it.
exclude_lookup = set(exclude)


def _keep_token(token, excluded):
    """Return True when a spaCy token should stay in the cleaned document.

    A token is kept only if it is purely alphabetic, is not a stop word, is not
    tagged as punctuation, and neither its surface text nor its lemma is in
    ``excluded``.
    """
    return (
        token.is_alpha
        and not token.is_stop
        and token.pos_ != "PUNCT"
        and token.text not in excluded
        and token.lemma_ not in excluded
    )


# Clean the data from numbers, punctuation and stop words; every document
# becomes the list of lemmas of its surviving tokens.
clean_corpus = [
    [token.lemma_ for token in nlp(line) if _keep_token(token, exclude_lookup)]
    for line in data
]

### Build the bigram and trigram models

In [5]:
# Fit the phrase detectors on the cleaned corpus: bigrams first, then trigrams
# on top of the bigram-transformed corpus (a higher threshold yields fewer phrases).
bigram = gensim.models.Phrases(clean_corpus, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[clean_corpus], threshold=100)

# Frozen versions of the detectors, used to transform documents.
make_phraser = gensim.models.phrases.Phraser
bigram_mod = make_phraser(bigram)
trigram_mod = make_phraser(trigram)

# Same transformation chain as before, written as a single pass per document:
# bigram model, bigram model again, then trigram model.
# NOTE(review): the second bigram pass is presumably a no-op - confirm before removing it.
texts = [trigram_mod[bigram_mod[bigram_mod[doc]]] for doc in clean_corpus]

data_ready = texts

In [6]:
# Placeholder: `format_topics_sentences` in the next cell uses `corpus` as a
# default argument value, so the name has to exist when that function is defined.
# The real bag-of-words corpus is built in the training cell further down.
corpus = []

In [7]:
# Functions
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document in ``corpus``.

    Parameters
    ----------
    ldamodel :
        Trained LDA model. It is indexed with ``corpus`` and must provide
        ``per_word_topics`` and ``show_topic``.
    corpus :
        Documents in the form accepted by ``ldamodel[...]``. The default is the
        module-level ``corpus`` as it was when this function was defined, so
        callers are expected to pass it explicitly.
    texts :
        One entry per document; appended as the last column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution`` (rounded to 4
        decimals), ``Topic_Keywords`` (comma-separated words of the dominant
        topic) and ``0`` (the entries of ``texts``), one row per document.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string, so show_topic runs once per topic
    records = []

    for row_list in ldamodel[corpus]:
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            # Keep one row per document so the rows stay aligned with `texts`.
            records.append((float('nan'), float('nan'), ""))
            continue

        # Dominant topic = highest contribution; on ties the first one listed wins,
        # which matches a stable descending sort followed by taking element 0.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        # The topic id is stored as a float so exported CSVs keep the "0.0" style
        # of earlier runs.
        records.append((float(topic_num), round(float(prop_topic), 4), keywords_by_topic[topic_num]))

    # Build the frame in one go: DataFrame.append no longer exists in pandas >= 2.0
    # and growing a frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

### Create LDA models for different number of topics

In [8]:
# Numbers of topics for which an LDA model is trained, in ascending order.
topics_iter = sorted([14, 3, 7, 4, 6, 19, 2, 18, 10, 8])

In [9]:
# The dictionary and the bag-of-words corpus depend only on `data_ready`,
# so they are built once instead of once per model.
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency
corpus = [id2word.doc2bow(text) for text in data_ready]

# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

for num_topics in topics_iter:
    # Every model gets its own output folder; exist_ok avoids the separate existence check.
    os.makedirs(f'{outputPath}{num_topics}/', exist_ok=True)

    # Build LDA model (fixed random_state so repeated runs give the same model)
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=10,
                                                passes=10,
                                                alpha='symmetric',
                                                iterations=100,
                                                per_word_topics=True)
    lda_model.save(f"{outputPath}{num_topics}/ldamodel")

    # Dominant topic of every training document, saved next to the model.
    df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

    df_dominant_topic.to_csv(f"{outputPath}{num_topics}/dominant_topics.csv", index = None)

    # Most representative document per topic: the one with the highest
    # contribution inside each dominant-topic group. The frame is assembled with
    # a single concat instead of growing it inside a loop.
    sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
    sent_topics_sorteddf_mallet = pd.concat(
        [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
         for _, grp in sent_topics_outdf_grpd],
        axis=0,
    ).reset_index(drop=True)
    sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]

    # Number of tokens of every document (kept for inspection after the loop).
    doc_lens = [len(d) for d in df_dominant_topic.Text]

### Load, preprocess and get the bigram/trigram model of the testing project

In [10]:
# Test Data: one document per line.
with open("C:/Users/anast/Desktop/testDataLDA.txt") as f:
    testdata = f.read().splitlines()

# Set copy of the exclusion list for constant-time membership tests.
excluded_words = set(exclude)

# Clean the data from numbers, punctuation and stop words; the same token
# filter as for the training documents is applied and the lemmas are kept.
clean_corpus_test = []
for line in testdata:
    clean_corpus_test.append([
        token.lemma_
        for token in nlp(line)
        if token.is_alpha
        and not token.is_stop
        and token.pos_ != "PUNCT"
        and token.text not in excluded_words
        and token.lemma_ not in excluded_words
    ])

# Phrase models fitted on the test documents.
# NOTE(review): phrases learned here may differ from the ones learned on the
# training corpus, and only tokens present in the trained model's dictionary are
# counted later - consider reusing the training phrase models instead.
bigramTest = gensim.models.Phrases(clean_corpus_test, min_count=5, threshold=100) # higher threshold fewer phrases.
trigramTest = gensim.models.Phrases(bigramTest[clean_corpus_test], threshold=100)
bigram_modTest = gensim.models.phrases.Phraser(bigramTest)
trigram_modTest = gensim.models.phrases.Phraser(trigramTest)

textsTest = [bigram_modTest[doc] for doc in clean_corpus_test]
# Fixed: this step previously applied the training `bigram_mod` in front of the
# test trigram model, mixing phrase models fitted on different corpora.
textsTest = [trigram_modTest[bigram_modTest[doc]] for doc in textsTest]

dataTest = textsTest

### Find which is the dominant topic for the testing project, for every LDA model

In [11]:
for num_topics in topics_iter:
    # Folder written by the training cell for this number of topics.
    model_dir = f"{outputPath}{num_topics}/"

    lda_model_loaded = gensim.models.ldamodel.LdaModel.load(model_dir + "ldamodel")
    id2word_loades = lda_model_loaded.id2word

    # Create Corpus: Term Document Frequency, using the dictionary stored with the model
    test_corpus = list(map(id2word_loades.doc2bow, dataTest))
    topic_vec = lda_model_loaded[test_corpus]

    # Dominant topic of every test document under this model
    df_topic_sents_keywords_test = format_topics_sentences(lda_model_loaded, test_corpus, dataTest)

    # Same column layout as the training export, saved next to the model
    df_dominant_topic = df_topic_sents_keywords_test.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    df_dominant_topic.to_csv(model_dir + "test_topic.csv", index = None)

### Format the results

In [12]:
# Gather the per-model test results into one table. Each frame is tagged with
# its own number of topics before concatenation, so the 'topics' column stays
# correct even when a test file holds more than one document. pd.concat replaces
# DataFrame.append, which no longer exists in pandas >= 2.0.
frames = []
for num_topics in topics_iter:
    dato = pd.read_csv(f"{outputPath}{num_topics}/test_topic.csv")
    frames.append(dato.assign(topics=num_topics))

# The original row labels of every file are kept (no ignore_index), as before.
result = pd.concat(frames)
result.to_csv(f"{outputPath}testResults.csv", index = None)
result

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,topics
0,0,0.0,0.6071,"manager, repository, archivist, training, file, upload, detect, info, alfred, video","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",2
0,0,0.0,0.8854,"product, notify, cancel, list, request, store, profile, personal, rate, training","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",3
0,0,3.0,0.4722,"member, product, staff, profile, store, list, event, choose, review, send","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",4
0,0,1.0,0.47,"file, team, personal, upload, test, exchange, coach, athlete, cancel, allow","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",6
0,0,1.0,0.4677,"file, team, personal, archivist, test, exchange, upload, athlete, coach, allow","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",7
0,0,3.0,0.3653,"product, staff, notify, cancel, event, send, report, review, request, store","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",8
0,0,2.0,0.3644,"manager, post, set, book, comment, profile, report, list, rate, country","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",10
0,0,8.0,0.2792,"cancel, detect, info, alfred, book, olderperson, message, salon, device, smart","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",14
0,0,15.0,0.8111,"notify, set, profile, list, team, personal, page, manage, rate, report","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",18
0,0,15.0,0.8105,"notify, list, set, profile, team, personal, page, rate, report, manage","['email', 'password', 'song', 'playlist', 'personal', 'info', 'profile', 'playlist', 'follow', '...",19


### Find which other documents have the same dominant topic as the testing project, for every LDA model 

In [13]:
for num_topics in topics_iter:
    dominant_topics = pd.read_csv(f"{outputPath}{num_topics}/dominant_topics.csv", header = 0)
    print("---------- Topics: " + str(num_topics) + " ----------")

    # Look the test project's topic up by model ('topics') and by column name
    # instead of by row/column position; the first test document is used.
    topic = result.loc[result['topics'] == num_topics, 'Dominant_Topic'].iloc[0]
    print("Test Data Topic: " + str(int(topic)))

    # List every training document whose dominant topic matches; `j` is the
    # document's position, which is also its position in `titles`.
    for j, doc_topic in enumerate(dominant_topics['Dominant_Topic']):
        if doc_topic == topic:
            print(str(j) + ": " + titles[j])
    print("\n")

---------- Topics: 2 ----------
Test Data Topic: 0
2: Car Gossip
5: GiftCase
6: HelpMe
7: HotSpotManagement
10: ProjectLibrary
11: ProjectMedical
20: ShrimpShip
24: Software Patterns
25: Taraxacum
28: WikiPres
41: g16-mis
43: g18-neurohub
44: g19-alfred
45: g21-badcamp
46: g22-rdadmp
47: g23-archivesspace
48: g24-unibath
49: g25-duraspace
50: g26-racdam
52: g28-zooniverse
54: RSE2020-THMMY
56: RSE2020-Youtube
57: RSE2020-Youtube2
58: RSE2020-Shazam
61: RSE2020-Twitter
62: RSE2020-Dropbox
63: DiabetesProject
64: Project mob (team 20)
69: Healthicle
71: Shotgun!
73: eSoula
80: Google Translate
83: Stixoiman
84: YouTube
85: Pinterest
87: Tsagidi project (SPOTIFY APP)
90: Tsagidi Project NodeRed
91: Tsolakis Project Nodered
92: Mikrouli project Nodered
93: Gionavis project Nodered
94: Patsika project Nodered
95: Nastos project NodeRed
96: Kalpakidis project Nodered
97: Alexiou project Nodered
100: SE 2021 Project - 2
101: LSP Management
115: Book Warehouse
117: Rapix
122: Safe Swim


-----