# Trying to use GenSim for our project.

Mostly borrowed from https://radimrehurek.com/gensim/auto_examples/core/run_topics_and_transformations.html



In [78]:
# Cell 1

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Mounting Google drive (only need to run this the first time you execute code in this notebook)
from google.colab import drive
drive.mount('/content/drive')



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


* build a list of tokenized strings, one for each document.

In [79]:
# Cell 2

import os
# set project_folder to the path where the documents are located.
from pathlib import Path
project_folder = Path("/content/drive")/"MyDrive"/"562 Project"
articles = project_folder/"Raw Articles"/"no_linebreaks"

# Make a list of document strings (strings that each contain a whole document)
documents = []
for file in articles.glob("*"):
  with open(file,'r') as f:
    documents.append(f.readline())



text should now be a list of 85 documents each as a single string.

In [80]:
# Cell 3

print(f"Length of text is {len(documents)}")

Length of text is 81


This is what the first one looks like:

In [81]:
# Cell 4
print(documents[30])

Received 27 July 2022, accepted 18 August 2022, date of publication 25 August 2022, date of current version 1 September 2022. Digital Object Identifier 10.1109/ACCESS.2022.3201516 Towards a Smart Elevator-Aided Fire Evacuation Scheme in High-Rise Apartment Buildings for Elderly HONGQIANG FANG1, HONGPENG QIU2, PENG LIN2, S. M. LO1, AND J. T. Y. LO3 1 Department of Architecture and Civil Engineering, City University of Hong Kong, Hong Kong, China 2 Department of Fire Safety Engineering, Southwest Jiaotong University, Chengdu 611756, China 3 Department of Civil and Environmental Engineering, Hong Kong Polytechnic University, Hong Kong, China Corresponding author: Hongqiang Fang (hqfang3-c@my.cityu.edu.hk) This work was supported by the General Research Fund of Research Grant Council, Hong Kong Special Administrative Region, China, under Grant CityU 11216920. ABSTRACT Staircase evacuation is the major means of fire evacuation for current high-rise residential buildings. However, its feasib

# Naive approach

* string splits on white-space -> 64,211 tokens.

In [None]:
# Cell 5a

from collections import defaultdict
from gensim import corpora

# remove common words and tokenize
stoplist = set ('for a of the and to in'.split())
texts = [
    [word for word in document.lower().split() if word not in stoplist]
    for document in documents
]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]


# Alternate approach

* Using gensim.utils.tokenize -> 27,616 tokens.

In [None]:
# Cell 5b

from collections import defaultdict
from gensim import corpora, utils

stoplist = set ('for a of the and to in'.split())

texts = []
for document in documents:
  texts.append(list(utils.tokenize(document.replace("- ",""), lowercase=True, deacc=True)))

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Alternate 2, stemmer, then tokenizer


In [None]:
# Cell 5c

from collections import defaultdict
from gensim import corpora, utils
from gensim.parsing.preprocessing import PorterStemmer, remove_stopwords

stemmer = PorterStemmer()


stoplist = set ('for a of the and to in'.split())

#texts = []
#for document in documents:
#  texts.append(list(utils.tokenize(document.replace("- ",""), lowercase=True, deacc=True)))

stemmed_texts = []
for document in documents:
  dhdocument = document.replace("- ","") # un-hyphenating the hyphenated words
  stemmed_text = " ".join([stemmer.stem(token) for token in utils.tokenize(dhdocument)])
  stemmed_texts.append(list(utils.tokenize(stemmed_text, lowercase=True, deacc=True)))

dictionary = corpora.Dictionary(stemmed_texts)
corpus = [dictionary.doc2bow(text) for text in stemmed_texts]

Only print the dictionary if you need to, there are 64,211 or 27,616 entries!
* Naive approach - much duplication, for example, "victoria:" and "victoria;" are treated as two different tokens.<br/>
* Gensim.utils.tokenize produces a better tokenization.<br/>
* de-hyphenating the tokenized  brings it down to 26,424
* Using the Porter Stemmer reduces it down to 19,575


In [None]:
# Cell 6

print(dictionary)

for key in dictionary.keys():
  print(f"{key} = {dictionary[key]}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
14576 = sharifi
14577 = shou
14578 = signorelli
14579 = siliquini
14580 = sinha
14581 = skarabela
14582 = slovenia
14583 = slovic
14584 = smithson
14585 = snell
14586 = soh
14587 = solicit
14588 = solid
14589 = spengler
14590 = spitzer
14591 = sproston
14592 = std
14593 = stearn
14594 = stionnaire_design_and_surveys_sampl
14595 = stocker
14596 = stubber
14597 = subscal
14598 = sunshin
14599 = suspect
14600 = swan
14601 = tabassum
14602 = tabatabaei
14603 = taherdoost
14604 = tanab
14605 = tardiff
14606 = tatham
14607 = tehran
14608 = temzelid
14609 = tenur
14610 = theorem
14611 = thomopoulo
14612 = tobacco
14613 = tost
14614 = trailblaz
14615 = trillion
14616 = tsarouha
14617 = tulsa
14618 = turk
14619 = turkish
14620 = turlei
14621 = tuulio
14622 = tweng
14623 = twoitem
14624 = tyne
14625 = tzortzopoulo
14626 = ugido
14627 = unhealthi
14628 = urbanrur
14629 = usefulli
14630 = vau
14631 = vid
14632 = vlahov
14633 = voglin

In [None]:
# OPTIONAL CELL

# save the dictionary to a pickle file

import pickle
filename = project_folder.joinpath('saved_dictionary.pkl')
with open(filename, 'wb') as f:
  pickle.dump(dictionary, f) # Pickle and save


In [None]:
for key in dictionary.keys():
  data_string = f"{key} : {dictionary[key]}"
  print(data_string)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
21410 : ر
21411 : را
21412 : رات
21413 : راض
21414 : ران
21415 : رة
21416 : رت
21417 : رج
21418 : رد
21419 : رض
21420 : رك
21421 : رو
21422 : ز
21423 : ص
21424 : ض
21425 : ط
21426 : ظ
21427 : ع
21428 : ف
21429 : ك
21430 : ل
21431 : م
21432 : ن
21433 : ه
21434 : و
21435 : وا
21436 : وج
21437 : وط
21438 : ول
21439 : وھ
21440 : ى
21441 : ھ
21442 : _con
21443 : a_con
21444 : abolfazl
21445 : aggregations
21446 : ahp
21447 : aifl
21448 : aift
21449 : ais
21450 : alfonzo
21451 : antecedents
21452 : antucheviciene
21453 : apc
21454 : assilian
21455 : axelson
21456 : b_con
21457 : belongs
21458 : bership
21459 : beynon
21460 : bypasses
21461 : c_con
21462 : cambers
21463 : cartogr
21464 : cased
21465 : chavoshi
21466 : cirris
21467 : coayla
21468 : complexities
21469 : confidences
21470 : connexion
21471 : controller
21472 : crisp
21473 : curbcut
21474 : d_con
21475 : dechter
21476 : decicion
21477 : defuzzification
21478 : deist

In [None]:
# OPTIONAL CELL

# save the dictionary as a text file

filename = project_folder.joinpath('saved_dictionary.txt')
with open(filename, 'w') as f:
  for key in dictionary.keys():
    data_string = f"{key} : {dictionary[key]}\n"
    f.write(data_string)
  f.close()

In [None]:
# OPTIONAL CELL

# save the dictionary as a csv file

filename = project_folder.joinpath('saved_dictionary.csv')
with open(filename, 'w') as f:
  for key in dictionary.keys():
    data_string = f"{key},{dictionary[key]}\n"
    f.write(data_string)
  f.close()

In [None]:
# OPTIONAL CELL

print(corpus[15])
print(texts[15])

[(1, 148), (4, 7), (5, 14), (9, 6), (10, 11), (11, 11), (17, 2), (18, 1), (20, 2), (21, 2), (23, 1), (35, 1), (36, 1), (38, 1), (39, 3), (40, 1), (42, 2), (45, 7), (47, 5), (52, 5), (53, 4), (60, 38), (61, 8), (64, 1), (65, 175), (68, 2), (71, 4), (73, 1), (77, 1), (78, 2), (79, 3), (85, 40), (86, 2), (88, 10), (92, 45), (93, 1), (94, 1), (95, 16), (96, 1), (97, 2), (98, 3), (99, 1), (103, 6), (108, 2), (112, 2), (115, 2), (116, 5), (120, 3), (122, 9), (124, 19), (127, 2), (129, 2), (131, 2), (132, 2), (133, 5), (139, 8), (140, 1), (141, 87), (145, 4), (151, 1), (157, 1), (158, 2), (159, 14), (160, 30), (162, 8), (163, 4), (166, 20), (171, 4), (174, 4), (176, 4), (177, 1), (184, 5), (190, 10), (193, 2), (199, 6), (200, 45), (207, 1), (209, 1), (210, 6), (216, 2), (218, 3), (219, 1), (220, 1), (222, 2), (223, 2), (225, 3), (226, 3), (229, 2), (230, 3), (231, 1), (232, 3), (233, 2), (235, 2), (245, 1), (248, 2), (256, 3), (267, 1), (272, 5), (273, 5), (274, 2), (276, 1), (289, 1), (290, 

In [None]:
# OPTIONAL CELL

# Lists the entries in the corpus

for i in range(len(corpus)):
  print(f'Document #{i}, {len(corpus[i])} entries, {corpus[i]}')

Document #0, 1133 entries, [(0, 5), (1, 132), (2, 2), (3, 1), (4, 3), (5, 3), (6, 5), (7, 2), (8, 58), (9, 2), (10, 1), (11, 1), (12, 2), (13, 9), (14, 3), (15, 2), (16, 1), (17, 2), (18, 3), (19, 3), (20, 4), (21, 2), (22, 1), (23, 4), (24, 3), (25, 3), (26, 2), (27, 1), (28, 1), (29, 2), (30, 3), (31, 1), (32, 2), (33, 1), (34, 15), (35, 1), (36, 3), (37, 2), (38, 20), (39, 1), (40, 2), (41, 1), (42, 1), (43, 2), (44, 9), (45, 5), (46, 4), (47, 1), (48, 1), (49, 23), (50, 7), (51, 3), (52, 99), (53, 3), (54, 2), (55, 2), (56, 1), (57, 1), (58, 2), (59, 1), (60, 1), (61, 1), (62, 1), (63, 1), (64, 2), (65, 5), (66, 1), (67, 1), (68, 1), (69, 14), (70, 2), (71, 1), (72, 8), (73, 1), (74, 1), (75, 1), (76, 1), (77, 35), (78, 1), (79, 7), (80, 5), (81, 2), (82, 11), (83, 1), (84, 7), (85, 1), (86, 2), (87, 2), (88, 3), (89, 2), (90, 1), (91, 15), (92, 1), (93, 1), (94, 2), (95, 7), (96, 1), (97, 2), (98, 1), (99, 91), (100, 12), (101, 1), (102, 26), (103, 2), (104, 1), (105, 16), (106, 2

In [None]:
# Cell 7

import random
document_number = random.randint(0, len(corpus) - 1)
word_number = random.randint(0,len(corpus[document_number]) - 1)

print(len(corpus)) # There are 85 documents in the corpus.
print(corpus[document_number]) # Display the Bag Of Words for the selected document. (key,count)
#print(f"The word {dictionary[corpus[document_number]]} appears in document number {document_number} {word_number} times.") # Retrieve a word by its key in the dictionary.
print(f'The word "{dictionary[corpus[document_number][word_number][0]]}" is used {corpus[document_number][word_number][1]} times in document number {document_number}.')

81
[(1, 163), (4, 2), (5, 1), (7, 1), (8, 4), (12, 4), (13, 2), (14, 12), (17, 1), (18, 2), (19, 2), (20, 1), (23, 1), (24, 1), (25, 28), (28, 4), (29, 4), (30, 1), (34, 6), (36, 3), (38, 25), (40, 4), (42, 4), (43, 10), (44, 11), (46, 3), (49, 33), (50, 13), (51, 2), (52, 389), (53, 5), (55, 4), (56, 2), (59, 1), (61, 3), (63, 4), (64, 2), (65, 7), (66, 2), (67, 1), (69, 22), (72, 4), (74, 6), (77, 72), (78, 11), (79, 2), (80, 17), (86, 3), (87, 3), (88, 13), (91, 5), (92, 1), (95, 17), (99, 1), (100, 14), (102, 28), (105, 4), (106, 8), (107, 3), (113, 4), (114, 2), (115, 14), (116, 25), (117, 1), (118, 25), (122, 12), (136, 12), (137, 25), (139, 19), (143, 17), (145, 4), (146, 5), (148, 4), (152, 4), (156, 2), (158, 6), (159, 4), (161, 7), (163, 13), (165, 1), (166, 1), (171, 3), (173, 3), (175, 30), (177, 6), (179, 2), (181, 4), (182, 1), (183, 2), (185, 13), (186, 3), (187, 1), (188, 1), (189, 30), (190, 1), (192, 1), (193, 8), (194, 6), (196, 2), (197, 1), (198, 8), (199, 3), (200

# Creating a transformation

In [None]:
# Cell 8

from gensim import models

tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model


TFDIF is treated as a read only object that can be used to convert any vector from the old representation (bag of words integer counts) to the new representation (tf-idf real-valued weights)

In [None]:
# Cell 9

doc_bow = [(0,1), (1,1)]
print(tfidf[doc_bow]) # step 2 -- use the model to transform vectors

[(0, 1.0)]


or to apply transformation to a whole corpus

In [None]:
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
  print(doc)

Output hidden; open in https://colab.research.google.com to view.

Here are some ways to look at things:
serialized transformations

In [None]:
# Cell 10

lsi_model = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2) # initialize an LSI transformation
corpus_lsi = lsi_model[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfdif->fold-in-lsi

Here we transform our tf-idf corpus via Latent Semantic Indexing into a latent 2-d space (num-topics=2)

In [None]:
# Cell 11

lsi_model.print_topics(2)

[(0,
  '0.200*"rout" + 0.152*"pwd" + 0.138*"crossref" + 0.118*"detect" + 0.112*"blind" + 0.096*"navig" + 0.084*"fig" + 0.081*"pedestrian" + 0.080*"lane" + 0.079*"comput"'),
 (1,
  '-0.538*"pwd" + 0.149*"rout" + 0.129*"detect" + -0.124*"plwd" + -0.123*"dementia" + 0.119*"rp" + -0.113*"malaysia" + 0.105*"lane" + -0.098*"cent" + 0.088*"poi"')]

Skipping down some to model persistency:

In [None]:
# Cell 12

import tempfile

with tempfile.NamedTemporaryFile(prefix='model-', suffix='.lsi', delete=False) as tmp:
  lsi_model.save(tmp.name)

loaded_lsi_model = models.LsiModel.load(tmp.name)

os.unlink(tmp.name)

