In [None]:
#connecting to google drive
# Colab-only: google.colab is available inside a Colab runtime.
# Mounts Google Drive at /gdrive; presumably the input .txt files and the saved
# corpus/model artefacts used by later cells live there — confirm.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
# NOTE(review): 'path' looks like a placeholder — replace it with the Drive folder
# holding the input .txt files. Later cells open files by relative name, so they
# depend on this working directory.
%cd path
!ls

#### **Text Cleaning**

In [None]:
#cleaning the text and returning a list of tokens
import spacy
from spacy.lang.en import English

# The previous `spacy.load('en')` call loaded a full model and threw the result
# away; nothing in this notebook uses it, and the 'en' shortcut name is rejected
# by spaCy v3. The blank English pipeline below is the only spaCy object that
# tokenize() calls, and it supplies the attributes tokenize() reads
# (orth_, like_url, lower_).
parser = English()
def tokenize(text):
    """Split ``text`` into normalised tokens for topic modelling.

    The text is run through the module-level ``parser``. Whitespace-only
    tokens are dropped; every other token is mapped as follows:

    * URL-like tokens become the placeholder ``'URL'``
    * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``
    * anything else is lower-cased

    Returns:
        list[str]: the normalised tokens, in document order.
    """
    def _normalise(tok):
        # Placeholders keep the token position without leaking the raw link/handle.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalise(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [None]:
#Lemmatization
import nltk
# Fetch the WordNet corpus that the lemma helpers in this cell rely on; the
# captured output shows nltk reporting the package as up to date on re-runs.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    ``wn.morphy`` is asked for the base form; when it yields ``None`` the
    input word is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly created ``WordNetLemmatizer``.

    Alternative to ``get_lemma``; it always returns whatever the
    lemmatizer's ``lemmatize`` method produces for the word.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Fetch NLTK's stop-word lists, then keep the English ones in a set so the
# membership test used when filtering tokens is cheap.
nltk.download('stopwords')
en_stop = {*nltk.corpus.stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
#tokenize, drop short tokens and stopwords, then lemmatize
def prepare_text_for_lda(text, min_token_len=5):
    """Turn raw ``text`` into the list of lemmas fed to LDA.

    Steps, in order:
      1. ``tokenize`` the text.
      2. Discard tokens shorter than ``min_token_len`` characters.
      3. Discard tokens found in the ``en_stop`` stop-word set.
      4. Map each surviving token through ``get_lemma``.

    Args:
        text: the raw string to process.
        min_token_len: minimum number of characters a token must have to be
            kept. The default of 5 reproduces the former hard-coded
            ``len(token) > 4`` filter.

    Returns:
        list[str]: lemmas in their original order (may be empty).
    """
    # Length and stop-word checks run on the token as tokenized, before
    # lemmatization, exactly as in the original multi-pass version.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) >= min_token_len and token not in en_stop
    ]

In [None]:
#Preparing data for LDA
import random
text_data = []
with open('Daughter of Damascus _ Alexander Street, a ProQuest Company.txt') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        # Lines that yield no tokens would only add empty documents to the
        # corpus, so they are skipped.
        if not tokens:
            continue
        # Every remaining line becomes one document. Previously the append sat
        # inside the sampling branch below, so only ~1% of the lines (a
        # different subset on every run) ever reached the corpus.
        text_data.append(tokens)
        # Print roughly 1% of the documents as a quick visual sanity check.
        if random.random() > .99:
            print(tokens)

['moment', 'honest', 'include', 'touch', 'reveal']
['siham', 'purpose', 'writing', 'wholly']
['schooling', 'receive', 'training', 'telecommunication', 'allow']
['syrian', 'revolution']
['staris', 'casting', 'aspersion', 'daughter', 'painstakingly']
['responsible', 'entertain', 'others', 'indulge']
['boldiness', 'important', 'characteristic']
['siham', 'venture', 'try', 'authentic', 'picture', 'possible']
[]
['common', 'soldier', 'perspective', 'siham', 'father', 'relate', 'defeat']
['build', 'house', 'conform', 'bodily', 'spiritual', 'need']
['siory', 'change', 'repeat', 'middle']
[]
['damascus']
['whore', 'vegetable']
['lover', 'become', 'wonderful']
['cling', 'genuine', 'order', 'change']
['poking', 'boil', 'ihways']
['cotton', 'lady', 'bizuriyya']
['round', 'third', 'round', 'whole', 'course', 'complete']
['semorat']
['answer', 'leave', 'bark', 'usclessly']
['along', 'outside', 'angry', 'sister', 'really']
[]
[]
[]
['provide', 'amusement', 'others', 'would', 'waich', 'group', 'chile

We create a dictionary from the tokenized data, convert each document into a bag-of-words corpus, and then save both the dictionary and the corpus for future use.

**What is a bag-of-words model?**  
In this model, a text (such as a sentence or a document) is represented as the bag (multiset) of its words, disregarding grammar and even word order but keeping multiplicity.

Example:  
John likes to watch movies. Mary likes movies too.  
BoW1 = {"John":1,"likes":2,"to":1,"watch":1,"movies":2,"Mary":1,"too":1}

In [None]:
from gensim import corpora
import pickle

# Build the token <-> integer-id mapping from the tokenized documents.
dictionary = corpora.Dictionary(text_data)
# doc2bow converts each token list into sparse (token_id, count) pairs.
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so the visualisation cells can reload them later.
# The `with` block guarantees the pickle file is flushed and closed; the
# previous bare open() left the handle dangling.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#LDA to find 5 topics in the data

import gensim
NUM_TOPICS = 5
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics; without it every run printed (and saved)
# a different model.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model5.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.025*"would" + 0.017*"consider" + 0.017*"damascus" + 0.009*"sleep"')
(1, '0.015*"young" + 0.015*"white" + 0.015*"siham" + 0.015*"others"')
(2, '0.018*"clothes" + 0.018*"mother" + 0.010*"would" + 0.010*"become"')
(3, '0.017*"large" + 0.009*"sweet" + 0.009*"hands" + 0.009*"ofthe"')
(4, '0.024*"woman" + 0.016*"wedding" + 0.016*"doctor" + 0.016*"without"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#LDA to a new document
with open('Fantasia_ An Algerian Cavalcade _ Alexander Street, a ProQuest Company.txt','r') as f:
    # Replace line breaks with a space, not an empty string: deleting them
    # fused the last word of each line with the first word of the next one,
    # producing tokens that are not real words.
    new_doc_text = f.read().replace('\n', ' ')

# The file is no longer needed once read, so the processing happens outside
# the `with` block.
new_doc = prepare_text_for_lda(new_doc_text)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
# Topic mixture of the unseen document under the current `ldamodel`.
print(ldamodel.get_document_topics(new_doc_bow))

[(1, 14), (2, 25), (3, 6), (4, 17), (5, 1), (8, 35), (9, 11), (10, 21), (11, 1), (13, 1), (14, 7), (17, 1), (18, 55), (22, 5), (23, 26), (24, 1), (26, 1), (27, 6), (28, 1), (29, 7), (30, 13), (31, 18), (32, 8), (33, 3), (34, 6), (35, 93), (37, 8), (38, 52), (40, 6), (41, 1), (42, 70), (43, 7), (44, 1), (45, 12), (46, 21), (47, 13), (49, 1), (50, 2), (52, 35), (53, 8), (55, 6), (57, 25), (62, 1), (63, 11), (64, 5), (65, 12), (66, 24), (67, 12), (68, 22), (70, 5), (72, 38), (74, 23), (75, 1), (76, 16), (77, 8), (78, 54), (81, 15), (82, 4), (84, 88), (85, 72), (86, 84), (87, 1), (88, 27), (90, 13), (91, 6), (92, 72), (93, 4), (94, 2), (95, 64), (97, 31), (98, 1), (100, 6), (101, 4), (102, 6), (103, 6), (105, 17), (106, 23), (107, 12), (108, 14), (109, 7), (110, 103), (113, 38), (114, 3), (115, 17), (116, 22), (120, 43), (121, 2), (124, 14), (126, 2), (127, 10), (128, 17), (129, 11), (131, 5), (132, 1), (133, 3), (134, 5), (135, 10), (136, 19), (138, 6), (140, 16), (141, 110), (143, 7), (1

In [None]:
#LDA to find 3 topics in the data
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model3.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.014*"sweet" + 0.010*"wedding" + 0.010*"doctor" + 0.009*"siham"')
(1, '0.020*"would" + 0.011*"woman" + 0.011*"young" + 0.010*"white"')
(2, '0.012*"round" + 0.012*"soldier" + 0.012*"break" + 0.007*"whole"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
#LDA to find 10 topics in the data
# random_state pins the model's random initialisation so that re-running the
# notebook reproduces the same topics.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,
)
ldamodel.save('model10.gensim')
# Show the four highest-weighted words of every topic.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.021*"order" + 0.021*"change" + 0.021*"soldier" + 0.021*"officer"')
(1, '0.040*"clothes" + 0.021*"daughter" + 0.021*"consider" + 0.021*"provider"')
(2, '0.038*"woman" + 0.026*"ofthe" + 0.026*"young" + 0.013*"bride"')
(3, '0.021*"break" + 0.021*"house" + 0.021*"answer" + 0.021*"antar"')
(4, '0.030*"mother" + 0.016*"office" + 0.016*"pasha" + 0.016*"bodily"')
(5, '0.048*"would" + 0.020*"place" + 0.020*"become" + 0.020*"others"')
(6, '0.038*"wedding" + 0.038*"without" + 0.020*"sweet" + 0.020*"mafry"')
(7, '0.033*"enter" + 0.033*"large" + 0.017*"hands" + 0.017*"doctor"')
(8, '0.041*"white" + 0.028*"sister" + 0.015*"start" + 0.015*"lettuce"')
(9, '0.038*"damascus" + 0.020*"doctor" + 0.020*"siham" + 0.020*"chadija"')


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# NOTE(review): unpinned install. Later cells import `pyLDAvis.gensim`; newer
# pyLDAvis releases appear to have renamed that module to
# `pyLDAvis.gensim_models` — confirm and pin a compatible version.
!pip install pyldavis



In [None]:
# Reload the artefacts saved earlier so this cell does not depend on the
# in-memory objects from the training cells.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# pickle executes arbitrary code on load, so only ever open files this notebook
# wrote itself. The `with` block closes the handle that the previous bare
# open() leaked.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Keep this as the last expression so the notebook renders the interactive view.
pyLDAvis.display(lda_display)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Load the saved 3-topic model and render it with pyLDAvis, reusing the
# `corpus` and `dictionary` already in the kernel.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
# NOTE(review): sort_topics=False presumably keeps the model's own topic numbering — confirm against the pyLDAvis docs.
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Load the saved 10-topic model and render it with pyLDAvis, reusing the
# `corpus` and `dictionary` already in the kernel.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
# NOTE(review): sort_topics=False presumably keeps the model's own topic numbering — confirm against the pyLDAvis docs.
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
