<h2>Main dataframe

In [1]:
import pandas as pd
import numpy as np
import ast
import tqdm
# Registers progress_map / progress_apply on pandas objects.
tqdm.tqdm.pandas()
# Forward slashes resolve on every OS; the original backslash literal only worked on Windows.
quran = pd.read_csv("CSVs_&_other_files/quran_with_lemma.csv")
# The CSV holds each lemma list as text (presumably the repr of a Python list); parse it back.
quran['lemmatized'] = quran.lemmatized.progress_map(ast.literal_eval)
# Drop rows whose text contains the basmala phrase.
# NOTE(review): this is a substring match, so any verse that merely contains the phrase
# is removed as well -- confirm that this is intended.
# .copy() makes the result an independent frame, so later cells can add columns safely.
quran = quran[~quran.text.str.contains('بسم الله')].copy()

100%|██████████| 6345/6345 [00:00<00:00, 19884.69it/s]


In [2]:
# Preview the filtered dataframe (columns: text, sura, lemmatized).
quran

Unnamed: 0,text,sura,lemmatized
1,الحمد لله رب العٰلمين,سورة الفاتحة,"[حمد, الله, رب, علم]"
2,الرحمٰن الرحيم,سورة الفاتحة,"[رحمن, رحيم]"
3,مٰلك يوم الدين,سورة الفاتحة,"[ملك, يوم, دين]"
4,اياك نعبد واياك نستعين,سورة الفاتحة,"[اياك, عبد, اياك, استعان]"
5,اهدنا الصرٰط المستقيم,سورة الفاتحة,"[اهد, صراط, مستقيم]"
...,...,...,...
6340,ملك الناس,سورة الناس,"[ملك, ناس]"
6341,الٰه الناس,سورة الناس,"[اله, ناس]"
6342,من شر الوسواس الخناس,سورة الناس,"[من, شر, وسواس, خناس]"
6343,الذي يوسوس في صدور الناس,سورة الناس,"[الذي, يوسوس, في, صدور, ناس]"


<h5>Necessary libraries

In [3]:
# For word2vec & topic modeling...
import gensim
# For visualization
import plotly.express as px
# Arabic stopwords
import nltk
import arabicstopwords.arabicstopwords as stp
# More stopwords
# Project stopword file: semicolon-separated, with the words in the "#الكلمة" column.
# Forward slashes keep the path portable; the original backslash literal only resolved on Windows.
stopwords_2 = (
    pd.read_csv("CSVs_&_other_files/stopwords.csv", sep=';')
    .rename(columns={"#الكلمة": 'words'})['words']
    .tolist()
)
# Union of every source: NLTK's Arabic list, the arabicstopwords package, the project
# file, and a few hand-picked particles.
arb_stopwords = (
    set(nltk.corpus.stopwords.words("arabic"))
    | set(stp.stopwords_list())
    | set(stopwords_2)
    | {'ان', 'الا', 'اذ', 'وان', 'الى', 'او', 'انا', 'ام', 'الي'}
)

In [4]:
# One list of lemmas per row of `quran`, in dataframe order.
quran_texts = quran['lemmatized'].tolist()

<h2>EDA - Exploratory Data Analysis

<h4>All the quran vocabulary without stopwords

In [5]:
# Per-verse lemma lists with every stopword removed; verse order is kept, and a verse
# made only of stopwords becomes an empty list.
all_vocab = [
    [lemma for lemma in verse if lemma not in arb_stopwords]
    for verse in quran_texts
]

<h5>The most used words in the quran

In [6]:
# Number of words shown in the chart.
TOP_N_WORDS = 10

# Count EVERY remaining token. The previous version built a positional DataFrame from the
# verse lists and grouped on column 0 only, i.e. it counted just the first token of each verse.
token_counts = pd.Series([term for verse in all_vocab for term in verse]).value_counts()
# value_counts() is already sorted by descending frequency.
words_count = token_counts.head(TOP_N_WORDS).rename_axis('words').reset_index(name='count')

fig = px.bar(words_count, x='count', y='words', orientation='h', title=f'The top {len(words_count)} words in terms of occurrences', height=600)
fig.show()

<h5>The longest suras (number of words per sura)

In [7]:
# Number of suras shown in the chart.
TOP_N_SURAS = 15

# Split each verse on single spaces and store the token count. .str.len() is vectorised
# and replaces the per-row progress_map(lambda x: len(x)).
quran['separated_text'] = quran['text'].str.split(' ')
quran['word_count'] = quran['separated_text'].str.len()

# Per sura: 'text' becomes the number of rows (ayas) and 'word_count' the total words.
sura_word_count = (
    quran.groupby('sura')
    .agg({'text': 'count', 'word_count': 'sum'})
    .sort_values('word_count', ascending=False)
    .reset_index()
    .head(TOP_N_SURAS)
)

fig = px.bar(sura_word_count, x='sura', y='word_count', text='text', title=f'The top {len(sura_word_count)} by sura')
# customdata carries (sura, aya count) per bar so the hover text can show the aya count.
fig.update_traces(
    customdata=np.stack((sura_word_count['sura'], sura_word_count["text"]), axis=1),
    hovertemplate='Sura: %{x}<br>Number of words: %{y}<br>N° of aya\'s: %{customdata[1]}',
)
fig.show()

100%|██████████| 6230/6230 [00:00<00:00, 778296.12it/s]


<h2>Word Embedding - Word2vec Gensim

In [8]:
# Preview the training corpus: the first ten stopword-free lemma lists built earlier
# from the lemmatized column (nothing is prepared in this cell).
all_vocab[:10]

[['حمد', 'الله'],
 ['رحمن', 'رحيم'],
 ['ملك', 'يوم', 'دين'],
 ['اياك', 'عبد', 'اياك', 'استعان'],
 ['اهد', 'صراط', 'مستقيم'],
 ['صراط', 'انعم', 'مغضوب', 'ضال'],
 ['الم'],
 ['كتاب', 'ريب', 'هدى', 'متقي'],
 ['آمن', 'غيب', 'أقام', 'صلو', 'رزقن', 'أنفق'],
 ['آمن', 'انزل', 'يك', 'انزل', 'اخر', 'يوقن']]

In [59]:
# Word2vec model
# Only the hyper-parameters are set here; the vocabulary is built and the model is
# trained in the next cell.
from gensim.models import Word2Vec
old_model = Word2Vec(
    window=6,  # context window size
    min_count=2,  # words seen fewer than 2 times are left out of the vocabulary
    workers=8,  # number of training threads
    sg=1  # 1 selects the skip-gram architecture (0 would be CBOW)
)

In [60]:
# Build & train the model
# First pass over the stopword-free verses: builds the vocabulary and sets corpus_count.
old_model.build_vocab(all_vocab)
# 9 passes over the same corpus. The displayed tuple is train()'s return value
# (presumably trained vs. raw word counts -- see the gensim docs).
old_model.train(all_vocab, total_examples=old_model.corpus_count, epochs=9)

(315580, 405972)

In [61]:
# Vocabulary size of the trained model (words kept after the min_count filter).
len(old_model.wv.index_to_key)

3007

In [63]:
# Sanity check: nearest neighbours, as (word, similarity) pairs, of three key words.
old_model.wv.most_similar('الله'), old_model.wv.most_similar('محمد'), old_model.wv.most_similar('عيسى')

([('قوي', 0.8867161273956299),
  ('مولى', 0.8820168972015381),
  ('وسع', 0.8815350532531738),
  ('كفى', 0.8704085946083069),
  ('زاد', 0.8695888519287109),
  ('آتى', 0.8677806854248047),
  ('حميد', 0.864806056022644),
  ('انتقام', 0.8608192801475525),
  ('سريع', 0.8594005107879639),
  ('درج', 0.8588250279426575)],
 [('أنذر', 0.947134792804718),
  ('فرق', 0.9454087018966675),
  ('ملة', 0.9437821507453918),
  ('اوتي', 0.942821204662323),
  ('يهودي', 0.9421708583831787),
  ('لسان', 0.9418265223503113),
  ('وحى', 0.9417304396629333),
  ('نبي', 0.9407532215118408),
  ('طائفة', 0.9397169351577759),
  ('ياهل', 0.9395691156387329)],
 [('مريم', 0.977805495262146),
  ('هرو', 0.9685515761375427),
  ('ءاتي', 0.9656402468681335),
  ('اوتي', 0.9653311371803284),
  ('بني', 0.9571349024772644),
  ('يعقوب', 0.9550411701202393),
  ('ابرهيم', 0.9540287852287292),
  ('اذكر', 0.9530991315841675),
  ('اسباط', 0.952572226524353),
  ('اسحق', 0.948630690574646)])

In [260]:
# # Save the model
# old_model.save('quran_w2v.model')

<h4>Use the saved model

In [584]:
# Load the model
# Reload the Word2Vec model saved to disk (see the commented-out save cell above).
from gensim.models import Word2Vec
import pandas as pd
model = Word2Vec.load("quran_w2v.model")

# Most similar words to...
# NOTE(review): these neighbours differ from old_model's output above, so the saved file
# presumably comes from a different training run -- confirm which settings produced it.
model.wv.most_similar('محمد')

[('توبة', 0.9959176778793335),
 ('تقوى', 0.9958256483078003),
 ('فقير', 0.9956135153770447),
 ('ركع', 0.9953169822692871),
 ('اهتد', 0.9952019453048706),
 ('المن', 0.9950589537620544),
 ('قل', 0.9949949383735657),
 ('سل', 0.9949104189872742),
 ('بلد', 0.9948599338531494),
 ('خشي', 0.9947698712348938)]

In [585]:
# Words as the row index, one column per embedding dimension (dim1, dim2, ...).
word_vec_df = pd.DataFrame(
    data=model.wv.vectors,
    index=model.wv.key_to_index,
)
# The default integer column labels 0..n-1 become 'dim1'..'dimn'.
word_vec_df = word_vec_df.set_axis(
    ['dim' + str(position + 1) for position in word_vec_df.columns],
    axis=1,
)
word_vec_df

Unnamed: 0,dim1,dim2,dim3,dim4,dim5,dim6,dim7,dim8,dim9,dim10,...,dim91,dim92,dim93,dim94,dim95,dim96,dim97,dim98,dim99,dim100
الله,-0.581672,0.506454,0.260383,0.474504,0.130020,-0.762126,0.271873,0.922216,-0.038793,-0.699522,...,0.791490,-0.287693,-0.204714,0.013217,0.134877,0.597489,0.233000,-0.471277,-0.591226,0.298798
قال,-0.160204,0.288215,-0.017648,0.135081,0.000573,-0.890786,-0.190906,0.800500,0.042703,-0.551774,...,0.887152,-0.189887,0.054013,-0.096380,0.601814,0.378610,0.080101,-0.459989,-0.238983,-0.069465
ارض,-0.543904,0.228231,0.386849,0.076920,-0.062351,-0.206321,0.329613,0.796125,-0.419521,-0.125212,...,0.784184,0.009199,0.038003,-0.210601,0.264895,0.720293,0.216214,-0.633463,-0.061126,0.180519
كفر,0.269946,0.066882,-0.183793,0.206383,0.016098,-0.661337,0.273203,0.663339,-0.419699,-0.340568,...,0.717427,0.167538,-0.195613,0.182291,0.405093,0.136319,-0.112406,-0.381449,-0.210813,-0.143939
يوم,0.103223,-0.399507,-0.175135,0.066671,-0.577980,-0.546201,0.520201,0.641276,-0.862847,-0.274719,...,0.802177,0.441532,-0.271336,0.084851,0.497773,0.136686,-0.141118,-0.270648,-0.005923,-0.197282
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
رئاء,-0.009282,0.054321,-0.023433,0.072457,0.045702,-0.246662,0.087323,0.259920,-0.164073,-0.135572,...,0.248764,0.038674,-0.062862,0.034049,0.189084,0.121789,0.017085,-0.175613,-0.020052,0.023801
تخشو,-0.059468,0.125097,-0.027185,0.149061,0.034175,-0.439266,0.124443,0.496678,-0.212350,-0.239060,...,0.488067,0.054553,-0.071305,0.061725,0.346669,0.270369,0.026670,-0.340076,-0.040113,0.041190
ذلول,-0.071529,0.136534,-0.012143,0.177681,0.031918,-0.462784,0.124123,0.531884,-0.226292,-0.263073,...,0.538110,0.061405,-0.086424,0.063268,0.362429,0.294729,0.025914,-0.370077,-0.039035,0.028635
أثار,-0.080403,0.142594,-0.010289,0.184723,0.054877,-0.548327,0.147540,0.635229,-0.260275,-0.300595,...,0.645035,0.055452,-0.089239,0.088591,0.452942,0.377966,0.045080,-0.449009,-0.041404,0.045157


<h3>A class that computes and plots 3D PCA & UMAP projections of the word embeddings, either for all words or for the nearest neighbours of specific words

In [1405]:
import pandas as pd
from gensim.models import Word2Vec
import plotly.express as px
from sklearn.decomposition import PCA
from umap.umap_ import UMAP
import warnings
warnings.filterwarnings('ignore')

class WordVec():
    """Visualise the saved Word2Vec vectors in 3-D with PCA and UMAP.

    On construction the model is loaded from disk and both projections are fitted once
    over the whole vocabulary. The plotting methods then show either every word or only
    the nearest neighbours of the requested word(s). Fitting may take a while on larger
    vocabularies.

    The class is self-contained: every lookup goes through ``self.gmodel`` and the class's
    own helpers rather than notebook-level globals, so it works in a fresh kernel.

    Attributes:
        gmodel: the loaded gensim Word2Vec model.
        vectors: raw embedding matrix (``gmodel.wv.vectors``).
        words: vocabulary, in the same order as ``vectors``.
        df: one row per word -- a ``word`` column followed by the raw vector columns.
        pca_df: 3-D PCA coordinates (dim1..dim3), indexed by word.
        umap_df: 3-D UMAP coordinates (dim1..dim3) with ``word`` as a regular column.
    """

    # Column names shared by both 3-D projections.
    _DIMS = ['dim1', 'dim2', 'dim3']

    def __init__(self, model_path='quran_w2v.model', random_state=None):
        """Load the model and fit both projections.

        Args:
            model_path: file written by ``Word2Vec.save``.
            random_state: forwarded to UMAP; set an int for a repeatable layout
                (None keeps UMAP's default behaviour).
        """
        # <--------- Main model and dataframe --------->
        self.gmodel = Word2Vec.load(model_path)
        self.vectors = self.gmodel.wv.vectors
        self.words = self.gmodel.wv.index_to_key
        self.df = pd.DataFrame(self.vectors, index=self.words).rename_axis('word').reset_index()
        vectors_by_word = self.df.set_index('word')
        # PCA
        pca_data = PCA(n_components=3).fit_transform(vectors_by_word)
        self.pca_df = pd.DataFrame(pca_data, index=vectors_by_word.index, columns=self._DIMS)
        # UMAP -- built from the imported `UMAP` class; the previous `umap.UMAP` only
        # resolved when another cell had imported the module under the name `umap`.
        reducer = UMAP(n_neighbors=15, min_dist=0.1, n_components=3, metric='cosine',  # cosine gave better results than euclidean
                       random_state=random_state)
        umap_data = reducer.fit_transform(vectors_by_word)
        self.umap_df = pd.DataFrame(umap_data, index=vectors_by_word.index, columns=self._DIMS).reset_index()

    def flatten(self, l): # <--- Flatten list of lists
        """Flatten one nesting level: list elements are expanded, anything else is kept once.

        (The previous comprehension repeated a non-list element once per item it contained.)
        """
        flat = []
        for element in l:
            if isinstance(element, list):
                flat.extend(element)
            else:
                flat.append(element)
        return flat

    def _neighbours(self, word, topn=10):
        """Return the `topn` words most similar to `word`, without their scores.

        The lookup error raised by gensim for an unknown word is not caught here.
        """
        return [similar for similar, _score in self.gmodel.wv.most_similar(word, topn=topn)]

    def _reference_map(self, words, include_query=False):
        """Map every neighbour to the query word it was found for.

        When a word is a neighbour of several queries the last query wins. With
        `include_query` each query word is also mapped to itself.
        """
        queries = words if isinstance(words, list) else [words]
        reference = {}
        for query in queries:
            members = self._neighbours(query)
            if include_query:
                members = members + [query]
            for member in members:
                reference[member] = query
        return reference

    @staticmethod
    def _tag_with_reference(df, reference):
        """Return a copy of the rows of `df` whose word is in `reference`, plus a 'ref' column."""
        # .copy() so that adding 'ref' never writes into a view of the caller's frame.
        subset = df[df['word'].isin(list(reference))].copy()
        subset['ref'] = subset['word'].map(reference)
        return subset

    def find_most_similar(self, word, topn=10): # <--- Most similar words
        """Return the neighbours of `word` followed by the word itself.

        For a list of words the per-word results are concatenated in order.
        Returns None when a word is not in the model's vocabulary.
        """
        queries = word if isinstance(word, list) else [word]
        try:
            found = []
            for query in queries:
                found.extend(self._neighbours(query, topn))
                found.append(query)
            return found
        except KeyError:  # gensim signals an out-of-vocabulary word with KeyError
            return None

    def map_ref(self, value, dictionary): # <--- Map the dictionary for the right keys...
        """Return the first key of `dictionary` whose collection contains `value` (None if none does)."""
        for k, v in dictionary.items():
            if value in v:
                return k

    def df_most_similar(self, word): # <--- Return dataframe of vectors of the most similar words
        """Return the raw-vector rows for `word` (str or list) and its neighbours.

        The query words themselves are included, matching `find_most_similar`. The 'ref'
        column names the query each row belongs to. If a word is missing from the
        vocabulary the string 'Try some other words' is returned instead of a frame.
        """
        try:
            reference = self._reference_map(word, include_query=True)
        except KeyError:
            return 'Try some other words'
        return self._tag_with_reference(self.df, reference)

    def df_most_similar_(self, df, word):
        """Return the rows of `df` (which needs a 'word' column) for the neighbours of `word`.

        Only the neighbours are kept, not the query words themselves. The 'ref' column
        names the query each row belongs to. If a word is missing from the vocabulary
        an explanatory string is returned instead of a frame.
        """
        try:
            reference = self._reference_map(word)
        except KeyError as e:
            return f'Try some other words. Error: {e}'
        return self._tag_with_reference(df, reference)

    def _similar_frame_or_raise(self, df, words):
        """Like `df_most_similar_`, but raise ValueError instead of handing a message to plotly."""
        result = self.df_most_similar_(df, words)
        if isinstance(result, str):
            raise ValueError(result)
        return result

    def plot_pca_all(self): # <--- PCA
        """Show the PCA projection of the whole vocabulary."""
        fig_pca = px.scatter_3d(self.pca_df, x='dim1', y='dim2', z='dim3', text=self.pca_df.index, width=1000)
        fig_pca.update_layout(margin=dict(l=0, r=0, b=0, t=0))
        fig_pca.show()

    def plot_pca_similar(self, words): # <--- PCA Similar words to particular word(s) plot
        """Return a PCA figure of the neighbours of `words`, coloured by query word.

        Raises:
            ValueError: if a word is not in the model's vocabulary.
        """
        df = self._similar_frame_or_raise(self.pca_df.reset_index(), words)
        fig_pca = px.scatter_3d(df, x='dim1', y='dim2', z='dim3', color='ref', text='word', width=1000)
        fig_pca.update_layout(margin=dict(l=0, r=0, b=0, t=0))
        return fig_pca

    def plot_umap_all(self): # <--- UMAP (all is not very informative)
        """Show the UMAP projection of the whole vocabulary."""
        fig_umap = px.scatter_3d(self.umap_df, x='dim1', y='dim2', z='dim3', text='word', width=1000)
        fig_umap.update_layout(margin=dict(l=0, r=0, b=0, t=0))
        fig_umap.show()

    def plot_umap_similar(self, words):
        """Return a UMAP figure of the neighbours of `words`, coloured by query word.

        Raises:
            ValueError: if a word is not in the model's vocabulary.
        """
        df = self._similar_frame_or_raise(self.umap_df, words)
        fig_umap = px.scatter_3d(df, x='dim1',y='dim2',z='dim3',text='word',color='ref', width=1000)
        fig_umap.update_layout(margin=dict(l=0, r=0, b=0, t=0))
        return fig_umap

# Loads the saved model and fits the PCA and UMAP projections once; reuse this instance for every plot.
w2v = WordVec()

In [1369]:
# UMAP view of the nearest neighbours of two query words, coloured by query word.
w2v.plot_umap_similar(['الله','محمد'])

100%|██████████| 20/20 [00:00<?, ?it/s]


In [1244]:
# PCA view of the nearest neighbours of four query words, coloured by query word.
w2v.plot_pca_similar(['محمد','جنة','مريم','الله'])

100%|██████████| 40/40 [00:00<?, ?it/s]


<h3>Web App for the visualizations (data hosted on github)

In [None]:
# # <<<<< To be hosted on Github >>>>>
# pd.read_csv((r'CSVs_&_other_files\pca_df.csv'))
# w2v.umap_df.to_csv(r'CSVs_&_other_files\umap_df.csv')

In [1377]:
# For flattening list of lists
def flatten(t):
    """Flatten one nesting level of `t`.

    List elements are expanded in place; any other element is kept once. (The previous
    comprehension repeated a non-list element once per item it contained, e.g. a
    three-letter string appeared three times.)
    """
    flat = []
    for element in t:
        if isinstance(element, list):
            flat.extend(element)
        else:
            flat.append(element)
    return flat

# Return list of the most similar words
def find_most_similar(word):
    """Return the words most similar to `word` according to the notebook-level `model`.

    `word` may be a single word or a list of words; for a list the neighbours of each
    word are concatenated in order. Only the words are returned (as plain `str`), not
    their similarity scores. An out-of-vocabulary word propagates gensim's lookup error
    (presumably KeyError).
    """
    queries = word if isinstance(word, list) else [word]
    # most_similar yields (word, score) pairs; keep the word only.
    return [similar for query in queries for similar, _score in model.wv.most_similar(query)]
# Test
find_most_similar(['الله', 'جنة'])

['اعلم',
 'فان',
 'استغفر',
 'حليم',
 'سميع',
 'كفى',
 'قدير',
 'غفر',
 'غيب',
 'عباد',
 'دخل',
 'عدن',
 'جهنم',
 'جزاء',
 'خلد',
 'فوز',
 'نعيم',
 'سي',
 'مغفرة',
 'عقبى']

In [1380]:
# Map the right words looked for, for each word
def map_ref(x, d):
    """Return the first key of `d` whose value (a collection) contains `x`; None when none does."""
    return next((key for key, members in d.items() if x in members), None)

# Return a dataframe of vectors for the target word's most similar words
def lookup_df(df, look_for):
    """Return the rows of `df` for the nearest neighbours of the word(s) in `look_for`.

    Args:
        df: frame with a 'word' column (e.g. the PCA or UMAP coordinates).
        look_for: a single word or a list of words.

    Returns:
        A copy of the matching rows (in `df` order) with an extra 'ref' column naming
        the query word each row belongs to. When a word is a neighbour of several
        queries, the last query wins.
    """
    queries = look_for if isinstance(look_for, list) else [look_for]
    # neighbour -> query word; a direct mapping replaces the former inverse-dict scan.
    reference = {}
    for query in queries:
        for similar in find_most_similar(query):
            reference[similar] = query
    # .copy() so that adding 'ref' never writes into a view of the caller's frame.
    matches = df[df['word'].isin(list(reference))].copy()
    matches['ref'] = matches['word'].map(reference)
    return matches

# Test
# NOTE(review): `umap_df` is only assigned in the UMAP cell of the Draft section further
# down, so this cell fails on a fresh top-to-bottom run. The 'Unnamed: 0' columns in the
# recorded output suggest the frame was read from a CSV at the time -- confirm.
look_for = ['محمد','جنة','عيسى']
lookup_df(umap_df, look_for)

100%|██████████| 30/30 [00:00<?, ?it/s]


Unnamed: 0.1,Unnamed: 0,word,dim1,dim2,dim3,ref
82,82,جهنم,5.188981,11.675048,8.597157,جنة
94,94,دخل,4.788421,11.976626,8.465084,جنة
120,120,خلد,4.76349,11.958874,8.467862,جنة
168,168,مسلم,12.978327,8.965808,8.973332,عيسى
210,210,سي,4.907259,12.013995,8.488084,جنة
225,225,جزاء,5.079129,11.943936,8.515624,جنة
265,265,خشي,12.739668,10.241031,8.544175,محمد
275,275,مريم,12.977746,9.087133,8.944938,عيسى
287,287,مغفرة,4.997922,12.008388,8.499711,جنة
350,350,قل,12.908656,9.99468,8.734095,محمد


In [1401]:
# The same plot also accepts a single word instead of a list.
w2v.plot_umap_similar('الله')

<h1>Draft

<h5>PCA

In [586]:
from sklearn.decomposition import PCA
# Reduce the embedding vectors to three principal components.
pca = PCA(n_components=3)
pca_data = pca.fit_transform(word_vec_df)

# The word moves from the index into a regular 'word' column next to its three coordinates.
pca_w2v_df = (
    pd.DataFrame(pca_data, index=word_vec_df.index, columns=['dim1', 'dim2', 'dim3'])
    .rename_axis('word')
    .reset_index()
)
pca_w2v_df.head()

Unnamed: 0,word,dim1,dim2,dim3
0,الله,1.232349,-0.63009,1.854579
1,قال,0.973762,-1.262608,-0.071664
2,ارض,1.323124,0.743003,2.682454
3,كفر,1.032295,0.699143,-0.670529
4,يوم,1.338632,2.789065,-0.443455


In [1057]:
find_most_similar('محمد')

['توبة', 'تقوى', 'فقير', 'ركع', 'اهتد', 'المن', 'قل', 'سل', 'بلد', 'خشي']

In [1152]:
# Search for words (change the look_for value)
# Plots the PCA coordinates of each query word's nearest neighbours, coloured by query word.
look_for = ['الله','محمد', 'جنة']
fig_pca = px.scatter_3d(lookup_df(pca_w2v_df, look_for), x='dim1', y='dim2', z='dim3', text='word', width=1000,color='ref')
# Zero margins so the 3-D scene fills the whole figure.
fig_pca.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig_pca.show()

100%|██████████| 30/30 [00:00<?, ?it/s]


<h3>Umap

In [325]:
# Umap import and setup
# NOTE(review): no random_state is passed, so the layout may differ between runs -- confirm
# whether a fixed seed is wanted.
import umap.umap_ as umap
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=3, metric='cosine') # cosine gave better results than euclidean

# Main reduced_dataframe
# 3-D coordinates per word; the word moves from the index into a regular 'word' column.
umap_data = reducer.fit_transform(word_vec_df)
umap_df = pd.DataFrame(umap_data, index=word_vec_df.index, columns=['dim1','dim2','dim3']).reset_index().rename(columns={'index':'word'})

In [1133]:
# 3D Plot
# UMAP coordinates of each query word's nearest neighbours, coloured by query word.
look_for = ['الله','محمد']
fig_umap = px.scatter_3d(lookup_df(umap_df, look_for), x='dim1', y='dim2', z='dim3', text='word', width=1000, color='ref')
# Zero margins so the 3-D scene fills the whole figure.
fig_umap.update_layout(margin=dict(l=0, r=0, b=0, t=0))
fig_umap.show()

اعلم الله
فان الله
استغفر الله
حليم الله
سميع الله
كفى الله
قدير الله
غفر الله
غيب الله
عباد الله
توبة محمد
تقوى محمد
فقير محمد
ركع محمد
اهتد محمد
المن محمد
قل محمد
سل محمد
بلد محمد
خشي محمد


100%|██████████| 20/20 [00:00<?, ?it/s]
