<a href="https://colab.research.google.com/github/zahr-eddine-elbouzidi/nlp_vector_representation_of_text/blob/main/nlp_vector_rep_of_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Vector Representation of Text Using** ##
- **Word2Vec using Gensim**
- **CountVectorizer**
- **TfidfVectorizer**
- **OneHotEncoder using scikit-learn**
- **One_Hot and CBoW using Keras**

In [None]:
data = [        
"""Perhaps one of the most significant advances made by Arabic mathematics began at this time with the work of al-Khwarizmi, namely
the beginnings of algebra. It is important to understand just how significant this new idea was. It was a revolutionary move away from
the Greek concept of mathematics which was essentially geometry. Algebra was a unifying theory which allowed rational
numbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects". It gave mathematics a whole new
development path so much broader in concept to that which had existed before, and provided a vehicle for future development of the
subject. Another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a
way which had not happened before.""",

 """ربما كانت أحد أهم التطورات التي قامت بها الرياضيات العربية التي بدأت في هذا الوقت بعمل الخوارزمي  وهي بدايات الجبر،ومن المهم فهم كيف كانت هذه الفكرة الجديدة مهمة، فقد كانت خطوة ثورية بعيدا عن
المفهوم اليوناني للرياضيات التي هي في جوهرها  هندسة، الجبركان نظرية موحدة تتحيح الأعداد الكسرية و الأعداد اللا كسرية ، والمقادير الهندسية و غيرها ، أن تتعامل على أنها أجسام جبرية، و أعطت الرياضيات ككل مسارا جديدًا للتطوربمفهوم 
 أوسع بكثير من الذي كان موجودًا من قبل ، وقدم وسيلة للتنمية في هذا الموضوع مستقبلا .و جانب آخر مهم لإدخال أفكار الجبر و هو أنه سمح بتطبيق الرياضيات على نفسها 
بطريقة  لم تحدث من قبل."""
]

print(data)

['Perhaps one of the most significant advances made by Arabic mathematics began at this time with the work of al-Khwarizmi, namely\nthe beginnings of algebra. It is important to understand just how significant this new idea was. It was a revolutionary move away from\nthe Greek concept of mathematics which was essentially geometry. Algebra was a unifying theory which allowed rational\nnumbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects". It gave mathematics a whole new\ndevelopment path so much broader in concept to that which had existed before, and provided a vehicle for future development of the\nsubject. Another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a\nway which had not happened before.', 'ربما كانت أحد أهم التطورات التي قامت بها الرياضيات العربية التي بدأت في هذا الوقت بعمل الخوارزمي  وهي بدايات الجبر،ومن المهم فهم كيف كانت هذه الفكرة الجديدة مهمة، فقد كانت خط

In [None]:
!pip install gensim



### **Preprocessing data** ###

In [None]:
import string
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation  #all punctuations 

def delete_punctuations(sentence):
  sentence = sentence.lower()
  my_clean_sentence = ''.join([item for item in sentence if item not in punctuations ])
  return my_clean_sentence
  
def sentence_tokenize(text):
  return word_tokenize(text)
  
tokenized_list = sentence_tokenize(delete_punctuations(data[0]))
print(tokenized_list)
 

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
['perhaps', 'one', 'of', 'the', 'most', 'significant', 'advances', 'made', 'by', 'arabic', 'mathematics', 'began', 'at', 'this', 'time', 'with', 'the', 'work', 'of', 'alkhwarizmi', 'namely', 'the', 'beginnings', 'of', 'algebra', 'it', 'is', 'important', 'to', 'understand', 'just', 'how', 'significant', 'this', 'new', 'idea', 'was', 'it', 'was', 'a', 'revolutionary', 'move', 'away', 'from', 'the', 'greek', 'concept', 'of', 'mathematics', 'which', 'was', 'essentially', 'geometry', 'algebra', 'was', 'a', 'unifying', 'theory', 'which', 'allowed', 'rational', 'numbers', 'irrational', 'numbers', 'geometrical', 'magnitudes', 'etc', 'to', 'all', 'be', 'treated', 'as', 'algebraic', 'objects', 'it', 'gave', 'mathematics', 'a', 'whole', 'new', 'development', 'path', 'so', 'much', 'br

### **Word2Vec** ###

In [None]:
from gensim.models import Word2Vec,keyedvectors

In [None]:
model = Word2Vec([tokenized_list] , min_count=1, size=32)
model.most_similar("important")

  


[('namely', 0.34238022565841675),
 ('beginnings', 0.32131820917129517),
 ('numbers', 0.2754685878753662),
 ('just', 0.26540860533714294),
 ('algebraic', 0.2609509229660034),
 ('of', 0.2531968355178833),
 ('as', 0.218805193901062),
 ('from', 0.20826514065265656),
 ('mathematics', 0.17917057871818542),
 ('in', 0.17618200182914734)]

### **CountVectorizer**
**Dictionary of Word Frequency**

In [31]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
  
 
count_vec = CountVectorizer(ngram_range=(1,1),stop_words='english')
#transform
count_data = count_vec.fit_transform([data[0]])
print(count_vec.get_feature_names())
#create dataframe
my_df=pd.DataFrame(count_data.toarray(),columns=count_vec.get_feature_names()).T
my_df.head(10)



['advances', 'al', 'algebra', 'algebraic', 'allowed', 'applied', 'arabic', 'aspect', 'away', 'began', 'beginnings', 'broader', 'concept', 'development', 'essentially', 'existed', 'future', 'gave', 'geometrical', 'geometry', 'greek', 'happened', 'idea', 'ideas', 'important', 'introduction', 'irrational', 'just', 'khwarizmi', 'magnitudes', 'mathematics', 'new', 'numbers', 'objects', 'path', 'provided', 'rational', 'revolutionary', 'significant', 'subject', 'theory', 'time', 'treated', 'understand', 'unifying', 'vehicle', 'way', 'work']


Unnamed: 0,0
advances,1
al,1
algebra,2
algebraic,2
allowed,2
applied,1
arabic,1
aspect,1
away,1
began,1


**Arabic sentence**

In [None]:

count_vec=CountVectorizer(token_pattern=r"(?u)\b\w\w+\b")
arr = count_vec.fit_transform([data[1]]).toarray()
print(arr)
print('\nvocabulary list:\n')
for key,value in count_vec.vocabulary_.items():
    print(key,value)

[[1 1 1 1 1 1 1 1 1 1 2 1 3 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 3 1 2 1 3 1 1 1 1 1 1 1 1 1 1 3 1 1 1
  1 1 1 2 1 1 1 1 1 1 1 1 1]]

vocabulary list:

ربما 46
كانت 57
أحد 2
أهم 8
التطورات 11
التي 12
قامت 54
بها 36
الرياضيات 18
العربية 19
بدأت 30
في 53
هذا 75
الوقت 27
بعمل 33
الخوارزمي 16
وهي 84
بدايات 31
الجبر 13
ومن 83
المهم 24
فهم 52
كيف 60
هذه 76
الفكرة 20
الجديدة 15
مهمة 70
فقد 51
خطوة 45
ثورية 40
بعيدا 34
عن 49
المفهوم 23
اليوناني 28
للرياضيات 64
هي 79
جوهرها 44
هندسة 77
الجبركان 14
نظرية 73
موحدة 72
تتحيح 37
الأعداد 10
الكسرية 21
اللا 22
كسرية 58
والمقادير 80
الهندسية 26
غيرها 50
أن 5
تتعامل 38
على 48
أنها 7
أجسام 1
جبرية 42
أعطت 3
ككل 59
مسارا 66
جديد 43
للتطوربمفهوم 62
أوسع 9
بكثير 35
من 68
الذي 17
كان 56
موجود 71
قبل 55
وقدم 82
وسيلة 81
للتنمية 63
الموضوع 25
مستقبلا 67
جانب 41
آخر 0
مهم 69
لإدخال 61
أفكار 4
هو 78
أنه 6
سمح 47
بتطبيق 29
نفسها 74
بطريقة 32
لم 65
تحدث 39


In [None]:
print(count_data)

### **TfidfVectorizer**

In [30]:
	
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

 
#define tf-idf
tf_idf_vec = TfidfVectorizer(use_idf=True, 
                        smooth_idf=False,  
                        ngram_range=(1,1),stop_words='english')
#transform
tf_idf_data = tf_idf_vec.fit_transform([data[0]])
 
#create dataframe
tf_idf_dataframe=pd.DataFrame(tf_idf_data.toarray(),columns=tf_idf_vec.get_feature_names()).T
tf_idf_dataframe.head(10)
 
  

Unnamed: 0,0
advances,0.105409
al,0.105409
algebra,0.210819
algebraic,0.210819
allowed,0.210819
applied,0.105409
arabic,0.105409
aspect,0.105409
away,0.105409
began,0.105409


### **CBoW** ###
**Bag of Words Model with Keras Deep Learning package**

In [76]:
from keras.preprocessing.text import Tokenizer

model = Tokenizer() ## init tokenizer
model.fit_on_texts(data[0]) ## change this data[0] to 1 if you want insert arabic text
print(f'Key : {list(model.word_index.keys())}') # Print Keys 



# summarize what was learned
print(model.word_counts)
print(model.document_count)
print(model.word_index)
print(model.word_docs)


#create bag of words representation 
text_to_vector = model.texts_to_matrix(data[0], mode='count')
my_cbow_result = pd.DataFrame(text_to_vector)
my_cbow_result



 

Key : ['e', 'a', 't', 'i', 'o', 'n', 'h', 'r', 's', 'c', 'm', 'l', 'd', 'w', 'b', 'f', 'p', 'g', 'u', 'y', 'v', 'k', 'j', 'z', 'x']
OrderedDict([('p', 15), ('e', 77), ('r', 33), ('h', 35), ('a', 72), ('s', 33), ('o', 45), ('n', 38), ('f', 16), ('t', 67), ('m', 25), ('i', 52), ('g', 15), ('c', 26), ('d', 22), ('v', 8), ('b', 17), ('y', 9), ('w', 20), ('k', 3), ('l', 24), ('z', 1), ('u', 12), ('j', 3), ('x', 1)])
814
{'e': 1, 'a': 2, 't': 3, 'i': 4, 'o': 5, 'n': 6, 'h': 7, 'r': 8, 's': 9, 'c': 10, 'm': 11, 'l': 12, 'd': 13, 'w': 14, 'b': 15, 'f': 16, 'p': 17, 'g': 18, 'u': 19, 'y': 20, 'v': 21, 'k': 22, 'j': 23, 'z': 24, 'x': 25}
defaultdict(<class 'int'>, {'p': 15, 'e': 77, 'r': 33, 'h': 35, 'a': 72, 's': 33, 'o': 45, 'n': 38, 'f': 16, 't': 67, 'm': 25, 'i': 52, 'g': 15, 'c': 26, 'd': 22, 'v': 8, 'b': 17, 'y': 9, 'w': 20, 'k': 3, 'l': 24, 'z': 1, 'u': 12, 'j': 3, 'x': 1})


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
810,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
811,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
812,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **OneHotEncoder from Sklearn and One_Hot from Keras** ###

In [82]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

values = np.array(tokenized_list)
print(values)
# integer encode
label_encoder = LabelEncoder()
my_int_encoded = label_encoder.fit_transform(values)
print(my_int_encoded)
# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
my_int_encoded = my_int_encoded.reshape(len(my_int_encoded), 1)
onehot_encoded_values = onehot_encoder.fit_transform(my_int_encoded)
print(onehot_encoded_values)
inverted_values = label_encoder.inverse_transform([np.argmax(onehot_encoded_values[0, :])])
print(inverted_values)

['perhaps' 'one' 'of' 'the' 'most' 'significant' 'advances' 'made' 'by'
 'arabic' 'mathematics' 'began' 'at' 'this' 'time' 'with' 'the' 'work'
 'of' 'alkhwarizmi' 'namely' 'the' 'beginnings' 'of' 'algebra' 'it' 'is'
 'important' 'to' 'understand' 'just' 'how' 'significant' 'this' 'new'
 'idea' 'was' 'it' 'was' 'a' 'revolutionary' 'move' 'away' 'from' 'the'
 'greek' 'concept' 'of' 'mathematics' 'which' 'was' 'essentially'
 'geometry' 'algebra' 'was' 'a' 'unifying' 'theory' 'which' 'allowed'
 'rational' 'numbers' 'irrational' 'numbers' 'geometrical' 'magnitudes'
 'etc' 'to' 'all' 'be' 'treated' 'as' 'algebraic' 'objects' 'it' 'gave'
 'mathematics' 'a' 'whole' 'new' 'development' 'path' 'so' 'much'
 'broader' 'in' 'concept' 'to' 'that' 'which' 'had' 'existed' 'before'
 'and' 'provided' 'a' 'vehicle' 'for' 'future' 'development' 'of' 'the'
 'subject' 'another' 'important' 'aspect' 'of' 'the' 'introduction' 'of'
 'algebraic' 'ideas' 'was' 'that' 'it' 'allowed' 'mathematics' 'to' 'be'
 'appl

In [79]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
# define the document
words = set(text_to_word_sequence(data[0]))
vocab_size = len(words)
print(vocab_size)
# integer encode the document
result = one_hot(data[0], round(vocab_size))
print(result)


84
[47, 51, 73, 83, 51, 33, 26, 31, 58, 82, 47, 68, 54, 41, 14, 26, 83, 4, 73, 12, 29, 60, 83, 59, 73, 68, 65, 41, 63, 25, 19, 62, 57, 33, 41, 3, 13, 25, 65, 25, 5, 15, 25, 72, 37, 83, 29, 51, 73, 47, 23, 25, 41, 39, 68, 25, 5, 50, 69, 23, 32, 82, 38, 26, 38, 52, 81, 40, 25, 13, 69, 15, 70, 17, 81, 65, 72, 47, 5, 12, 3, 25, 56, 11, 82, 40, 79, 51, 25, 56, 23, 70, 71, 43, 46, 12, 5, 77, 26, 49, 25, 73, 83, 59, 2, 63, 41, 73, 83, 51, 73, 17, 34, 25, 56, 65, 32, 47, 25, 69, 38, 25, 14, 79, 5, 26, 23, 70, 79, 44, 43]
