<a href="https://colab.research.google.com/github/zahr-eddine/nlp_preprocessing_data/blob/main/nlp_preprocessing_data_using_nltk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Preprocessing data using NLTK** ##


**Dataset**

In [None]:
# Sample corpus used by the cells below: index 0 is an English paragraph and
# index 1 is an Arabic paragraph (they are read later as dataset[0] / dataset[1]).
dataset = [        
"""Perhaps one of the most significant advances made by Arabic mathematics began at this time with the work of al-Khwarizmi, namely
the beginnings of algebra. It is important to understand just how significant this new idea was. It was a revolutionary move away from
the Greek concept of mathematics which was essentially geometry. Algebra was a unifying theory which allowed rational
numbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects". It gave mathematics a whole new
development path so much broader in concept to that which had existed before, and provided a vehicle for future development of the
subject. Another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a
way which had not happened before.""",

 """ربما كانت أحد أهم التطورات التي قامت بها الرياضيات العربية التي بدأت في هذا الوقت بعمل الخوارزمي  وهي بدايات الجبر،ومن المهم فهم كيف كانت هذه الفكرة الجديدة مهمة، فقد كانت خطوة ثورية بعيدا عن
المفهوم اليوناني للرياضيات التي هي في جوهرها  هندسة، الجبركان نظرية موحدة تتحيح الأعداد الكسرية و الأعداد اللا كسرية ، والمقادير الهندسية و غيرها ، أن تتعامل على أنها أجسام جبرية، و أعطت الرياضيات ككل مسارا جديدًا للتطوربمفهوم 
 أوسع بكثير من الذي كان موجودًا من قبل ، وقدم وسيلة للتنمية في هذا الموضوع مستقبلا .و جانب آخر مهم لإدخال أفكار الجبر و هو أنه سمح بتطبيق الرياضيات على نفسها 
بطريقة  لم تحدث من قبل."""
]

# Print the raw list so the unprocessed text (including the "\n" line breaks) is visible.
print(dataset)

['Perhaps one of the most significant advances made by Arabic mathematics began at this time with the work of al-Khwarizmi, namely\nthe beginnings of algebra. It is important to understand just how significant this new idea was. It was a revolutionary move away from\nthe Greek concept of mathematics which was essentially geometry. Algebra was a unifying theory which allowed rational\nnumbers, irrational numbers, geometrical magnitudes, etc., to all be treated as "algebraic objects". It gave mathematics a whole new\ndevelopment path so much broader in concept to that which had existed before, and provided a vehicle for future development of the\nsubject. Another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a\nway which had not happened before.', 'ربما كانت أحد أهم التطورات التي قامت بها الرياضيات العربية التي بدأت في هذا الوقت بعمل الخوارزمي  وهي بدايات الجبر،ومن المهم فهم كيف كانت هذه الفكرة الجديدة مهمة، فقد كانت خط

In [None]:
!pip install nltk



In [None]:
import string
import nltk
import re
from nltk.corpus import stopwords
nltk.download('stopwords')
# Characters the cleaning helpers treat as punctuation: Arabic and typographic
# marks (including the tatweel "ـ") followed by ASCII `string.punctuation`.
# Some characters appear more than once; in the cells shown here the string is
# only used for membership tests and deletion tables, so duplicates are harmless.
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation  # all punctuation characters

'`÷×؛<>_()*&^%][ـ،/:"؟.,\'{}~¦+|!”…“–ـ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

**Remove punctuation**

In [None]:
def delete_punctuations(sentence):
  """Lower-case `sentence` and drop every character found in `punctuations`.

  Returns the cleaned text as a new string; whitespace and line breaks are
  left untouched.
  """
  removal_table = str.maketrans('', '', punctuations)
  lowered = sentence.lower()
  return lowered.translate(removal_table)

# Demo on the English paragraph.
print(delete_punctuations(dataset[0]))

perhaps one of the most significant advances made by arabic mathematics began at this time with the work of alkhwarizmi namely
the beginnings of algebra it is important to understand just how significant this new idea was it was a revolutionary move away from
the greek concept of mathematics which was essentially geometry algebra was a unifying theory which allowed rational
numbers irrational numbers geometrical magnitudes etc to all be treated as algebraic objects it gave mathematics a whole new
development path so much broader in concept to that which had existed before and provided a vehicle for future development of the
subject another important aspect of the introduction of algebraic ideas was that it allowed mathematics to be applied to itself in a
way which had not happened before


**Tokenization**

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt')
def sentence_tokenize(text):
  """Tokenize `text` with NLTK's `word_tokenize` and hand back its result."""
  tokens = word_tokenize(text)
  return tokens

# English paragraph: strip punctuation / lower-case first, then tokenize.
tokenized_list = sentence_tokenize(delete_punctuations(dataset[0]))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

**Remove English StopWords**

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')


def delete_english_stopWord(sentence, stop_words=None):
  """Return the tokens of `sentence` that are not English stop words.

  Args:
    sentence: iterable of word tokens.
    stop_words: optional collection of words to drop. When omitted, NLTK's
      English stop-word corpus (`stopwords.words('english')`) is used.

  Returns:
    A new list with the stop words removed; token order is preserved.
  """
  if stop_words is None:
    # Evaluate the corpus lookup once per call instead of once per token.
    stop_words = stopwords.words('english')
  # A set turns every membership test into a constant-time lookup.
  excluded = set(stop_words)
  return [word for word in sentence if word not in excluded]

# English tokens with the stop words filtered out; input to the stemming cell.
clean_sentence = delete_english_stopWord(tokenized_list)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Stemming**

In [None]:
def stem_sentence(token_list):
  """Apply NLTK's Porter stemmer to every token.

  Args:
    token_list: iterable of word tokens.

  Returns:
    A list with the stem of each token, in the original order.
  """
  stemmer = nltk.PorterStemmer()
  return [stemmer.stem(token) for token in token_list]

# Stem once and reuse the result for both the printout and the following cells.
stemmed_sentence = stem_sentence(clean_sentence)
print(stemmed_sentence)

['perhap', 'one', 'signific', 'advanc', 'made', 'arab', 'mathemat', 'began', 'time', 'work', 'alkhwarizmi', 'name', 'begin', 'algebra', 'import', 'understand', 'signific', 'new', 'idea', 'revolutionari', 'move', 'away', 'greek', 'concept', 'mathemat', 'essenti', 'geometri', 'algebra', 'unifi', 'theori', 'allow', 'ration', 'number', 'irrat', 'number', 'geometr', 'magnitud', 'etc', 'treat', 'algebra', 'object', 'gave', 'mathemat', 'whole', 'new', 'develop', 'path', 'much', 'broader', 'concept', 'exist', 'provid', 'vehicl', 'futur', 'develop', 'subject', 'anoth', 'import', 'aspect', 'introduct', 'algebra', 'idea', 'allow', 'mathemat', 'appli', 'way', 'happen']


**Lemmatizing**

In [None]:
nltk.download('wordnet')
def lemmet(token_list):
  """Run NLTK's WordNet lemmatizer over each token and collect the results in a list."""
  lemmatizer = nltk.WordNetLemmatizer()
  lemmas = []
  for token in token_list:
    lemmas.append(lemmatizer.lemmatize(token))
  return lemmas


# NOTE(review): the input here is the already-stemmed list, and the recorded
# output of this cell is identical to the stemmer's output — confirm whether
# `clean_sentence` was the intended input.
l = lemmet(stemmed_sentence)
print(l)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
['perhap', 'one', 'signific', 'advanc', 'made', 'arab', 'mathemat', 'began', 'time', 'work', 'alkhwarizmi', 'name', 'begin', 'algebra', 'import', 'understand', 'signific', 'new', 'idea', 'revolutionari', 'move', 'away', 'greek', 'concept', 'mathemat', 'essenti', 'geometri', 'algebra', 'unifi', 'theori', 'allow', 'ration', 'number', 'irrat', 'number', 'geometr', 'magnitud', 'etc', 'treat', 'algebra', 'object', 'gave', 'mathemat', 'whole', 'new', 'develop', 'path', 'much', 'broader', 'concept', 'exist', 'provid', 'vehicl', 'futur', 'develop', 'subject', 'anoth', 'import', 'aspect', 'introduct', 'algebra', 'idea', 'allow', 'mathemat', 'appli', 'way', 'happen']


In [None]:
# Re-assemble the processed English tokens into one space-separated string and display it.
full_sentence = ' '.join(l)
full_sentence

'perhap one signific advanc made arab mathemat began time work alkhwarizmi name begin algebra import understand signific new idea revolutionari move away greek concept mathemat essenti geometri algebra unifi theori allow ration number irrat number geometr magnitud etc treat algebra object gave mathemat whole new develop path much broader concept exist provid vehicl futur develop subject anoth import aspect introduct algebra idea allow mathemat appli way happen'

**Preprocessing Arabic using NLTK package**

In [None]:
def preprocess(text, punctuation_chars=None):
    """Strip punctuation from Arabic text and normalize letter variants.

    Args:
        text: the string to clean.
        punctuation_chars: optional string of characters to treat as
            punctuation. Defaults to the notebook-level `punctuations`.

    Returns:
        A new string in which
        * the tatweel (U+0640), when listed as punctuation, is deleted;
        * every other punctuation character is replaced by one space, so words
          separated only by punctuation (e.g. "الجبر،ومن") are not fused
          into a single token;
        * the alef variants become ا, ى becomes ي, ؤ and ئ become ء,
          ة becomes ه and گ becomes ك.
    """
    if punctuation_chars is None:
        punctuation_chars = punctuations

    tatweel = "\u0640"

    # Letter normalization (not elongation removal): one entry per variant.
    letter_map = {
        "إ": "ا",
        "أ": "ا",
        "آ": "ا",
        "ى": "ي",
        "ؤ": "ء",
        "ئ": "ء",
        "ة": "ه",
        "گ": "ك",
    }
    table = {ord(source): target for source, target in letter_map.items()}

    # Punctuation entries are written last so they take precedence, matching
    # the "punctuation first" order of the step-by-step version.
    for char in punctuation_chars:
        # The tatweel sits inside a word, so it is dropped outright; any other
        # mark becomes a space to keep the surrounding words apart.
        table[ord(char)] = None if char == tatweel else " "

    # A single pass applies both the punctuation handling and the normalization.
    return text.translate(table)
  
# Clean the Arabic paragraph once; the same result is printed and kept for later cells.
preprocessing_data = preprocess(dataset[1])
print(preprocessing_data)

ربما كانت احد اهم التطورات التي قامت بها الرياضيات العربيه التي بدات في هذا الوقت بعمل الخوارزمي  وهي بدايات الجبرومن المهم فهم كيف كانت هذه الفكره الجديده مهمه فقد كانت خطوه ثوريه بعيدا عن
المفهوم اليوناني للرياضيات التي هي في جوهرها  هندسه الجبركان نظريه موحده تتحيح الاعداد الكسريه و الاعداد اللا كسريه  والمقادير الهندسيه و غيرها  ان تتعامل علي انها اجسام جبريه و اعطت الرياضيات ككل مسارا جديدًا للتطوربمفهوم 
 اوسع بكثير من الذي كان موجودًا من قبل  وقدم وسيله للتنميه في هذا الموضوع مستقبلا و جانب اخر مهم لادخال افكار الجبر و هو انه سمح بتطبيق الرياضيات علي نفسها 
بطريقه  لم تحدث من قبل


In [None]:
pip install Arabic-Stopwords

Collecting Arabic-Stopwords
[?25l  Downloading https://files.pythonhosted.org/packages/7c/9e/40ee9b10f98b23b32bb7ca3f229ae78ae4379ebcb03cbb7b9e7399686ad8/Arabic_Stopwords-0.3-py3-none-any.whl (353kB)
[K     |████████████████████████████████| 358kB 8.1MB/s 
[?25hCollecting pyarabic>=0.6.2
[?25l  Downloading https://files.pythonhosted.org/packages/7b/e2/46728ec2f6fe14970de5c782346609f0636262c0941228f363710903aaa1/PyArabic-0.6.10.tar.gz (108kB)
[K     |████████████████████████████████| 112kB 45.8MB/s 
[?25hBuilding wheels for collected packages: pyarabic
  Building wheel for pyarabic (setup.py) ... [?25l[?25hdone
  Created wheel for pyarabic: filename=PyArabic-0.6.10-cp37-none-any.whl size=113324 sha256=b73ed0a25db64034f103e3bc74adb0f42ad124c81bdd3b7baa2f90c8480d6b1a
  Stored in directory: /root/.cache/pip/wheels/10/b8/f5/b7c1a50e6efb83544844f165a9b134afe7292585465e29b61d
Successfully built pyarabic
Installing collected packages: pyarabic, Arabic-Stopwords
Successfully installed A

In [None]:
import arabicstopwords.arabicstopwords as stp
# Build the package's stop-word list once, keep it for the merge below, and report its size.
stop_glob_words = stp.stopwords_list()
print(len(stop_glob_words))

13629


In [None]:
# Start from NLTK's Arabic stop words and append every entry of
# `stop_glob_words` that is not present yet (insertion order is kept).
# NOTE(review): the entries are not run through preprocess(), so stop words
# containing letters that preprocess() rewrites may not match the normalized
# tokens — confirm whether the list should be normalized too.
stopwords_list = stopwords.words('arabic')
_seen_words = set(stopwords_list)  # constant-time "already present?" checks
for item in stop_glob_words:
  if item not in _seen_words:
    _seen_words.add(item)
    stopwords_list.append(item)

len(stopwords_list)

13636

**Remove all Arabic stopwords**

In [None]:
def delete_arabic_stopWord(text, stop_words=None):
  """Return the tokens of `text` that are not Arabic stop words.

  Args:
    text: iterable of word tokens.
    stop_words: optional collection of words to drop. When omitted, the
      notebook-level `stopwords_list` is used.

  Returns:
    A new list with the stop words removed; token order is preserved.
  """
  if stop_words is None:
    stop_words = stopwords_list
  # Convert once to a set so each token costs a hash lookup instead of a scan
  # over the whole stop-word list.
  excluded = set(stop_words)
  return [word for word in text if word not in excluded]

In [None]:
# Tokenize the cleaned Arabic text, drop the stop words, then rebuild one
# space-separated string for display.
token_arabic = word_tokenize(preprocessing_data)
text_without_arabic_stop_word = delete_arabic_stopWord(token_arabic)
full_arabic_text = ' '.join(text_without_arabic_stop_word)
print(full_arabic_text)

ربما كانت احد اهم التطورات قامت الرياضيات العربيه بدات الوقت بعمل الخوارزمي بدايات الجبرومن المهم كانت الفكره الجديده مهمه كانت خطوه ثوريه بعيدا المفهوم اليوناني للرياضيات جوهرها هندسه الجبركان نظريه موحده تتحيح الاعداد الكسريه الاعداد اللا كسريه والمقادير الهندسيه ان تتعامل انها اجسام جبريه اعطت الرياضيات مسارا جديدًا للتطوربمفهوم اوسع بكثير موجودًا وقدم وسيله للتنميه الموضوع مستقبلا جانب اخر مهم لادخال افكار الجبر انه سمح بتطبيق الرياضيات بطريقه تحدث
