### References and links used:

https://www.w3schools.com/python/python_regex.asp

https://www.regexpal.com

In [1]:
import re
import numpy as np

## 4.a

In [2]:
# Test a function with given inputs list
def test_function(test_input, function):
    for test in test_input:
        result = function(test)
        print(result) 

In [3]:
def regex_email_validator(string):
    """Return True if ``string`` has the shape ``local@domain.tld``.

    Accepted shape:
      * local part: one or more word characters (letters, digits, underscore) or dots,
      * exactly one "@",
      * domain: one or more ASCII letters or digits (no dots, so sub-domains are rejected),
      * a dot followed by exactly three ASCII letters.

    Args:
        string: Candidate e-mail address.

    Returns:
        bool: True when the whole string matches, False otherwise.
    """
    # Raw string: avoids invalid-escape warnings for \w, \d and \. in a normal literal.
    pattern = r"[\w.]+@[a-zA-Z\d]+\.[a-zA-Z]{3}"
    # fullmatch instead of search with ^...$: "$" also matches just before a
    # trailing newline, which let inputs such as "a@b.com\n" pass validation.
    return re.fullmatch(pattern, string) is not None

In [4]:
# Sample addresses for the e-mail validator. Expected results, in order:
# True, False (double "@"), True, False (dot inside the domain), False (five-letter TLD).
emails = ["username@domain.old", "username@@domain.old", "user_name@doma9in.old", "username@do.main.old", "username@domain.ookld"]
          
test_function(emails, regex_email_validator)

True
False
True
False
False


In [5]:
def regex_number_validator(string):
    """Return True if ``string`` is a phone number in one of the accepted formats.

    Accepted shape: one of the prefixes "0098", "+98" or "0", then the digit 9,
    then exactly nine more digits.
    NOTE(review): presumably Iranian mobile numbers - confirm against the task statement.

    Args:
        string: Candidate phone number.

    Returns:
        bool: True when the whole string matches, False otherwise.
    """
    # Raw string: avoids invalid-escape warnings for \+ and \d in a normal literal.
    pattern = r"(0098|\+98|0)9\d{9}"
    # fullmatch instead of search with ^...$: "$" also matches just before a
    # trailing newline, which let inputs such as "09143456789\n" pass validation.
    return re.fullmatch(pattern, string) is not None

In [6]:
# Sample numbers for the phone validator. Expected results, in order:
# True, True, True, False, False, False.
texts = ["+989143456789", "09143456789", "00989143456789", "+99143456780", "0122948328", "+98943456789"]

test_function(texts, regex_number_validator)

True
True
True
False
False
False


## 4.b.1 - Lemmatizer & Stemmer

In [7]:
def dictionary_check(string):
    """Map a known verb stem to its root form; return unknown input unchanged.

    Args:
        string: Stem to look up.

    Returns:
        The root stored for ``string`` in the lookup table, or ``string``
        itself when the table has no entry for it.
    """
    known_roots = {
        "رو": "رفت",
        "رفت": "رفت",
        "گفت": "گفت",
        "گوی": "گفت",
        "خور": "خورد",
        "خورد": "خورد",
        "نوشت": "نوشت",
    }
    return known_roots.get(string, string)
    

def persian_lemmatizer_one_sample(string):
    """Lemmatize a single word.

    Steps:
      1. Remove every zero-width non-joiner (U+200C).
      2. Return the word as-is if it is a listed exception.
      3. Strip the first matching prefix, then the first matching suffix
         (lists are scanned in order; at most one of each is removed).
      4. Map the remainder through ``dictionary_check``.

    Args:
        string: Word to lemmatize.

    Returns:
        The lemma as a string.
    """
    # Disabled wider affix lists, kept for reference:
    #     prefix_list = ["با","بی","بر","فرا","فرو","نا","هم","می"]
    #     suffix_list = ["آسا","ان","انه","تر","ترین","زار","سان","ستان","گار","وار","ها","ات","اون","ین","م","ش","ت","مان","تان","شان","ه"]
    word = string.replace("\u200c", "")

    # Exceptions skip both affix stripping and the dictionary lookup.
    if word in ("بوستان",):
        return word

    prefixes = ("می",)
    suffixes = (
        "ها",
        "ها",
        "ان",
        "ات",
        "هاند",
        "ه بودند",
        "ید",
        "م",
        "ند",
        "ید",
        "ه",
    )

    leading = next((p for p in prefixes if word.startswith(p)), None)
    if leading is not None:
        word = word[len(leading):]

    # The suffix test runs on the word after the prefix has been removed.
    trailing = next((s for s in suffixes if word.endswith(s)), None)
    if trailing is not None:
        word = word[:len(word) - len(trailing)]

    return dictionary_check(word)

In [8]:
# Quick check: the suffix "م" is stripped, leaving 'خورد'.
persian_lemmatizer_one_sample('خوردم')

'خورد'

In [9]:
def persian_stemmer_one_sample(string):
    """Stem a single word by stripping at most one prefix and one suffix.

    Every zero-width non-joiner (U+200C) is removed first. A listed exception
    is returned untouched. Otherwise the first matching prefix and then the
    first matching suffix (in list order) are cut off.

    Args:
        string: Word to stem.

    Returns:
        The stem as a string.
    """
    word = string.replace("\u200c", "")

    # Exceptions are returned before any affix is considered.
    if word in ("بوستان",):
        return word

    prefixes = ("می",)
    suffixes = ("ها", "ان", "ات", " اند", "بودند", "ید", "م", "ند", "ید")

    leading = next((p for p in prefixes if word.startswith(p)), None)
    if leading is not None:
        word = word[len(leading):]

    # The suffix test runs on the word after the prefix has been removed.
    trailing = next((s for s in suffixes if word.endswith(s)), None)
    if trailing is not None:
        word = word[:len(word) - len(trailing)]

    return word

In [10]:
# Quick check: prefix "می" and suffix "ند" are both stripped, leaving 'گفت'.
persian_stemmer_one_sample('میگفتند')

'گفت'

In [11]:
def lemmatizer_stemmer_one_sample(string):
    """Stem a word, then lemmatize the resulting stem.

    Args:
        string: Word to normalize.

    Returns:
        The output of ``persian_lemmatizer_one_sample`` applied to the output
        of ``persian_stemmer_one_sample``.
    """
    return persian_lemmatizer_one_sample(persian_stemmer_one_sample(string))

In [12]:
# Quick check of the combined stemmer + lemmatizer; the displayed result is 'رفت'.
lemmatizer_stemmer_one_sample('رفته اند')

'رفت'

In [13]:
def tokenizer(sentence):
    """Split a sentence into whitespace-separated tokens, ignoring the comma "،".

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: Tokens in order; an empty list for empty or blank input.
    """
    # Replace the comma with a space rather than deleting it: deleting glued
    # neighbouring words together whenever no whitespace followed the comma
    # ("a،b" became the single token "ab"). split() collapses the extra
    # whitespace, so input that already has a space after the comma tokenizes
    # exactly as before.
    return sentence.replace("،", " ").split()

In [14]:
# Quick check: the comma disappears and the sentence splits into three tokens.
tokenizer("سلام، من رفتم")

['سلام', 'من', 'رفتم']

In [15]:
def tokenizer_lemmatizer_stemmer(sentence):
    """Tokenize a sentence and normalize every token.

    Args:
        sentence: Text to process.

    Returns:
        list: One normalized token per token produced by ``tokenizer``, in
        order, each passed through ``lemmatizer_stemmer_one_sample``.
    """
    return [lemmatizer_stemmer_one_sample(word) for word in tokenizer(sentence)]

In [16]:
# Run the full tokenize -> stem -> lemmatize pipeline on one sample sentence.
# The "<s>" and "</s>" markers pass through unchanged (see the output below).
sentence = "<s> کتاب\u200cها، گفته\u200cبودند، نوشتید، فردا، اتصالات </s>"
tokenizer_lemmatizer_stemmer(sentence)

['<s>', 'کتاب', 'گفت', 'نوشت', 'فردا', 'اتصال', '</s>']

## 4.b.2 - Bi-gram

In [17]:
def onegrams_generator_for_one_sentence(sentence):
    """Return the unigrams of a sentence.

    Args:
        sentence: Text to process.

    Returns:
        list: A new list holding every normalized token produced by
        ``tokenizer_lemmatizer_stemmer``, in order.
    """
    normalized = tokenizer_lemmatizer_stemmer(sentence)
    return [normalized[position] for position in range(len(normalized))]


def add_one_sentence_into_onegram_dictionary(sentence, onegrams_dictionary):
    """Add the unigram counts of one sentence to ``onegrams_dictionary`` in place.

    Args:
        sentence: Text whose unigrams are counted.
        onegrams_dictionary: Mapping of unigram -> count; updated in place.

    Returns:
        None.
    """
    for token in onegrams_generator_for_one_sentence(sentence):
        # Unseen unigrams start from zero.
        onegrams_dictionary[token] = onegrams_dictionary.get(token, 0) + 1

            
def all_onegrams_dictionary_generator(sentences_list):
    """Count unigrams over a collection of sentences.

    Args:
        sentences_list: Iterable of sentences.

    Returns:
        dict: Unigram -> number of occurrences across all sentences.
    """
    counts = {}
    for text in sentences_list:
        add_one_sentence_into_onegram_dictionary(text, counts)
    return counts

In [18]:
# Test onegrams functions
# onegrams = onegrams_generator_for_one_sentence(sentence)
# onegrams_dictionary = all_onegrams_dictionary_generator([sentence])

In [19]:
def bigrams_generator_for_one_sentence(sentence):
    """Return the bigrams of a sentence as "$$$$"-joined strings.

    Args:
        sentence: Text to process.

    Returns:
        list[str]: One string per pair of adjacent normalized tokens, in
        order, built as ``first + "$$$$" + second``. Empty when the sentence
        has fewer than two tokens.
    """
    normalized = tokenizer_lemmatizer_stemmer(sentence)
    # Pairing the list with itself shifted by one yields every adjacent pair.
    return ["$$$$".join(pair) for pair in zip(normalized, normalized[1:])]

def bigram_priner(bigrams):
    """Print every bigram as the list of its "$$$$"-separated parts.

    Args:
        bigrams: Iterable of "$$$$"-joined bigram strings.

    Prints one list per line. Returns ``None``.
    """
    separator = "$$$$"
    for joined in bigrams:
        parts = joined.split(separator)
        print(parts)

In [20]:
def add_one_sentence_into_bigram_dictionary(sentence, bigrams_dictionary):
    """Add the bigram counts of one sentence to ``bigrams_dictionary`` in place.

    Args:
        sentence: Text whose bigrams are counted.
        bigrams_dictionary: Mapping of "$$$$"-joined bigram -> count; updated in place.

    Returns:
        None.
    """
    for pair in bigrams_generator_for_one_sentence(sentence):
        # Unseen bigrams start from zero.
        bigrams_dictionary[pair] = bigrams_dictionary.get(pair, 0) + 1

def all_bigrams_dictionary_generator(sentences_list):
    """Count bigrams over a collection of sentences.

    Args:
        sentences_list: Iterable of sentences.

    Returns:
        dict: "$$$$"-joined bigram -> number of occurrences across all sentences.
    """
    counts = {}
    for text in sentences_list:
        add_one_sentence_into_bigram_dictionary(text, counts)
    return counts

In [21]:
# Test bi-grams functions
# bigrams = bigrams_generator_for_one_sentence(sentence)
# bigram_priner(bigrams)
# bigrams_dictionary = all_bigrams_dictionary_generator([sentence])

In [22]:
def onegrams_and_bigrams_dictionary_generator(sentences_list):
    """Build the unigram and bigram count tables for a training set.

    Args:
        sentences_list: Collection of training sentences. It is iterated twice,
            first for the bigram counts and then for the unigram counts.

    Returns:
        tuple: ``(onegrams_dictionary, bigrams_dictionary)``.
    """
    # Bigram counts are computed first, unigram counts second.
    bigram_counts = all_bigrams_dictionary_generator(sentences_list)
    return all_onegrams_dictionary_generator(sentences_list), bigram_counts

In [23]:
# Training corpus: five sentences, each wrapped in "<s>" ... "</s>" markers,
# with words separated by "،" followed by a space.
train_set = [
    "<s> کتاب\u200cها، گفته\u200cبودند، نوشتید، فردا، اتصالات </s>",
    "<s> کتاب\u200cها، آن\u200cها، درختان، می\u200cروم، می\u200cگویید </s>",
    "<s> می\u200cخورم، می\u200cگویید، نوشتید، اتصالات </s>",
    "<s> آن\u200cها، درختان، رفته\u200cاند، کتاب\u200cها، اتصالات </s>",
    "<s> کتاب\u200cها، رفته\u200cاند، می\u200cخورم، نوشتید </s>",
]

In [24]:
# Build the unigram and bigram count tables from the training sentences.
onegrams_dictionary, bigrams_dictionary = onegrams_and_bigrams_dictionary_generator(train_set)

In [25]:
def dictionary_printer(dictionary, bi=False):
    """Print every entry of a count table as a ``(key, count)`` tuple.

    Args:
        dictionary: Mapping of n-gram -> count.
        bi: When true, each key is split on "$$$$" and shown as a list of its
            parts; otherwise the key is printed unchanged.

    Prints one tuple per line. Returns ``None``.
    """
    for key, value in dictionary.items():
        shown_key = key.split("$$$$") if bi else key
        print((shown_key, value))

In [26]:
# Show the unigram counts, one (token, count) tuple per line.
dictionary_printer(onegrams_dictionary)

('<s>', 5)
('کتاب', 4)
('گفت', 3)
('نوشت', 3)
('فردا', 1)
('اتصال', 3)
('</s>', 5)
('آن', 2)
('درخت', 2)
('رفت', 3)
('خورد', 2)


In [27]:
# Show the bigram counts, one ([first, second], count) tuple per line.
dictionary_printer(bigrams_dictionary, True)

(['<s>', 'کتاب'], 3)
(['کتاب', 'گفت'], 1)
(['گفت', 'نوشت'], 2)
(['نوشت', 'فردا'], 1)
(['فردا', 'اتصال'], 1)
(['اتصال', '</s>'], 3)
(['کتاب', 'آن'], 1)
(['آن', 'درخت'], 2)
(['درخت', 'رفت'], 2)
(['رفت', 'گفت'], 1)
(['گفت', '</s>'], 1)
(['<s>', 'خورد'], 1)
(['خورد', 'گفت'], 1)
(['نوشت', 'اتصال'], 1)
(['<s>', 'آن'], 1)
(['رفت', 'کتاب'], 1)
(['کتاب', 'اتصال'], 1)
(['کتاب', 'رفت'], 1)
(['رفت', 'خورد'], 1)
(['خورد', 'نوشت'], 1)
(['نوشت', '</s>'], 1)


In [29]:
def one_bigram_probability_calculator(bigram, onegrams_dictionary, bigrams_dictionary, log=False):
    """Return the add-one (Laplace) smoothed probability of one bigram.

    Computes ``(count(first, second) + 1) / (count(first) + V)`` where
    ``V = len(onegrams_dictionary) - 1``.
    NOTE(review): the "- 1" presumably excludes the sentence-start marker
    "<s>", which never occurs as the second word of a bigram - confirm.

    Args:
        bigram: Bigram encoded as ``first + "$$$$" + second``.
        onegrams_dictionary: Mapping of unigram -> count.
        bigrams_dictionary: Mapping of "$$$$"-joined bigram -> count.
        log: When true, print the counts, V and the resulting probability.
            Defaults to False, matching the other calculator functions.

    Returns:
        float: The smoothed probability.

    Raises:
        ValueError: If ``bigram`` does not split into exactly two parts.
    """
    # Unpacking into exactly two names keeps the original contract: a key that
    # does not split into two parts raises ValueError. Only the first word is
    # needed for the denominator.
    first, _second = bigram.split("$$$$")
    V = len(onegrams_dictionary)-1

    # Unseen bigrams and unseen first words both count as zero.
    count_both = bigrams_dictionary.get(bigram, 0)
    count_first = onegrams_dictionary.get(first, 0)

    p = (count_both + 1)/(count_first + V)
    if log:
        print("count_both: ",count_both, "count_first: ",count_first, "V: ", V, "p: ", p)

    return p

        
def one_sentence_probability_calculator(sentence, onegrams_dictionary, bigrams_dictionary, log=False):
    """Return the probability of a sentence as the product of its bigram probabilities.

    Args:
        sentence: Text to score.
        onegrams_dictionary: Mapping of unigram -> count.
        bigrams_dictionary: Mapping of "$$$$"-joined bigram -> count.
        log: Forwarded to ``one_bigram_probability_calculator`` for each bigram.

    Returns:
        The product of all bigram probabilities; the integer 1 when the
        sentence yields no bigrams.
    """
    running_product = 1
    for pair in bigrams_generator_for_one_sentence(sentence):
        running_product *= one_bigram_probability_calculator(
            pair, onegrams_dictionary, bigrams_dictionary, log
        )
    return running_product


def test_set_probability_calculator(test_set, train_set, log=False):
    """Build the count tables from ``train_set`` and print the probability of every test sentence.

    Args:
        test_set: Sequence of sentences to score.
        train_set: Collection of sentences used to build the unigram and bigram counts.
        log: Forwarded to ``one_sentence_probability_calculator``.

    Prints one line per test sentence, numbered from 0. Returns ``None``.
    """
    unigram_counts, bigram_counts = onegrams_and_bigrams_dictionary_generator(train_set)

    for position, candidate in enumerate(test_set):
        score = one_sentence_probability_calculator(candidate, unigram_counts, bigram_counts, log)
        print("Probability of sentence number {} in the test set is {}".format(position, score))

In [30]:
# Sentences to score. "بوستان" does not occur in train_set, so bigrams that
# start with it are scored with count_first == 0 (see the logs below).
test_set = [
    "<s> کتاب\u200cها، می\u200cگویید، می\u200cخورم، بوستان، اتصالات </s>",
    "<s> بوستان، رفته\u200cاند، کتاب\u200cها، می\u200cگویید، می\u200cگویید </s>",
]

In [31]:
# Here are the details

In [32]:
# Per-bigram breakdown for the first test sentence; the displayed value is the
# product of the printed p values.
one_sentence_probability_calculator(test_set[0], onegrams_dictionary, bigrams_dictionary, log=True)

count_both:  3 count_first:  5 V:  10 p:  0.26666666666666666
count_both:  1 count_first:  4 V:  10 p:  0.14285714285714285
count_both:  0 count_first:  3 V:  10 p:  0.07692307692307693
count_both:  0 count_first:  2 V:  10 p:  0.08333333333333333
count_both:  0 count_first:  0 V:  10 p:  0.1
count_both:  3 count_first:  3 V:  10 p:  0.3076923076923077


7.513853667699822e-06

In [33]:
# Per-bigram breakdown for the second test sentence; the displayed value is the
# product of the printed p values.
one_sentence_probability_calculator(test_set[1], onegrams_dictionary, bigrams_dictionary,  log=True)

count_both:  0 count_first:  5 V:  10 p:  0.06666666666666667
count_both:  0 count_first:  0 V:  10 p:  0.1
count_both:  1 count_first:  3 V:  10 p:  0.15384615384615385
count_both:  1 count_first:  4 V:  10 p:  0.14285714285714285
count_both:  0 count_first:  3 V:  10 p:  0.07692307692307693
count_both:  1 count_first:  3 V:  10 p:  0.15384615384615385


1.7339662310076517e-06

In [34]:
# Finally, we run everything through one function and print the final results

In [35]:
# Rebuilds the count tables from train_set and prints the probability of every
# test sentence, without the per-bigram logs.
test_set_probability_calculator(test_set, train_set, log=False)

Probability of sentence number 0 in the test set is 7.513853667699822e-06
Probability of sentence number 1 in the test set is 1.7339662310076517e-06
