In [None]:
# building machine learning system

In [None]:
# install libraries

In [1]:
!pip install tensorflow numpy

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# import libraries

In [2]:
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Bidirectional, Concatenate



In [None]:
# trying out on some example

In [None]:
# preparing an example data to test on - pattern is similar to the newsQA dataset

In [None]:
# context

In [3]:
context = """
Python is a high-level, interpreted programming language.
It was created by Guido van Rossum and first released in 1991.
It is widely used for web development, data science, AI, and more.
"""

In [None]:
# english questions and english answers - this is example

In [4]:
questions = [
    "Who created Python?",
    "When was Python first released?",
    "What is Python used for?"
]

answers_en = [
    "Guido van Rossum",
    "1991",
    "web development data science AI and more"
]

In [None]:
# english answer is to be translated to french

In [5]:
answers_fr = [
    "Guido van Rossum",
    "1991",
    "développement web science des données IA et plus"
]

In [None]:
# preprocessing the text - coverting the letters in lower letters, removing numbers, symbols.

In [8]:
import string

def processed_text(text):
    text = text.lower()  # lowercase
    text = text.translate(str.maketrans('', '', string.punctuation)) 
    return text

In [9]:
processed_text(context)

'\npython is a highlevel interpreted programming language\nit was created by guido van rossum and first released in 1991\nit is widely used for web development data science ai and more\n'

In [13]:
context_clean = processed_text(context)
questions_clean = [processed_text(q) for q in questions]
answers_clean = [processed_text(a) for a in answers_en]
answers_fr_clean = [processed_text(a) for a in answers_fr]

In [14]:
context_clean

'\npython is a highlevel interpreted programming language\nit was created by guido van rossum and first released in 1991\nit is widely used for web development data science ai and more\n'

In [15]:
questions_clean

['who created python',
 'when was python first released',
 'what is python used for']

In [16]:
answers_clean

['guido van rossum', '1991', 'web development data science ai and more']

In [17]:
answers_fr_clean 

['guido van rossum',
 '1991',
 'développement web science des données ia et plus']

In [None]:
# tokanizers - used from PART A of mandatory task 1 - using embedding layers and neural networks

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

processed_texts = [
    context_clean,                
] + questions_clean + answers_clean 

In [19]:
processed_texts

['\npython is a highlevel interpreted programming language\nit was created by guido van rossum and first released in 1991\nit is widely used for web development data science ai and more\n',
 'who created python',
 'when was python first released',
 'what is python used for',
 'guido van rossum',
 '1991',
 'web development data science ai and more']

In [None]:
# creating the layer using tensorflow

In [20]:
layer = TfidfVectorizer(max_features=100)  
X_layer = layer.fit_transform(processed_texts)

In [21]:
layer_words = layer.get_feature_names_out()
layer_vectors = X_layer.toarray().T

In [22]:
layer_words

array(['1991', 'ai', 'and', 'by', 'created', 'data', 'development',
       'first', 'for', 'guido', 'highlevel', 'in', 'interpreted', 'is',
       'it', 'language', 'more', 'programming', 'python', 'released',
       'rossum', 'science', 'used', 'van', 'was', 'web', 'what', 'when',
       'who', 'widely'], dtype=object)

In [None]:
# QA model - provides answer in english - code is used from the part C mandatory task 1

In [24]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

MODEL = "bert-large-uncased-whole-word-masking-finetuned-squad"
tokenizer_qa = AutoTokenizer.from_pretrained(MODEL)
qa_model = AutoModelForQuestionAnswering.from_pretrained(MODEL)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [25]:
def get_answer_english(question, context):
    """
    Input: English question + context
    Output: Predicted English answer
    """
    inputs = tokenizer_qa(question, context, return_tensors="pt")
    
    with torch.no_grad():
        outputs = qa_model(**inputs)
    
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits) + 1  # include last token
    
    answer_tokens = inputs["input_ids"][0][start_index:end_index]
    answer = tokenizer_qa.decode(answer_tokens)
    
    return answer

In [None]:
# using this on our example given above

In [28]:
for q in questions:
    ans = get_answer_english(q, context)
    print(f"Q: {q}")
    print(f"A: {ans}")

Q: Who created Python?
A: guido van rossum
Q: When was Python first released?
A: 1991
Q: What is Python used for?
A: web development, data science, ai, and more


In [None]:
# translation model from english to french - code is used from part B mandatory task 1

In [29]:
from tensorflow import keras
from tensorflow.keras import layers

In [30]:
tokenizer_en = keras.preprocessing.text.Tokenizer(filters='')
tokenizer_en.fit_on_texts([a.lower() for a in answers_en])
input_tensor = tokenizer_en.texts_to_sequences([a.lower() for a in answers_en])
input_tensor = keras.utils.pad_sequences(input_tensor, padding='post')

tokenizer_fr = keras.preprocessing.text.Tokenizer(filters='')
tokenizer_fr.fit_on_texts([a.lower() for a in answers_fr])
target_tensor = tokenizer_fr.texts_to_sequences([a.lower() for a in answers_fr])
target_tensor = keras.utils.pad_sequences(target_tensor, padding='post')

In [None]:
# parameters considered

In [31]:
embedding_dim = 64
units = 128
vocab_inp_size = len(tokenizer_en.word_index) + 1
vocab_tar_size = len(tokenizer_fr.word_index) + 1

In [None]:
# encoder

In [32]:
encoder_inputs = keras.Input(shape=(None,))
enc_emb = layers.Embedding(vocab_inp_size, embedding_dim, mask_zero=True)(encoder_inputs)
encoder_lstm = layers.LSTM(units, return_state=True)
_, state_h, state_c = encoder_lstm(enc_emb)
encoder_states = [state_h, state_c]


In [None]:
# decoder

In [33]:
decoder_inputs = keras.Input(shape=(None,))
dec_emb_layer = layers.Embedding(vocab_tar_size, embedding_dim, mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm = layers.LSTM(units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)
decoder_dense = layers.Dense(vocab_tar_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [None]:
# full model

In [34]:
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [35]:
decoder_input_data = target_tensor[:, :-1]
decoder_target_data = target_tensor[:, 1:]

In [None]:
# training on smaller dataset

In [36]:
model.fit([input_tensor, decoder_input_data], decoder_target_data, batch_size=2, epochs=50, verbose=0)

<keras.src.callbacks.history.History at 0x26d76f8aba0>

In [None]:
# inference models

In [37]:
encoder_model_inf = keras.Model(encoder_inputs, encoder_states)
decoder_state_input_h = keras.Input(shape=(units,))
decoder_state_input_c = keras.Input(shape=(units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
dec_emb2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_outputs2 = decoder_dense(decoder_outputs2)
decoder_states2 = [state_h2, state_c2]
decoder_model_inf = keras.Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_states2)

reverse_target_index = {v:k for k,v in tokenizer_fr.word_index.items()}

In [None]:
# defining french translation function

In [38]:
def translate_to_french(english_text):
    seq = tokenizer_en.texts_to_sequences([english_text.lower()])
    seq = keras.utils.pad_sequences(seq, maxlen=input_tensor.shape[1], padding='post')
    states_value = encoder_model_inf.predict(seq)
    target_seq = np.zeros((1,1))
    target_seq[0,0] = 1  # start with first token

    translation = ''
    for _ in range(20):
        output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)
        sampled_token_index = np.argmax(output_tokens[0,-1,:])
        sampled_word = reverse_target_index.get(sampled_token_index, '')
        if sampled_word == '':
            break
        translation += ' ' + sampled_word
        target_seq[0,0] = sampled_token_index
        states_value = [h,c]
    return translation.strip()

In [39]:
for ans in answers_en:
    a = translate_to_french(ans)
    print(a)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 395ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
van rossum
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

In [None]:
# integration of french and translation with QA model

In [42]:
def ask_question(question, context):
    english_answer = get_answer_english(question, context)
    french_answer = translate_to_french(english_answer)
    return english_answer, french_answer

In [43]:
for q in questions:
    eng_ans, fr_ans = ask_question(q, context)
    print(f"Q: {q}")
    print(f"Answer (English): {eng_ans}")
    print(f"Answer (French): {fr_ans}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
Q: Who created Python?
Answer (English): guido van rossum
Answer (French): van rossum
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Q: When was Python first released?
Answer (English): 1991
Answer (French): 
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step
Q: What is Python used for?
Answer (English): web development, data science, ai, and more
Answer (French): web


In [None]:
# here, user can give any input(question in english) in the context he wants by typing and get right output(answer in french)

In [46]:
user_q = input("Enter your question in English: ")
eng_ans, fr_ans = ask_question(user_q, context)
print(f"English Answer: {eng_ans}")
print(f"French Answer: {fr_ans}")

Enter your question in English:  When was Python first released?


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 70ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
English Answer: 1991
French Answer: 
