In [3]:
import pandas as pd
from keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from transformers import BertTokenizer, TFBertModel
from gensim.models import FastText
import tensorflow as tf
import stanza
import numpy as np
import openai
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [4]:
data = pd.read_csv("Answers islamic - Feuille 1.csv")
data.drop_duplicates(inplace=True)

In [5]:
# Load models
bert_model = TFBertModel.from_pretrained("aubmindlab/bert-base-arabertv02")
bert_tokenizer = BertTokenizer.from_pretrained('aubmindlab/bert-base-arabertv02')
transformer_model = load_model("Transformer_model.h5")




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'bert.embeddings.position_ids', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already u

In [6]:
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # Load GPT-2 tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")  # Load GPT-2 model


In [14]:
import torch


def generate_feedback(bert_output, student_answer, score):
    # Encode student answer with BERT tokenizer
    bert_input_ids = bert_tokenizer.encode(student_answer, return_tensors='tf')

    # Get BERT embeddings
    with torch.no_grad():
        bert_output = bert_model(bert_input_ids)[0]

    # Construct prompt for GPT-2
    prompt = f"Here is the student answer: {student_answer}. Their score was {score}. Based on this information, provide comprehensive and personalized feedback, highlighting strengths and areas for improvement. Be respectful and encouraging."

    # Encode prompt with GPT-2 tokenizer
    gpt2_input_ids = gpt2_tokenizer.encode(prompt, return_tensors='pt')

    # Generate feedback with GPT-2
    gpt2_output = gpt2_model.generate(gpt2_input_ids, max_length=150, num_beams=5, no_repeat_ngram_size=2)

    # Decode GPT-2 output
    feedback_text = gpt2_tokenizer.decode(gpt2_output[0], skip_special_tokens=True)

    return feedback_text


In [8]:
stanza.download("ar")
nlp = stanza.Pipeline("ar")

def preprocess_text(text):
    doc = nlp(text)
    tokens = [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        if word.upos != "PUNCT"
    ]
    return tokens

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-01-05 18:21:13 INFO: Downloading default packages for language: ar (Arabic) ...
2024-01-05 18:21:14 INFO: File exists: C:\Users\zakar\stanza_resources\ar\default.zip
2024-01-05 18:21:17 INFO: Finished downloading models and saved to C:\Users\zakar\stanza_resources.
2024-01-05 18:21:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.7.0.json:   0%|   …

2024-01-05 18:21:19 INFO: Loading these models for language: ar (Arabic):
| Processor | Package       |
-----------------------------
| tokenize  | padt          |
| mwt       | padt          |
| pos       | padt_charlm   |
| lemma     | padt_nocharlm |
| depparse  | padt_charlm   |
| ner       | aqmar_charlm  |

2024-01-05 18:21:19 INFO: Using device: cpu
2024-01-05 18:21:19 INFO: Loading: tokenize
2024-01-05 18:21:19 INFO: Loading: mwt
2024-01-05 18:21:19 INFO: Loading: pos
2024-01-05 18:21:19 INFO: Loading: lemma
2024-01-05 18:21:19 INFO: Loading: depparse
2024-01-05 18:21:20 INFO: Loading: ner
2024-01-05 18:21:20 INFO: Done loading processors!


In [9]:
data["answers"] = data["answers"].apply(preprocess_text)

In [13]:
data["answers"]

0                   [عيسى, علي, هُوَ, سَلَام]
1         [نَبِيّ, عيسى, عَلَى, هُوَ, سَلَام]
2      [نبي, الله, عيسى, عَلَى, هُوَ, سَلَام]
3                           [نبي, الله, عيسى]
4                              [نَبِيّ, عيسى]
                        ...                  
972                       [لَا, أعرف, جَوَاب]
973                               [لَا, أعرف]
974                              [لَا, عَلِم]
975                                     [لوط]
976                                    [ثمود]
Name: answers, Length: 842, dtype: object

In [16]:
# Initialize the Keras tokenizer
tokenizer = Tokenizer(filters="""'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ""""")
# Fit the tokenizer on your text data
tokenizer.fit_on_texts(data["answers"])
# Convert your text data to sequences of integers
sequences = tokenizer.texts_to_sequences(data["answers"])
# Find the length of the longest sequence
max_sequence_length = max(len(s) for s in sequences)
# Pad your sequences so they all have the same length
sequences = pad_sequences(sequences, max_sequence_length)
# Get a dictionary where the keys are words and the values are their corresponding integer values
word2idx = tokenizer.word_index
# Get the size of your vocabulary
vocab_size = len(word2idx) + 1

In [17]:
# Set the size of your embeddings
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
# Train FastText model
fasttext_model = FastText(data["answers"], vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4)
fasttext_model.save("fasttext_model2.bin")
# Fill the embedding matrix with FastText vectors
for word, idx in tokenizer.word_index.items():
    if word in fasttext_model.wv.key_to_index:
        embedding_matrix[idx] = fasttext_model.wv[word]
    else:
        print("word not exist in voca ---> " + word)

In [21]:
student_answer = input("Enter student answer: ")
student_answer_ids = tokenizer.texts_to_sequences([student_answer])[0]
student_answer_ids = pad_sequences([student_answer_ids], maxlen=24)
score = transformer_model.predict([student_answer_ids])[0][0]  # Replace with your model's prediction logic
# Encode student answer with BERT tokenizer
bert_input_ids = bert_tokenizer.encode(student_answer, return_tensors='tf')
# Get BERT embeddings
bert_output = bert_model(bert_input_ids)[0]
# Generate feedback
feedback_text = generate_feedback(bert_output, student_answer, score)
print("Generated feedback:", feedback_text)




APIRemovedInV1: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742
