In [None]:
from flask import Flask, flash, request, redirect, url_for, jsonify, send_file, make_response,session
from flask_cors import CORS, cross_origin
from roboflow import Roboflow
import torch
import urllib
import cv2
import base64
import numpy as np
import io
from PIL import Image
from matplotlib import pyplot as plt
import os
from transformers import TrOCRProcessor, AutoFeatureExtractor, AutoTokenizer, VisionEncoderDecoderModel, VitsModel, VitsTokenizer
from gtts import gTTS
import re
import torchaudio
import uuid

# --- Model loading -----------------------------------------------------------
# Everything below runs once when the cell executes and requires a CUDA device
# (both models are moved to "cuda").

# OCR pipeline: ViT image pre-processing + Urdu RoBERTa tokenizer, combined in a
# TrOCRProcessor, feeding the fine-tuned encoder-decoder stored in ./Saved-Model/.
feature_extractor = AutoFeatureExtractor.from_pretrained("google/vit-base-patch16-384")
decoder_tokenizer = AutoTokenizer.from_pretrained("urduhack/roberta-urdu-small")
# `feature_extractor=` is deprecated on TrOCRProcessor (the library's own
# warning asks for `image_processor=`), so pass the same object under the
# supported keyword.
processor = TrOCRProcessor(image_processor=feature_extractor, tokenizer=decoder_tokenizer)
loaded_model = VisionEncoderDecoderModel.from_pretrained("./Saved-Model/").to("cuda")

# Urdu text-to-speech model and tokenizer. The code that calls them in
# extract_text is currently commented out, but the names are kept available.
audio_model = VitsModel.from_pretrained("facebook/mms-tts-urd-script_arabic").to("cuda")
audio_tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-urd-script_arabic", language = "urdu")

# Upload extensions the service nominally accepts (not referenced by the routes
# visible in this notebook).
ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}

app = Flask(__name__)
# Flask-CORS names this option `supports_credentials`; the previous spelling
# `support_credentials` was an unknown option and therefore had no effect.
# This now matches the per-route `@cross_origin(supports_credentials=True)`.
CORS(app, supports_credentials=True)
app.config['CORS_HEADERS'] = 'Content-Type'

# Key used by Flask to sign the session cookie. Prefer supplying it through the
# FLASK_SECRET_KEY environment variable; the literal is kept only as a fallback
# so existing set-ups keep working.
# NOTE(review): replace the fallback with a random value before deploying.
app.secret_key = os.environ.get('FLASK_SECRET_KEY', 'my_secret_key')

# Roboflow client for the hosted line-detection model (workspace "urduocr",
# project "line-detection-urduocr", version 2).
# The API key is read from ROBOFLOW_API_KEY when set; the literal fallback keeps
# the notebook runnable as before.
# NOTE(review): a key committed to a notebook should be treated as leaked --
# rotate it and drop the fallback once the environment variable is in place.
rf = Roboflow(api_key=os.environ.get("ROBOFLOW_API_KEY", "sMDLRU6MwJA4ONNM61TU"))
project = rf.workspace("urduocr").project("line-detection-urduocr")
model = project.version(2).model

@app.route('/', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def hello():
    """Root endpoint: answers every GET/POST with a fixed JSON payload."""
    payload = {"method": "done"}
    return jsonify(payload)

@app.route('/detection', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def detect_lines():
    """Fetch an image from a client-supplied URL and run line detection on it.

    Expects a POST with a JSON body ``{"file": <url>}``. The URL is opened with
    ``urllib.request.urlopen``; when the response's content type is an image,
    the image is written to ``<uuid>.jpg`` in the working directory and passed
    to the Roboflow model (``confidence=10``, ``overlap=30``).

    Returns:
        On success, JSON with:
          * ``predicted_image`` -- base64 of the annotated image saved by Roboflow
          * ``is_image``        -- ``True``
          * ``my_data``         -- the prediction JSON, ``predictions`` sorted by ``y``
          * ``user_id``         -- the UUID naming the stored upload, which
            ``extract_text`` later reads and deletes
        On failure, a JSON ``error`` message with status 400 (bad/missing URL),
        405 (not a POST) or 415 (content is not an image). Previously these
        cases returned ``None`` or raised, which Flask reports as a 500.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded; import it explicitly so the call below cannot fail on that.
    import urllib.request

    if request.method != 'POST':
        return jsonify({"error": "Send a POST request with a JSON body containing 'file'."}), 405

    payload = request.get_json(silent=True)
    source_url = payload.get("file") if isinstance(payload, dict) else None
    if not source_url or not isinstance(source_url, str):
        return jsonify({"error": "JSON body must contain a 'file' URL.", "is_image": False}), 400

    # NOTE(security): the URL is client-controlled and is fetched as-is.
    # Consider restricting schemes/hosts if this service is reachable by
    # untrusted clients.
    try:
        with urllib.request.urlopen(source_url) as response:
            raw_bytes = response.read()
            # get_content_maintype() lower-cases the value and falls back to
            # "text" when the header is absent, so it never raises here.
            is_image = response.info().get_content_maintype() == "image"
    except (ValueError, OSError) as exc:
        # ValueError: malformed URL / unknown scheme; OSError covers URLError.
        return jsonify({"error": f"Could not fetch 'file': {exc}", "is_image": False}), 400

    if not is_image:
        return jsonify({"error": "The supplied file is not an image.", "is_image": False}), 415

    id_user = str(uuid.uuid4())
    upload_path = f"{id_user}.jpg"
    annotated_path = f"{id_user}-save_predict.jpg"

    pixels = np.array(Image.open(io.BytesIO(raw_bytes)))
    # NOTE(review): PIL arrays are RGB while cv2.imwrite interprets channels as
    # BGR, so the stored JPEG presumably has red/blue swapped. Left unchanged
    # because extract_text reads this file back with cv2.imread -- confirm
    # before altering either side.
    cv2.imwrite(upload_path, pixels)

    try:
        detect = model.predict(upload_path, confidence=10, overlap=30)
        data = detect.json()
        # Order detections top-to-bottom so downstream text follows reading order.
        data['predictions'].sort(key=lambda prediction: prediction['y'])
        detect.save(annotated_path)
        with open(annotated_path, 'rb') as annotated_file:
            image_base64 = base64.b64encode(annotated_file.read()).decode()
    except Exception:
        # The upload is only kept for the follow-up /extraction call; if
        # detection failed there will be no such call, so do not leave it behind.
        if os.path.exists(upload_path):
            os.remove(upload_path)
        raise
    finally:
        # The annotated image is only needed long enough to base64-encode it.
        if os.path.exists(annotated_path):
            os.remove(annotated_path)

    return jsonify({"predicted_image": image_base64, "is_image": True, "my_data": data, "user_id": id_user})

        
def _strip_ocr_artifacts(text):
    """Remove Latin letters and known leading junk from one decoded OCR line.

    The prefixes are checked in the same order as the original inline code, so
    the longer variants (``(''`` and ``اور:4]``) win over their shorter forms.
    """
    text = re.sub(r'[a-zA-Z]', '', text)
    if len(text) == 0:
        return text
    stripped = text.strip(" ")
    if stripped.startswith("(''"):
        return stripped[3:]
    if stripped.startswith("("):
        return stripped.lstrip("(")
    if stripped.startswith("اور:4]"):
        # NOTE(review): drops 7 characters although the prefix is 6 long --
        # presumably to swallow a following separator; kept as-is.
        return stripped[7:]
    if stripped.startswith("''"):
        # lstrip treats its argument as a character set: all leading quotes go.
        return stripped.lstrip("''")
    if stripped.startswith("اور"):
        return stripped[3:]
    return stripped


def _recognize_line(image):
    """Run the OCR model on one image array; return ``(clean_text, score)``.

    ``score`` is ``exp`` of the best ``sequences_scores`` entry returned by
    ``generate``, rounded to two decimals.
    """
    pixel_values = processor.feature_extractor(image, return_tensors="pt").pixel_values.to("cuda")
    generated = loaded_model.generate(pixel_values, output_scores=True, return_dict_in_generate=True)
    score = torch.exp(generated.sequences_scores).max().item()
    decoded = processor.batch_decode(generated.sequences, skip_special_tokens=True)[0]
    return _strip_ocr_artifacts(decoded), round(score, 2)


def _padded_roi(prediction, image_shape, pad=5):
    """Turn a centre-based box into ``(top, bottom, left, right)`` pixel bounds.

    The box is grown by ``pad`` pixels in each dimension and clamped to the
    image, because a negative start index would make NumPy slice from the
    opposite edge instead of from 0.
    """
    left = int(prediction['x'] - (prediction['width'] + pad) / 2)
    top = int(prediction['y'] - (prediction['height'] + pad) / 2)
    right = left + int(prediction['width'] + pad)
    bottom = top + int(prediction['height'] + pad)
    image_height, image_width = image_shape[:2]
    return max(top, 0), min(bottom, image_height), max(left, 0), min(right, image_width)


@app.route('/extraction', methods=['GET', 'POST'])
@cross_origin(supports_credentials=True)
def extract_text():
    """Run OCR on the image stored by ``detect_lines`` and return the text.

    Expects a POST with JSON ``{"user_id": <uuid>, "my_data": {"predictions": [...]}}``.
    Every prediction whose ``class`` is ``"Line"`` is cropped (5 px padding) and
    recognised separately; when ``predictions`` is empty the whole image is
    recognised as a single line. The stored ``<user_id>.jpg`` is deleted before
    returning, whether or not recognition succeeded.

    Returns:
        JSON with ``generated_text`` (lines joined by ``\\n``), ``text_lines``
        (list of ``{"index", "text", "score"}``) and ``audio_google`` /
        ``audio_facebook`` (both ``None``: speech synthesis is disabled).
        Errors are reported as JSON ``error`` with status 400 (bad body or
        ``user_id``), 404 (no stored image) or 405 (not a POST).
    """
    if request.method != 'POST':
        return jsonify({"error": "Send a POST request with 'user_id' and 'my_data'."}), 405

    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or "user_id" not in payload or "my_data" not in payload:
        return jsonify({"error": "JSON body must contain 'user_id' and 'my_data'."}), 400

    # user_id becomes part of a file path that is read and then deleted, so only
    # accept a real UUID and use its canonical form (no path separators).
    try:
        user_id = str(uuid.UUID(str(payload["user_id"])))
    except ValueError:
        return jsonify({"error": "'user_id' is not a valid UUID."}), 400

    data = payload["my_data"]
    predictions = data.get("predictions") if isinstance(data, dict) else None
    if not isinstance(predictions, list):
        return jsonify({"error": "'my_data' must contain a 'predictions' list."}), 400

    image_path = f"{user_id}.jpg"
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        return jsonify({"error": "No stored image for this 'user_id'."}), 404

    text_lines = []
    count = 1
    try:
        if len(predictions) == 0:
            # Nothing detected: treat the whole image as one line.
            text, score = _recognize_line(image)
            text_lines.append({"index": count, "text": text, "score": score})

        for prediction in predictions:
            if not isinstance(prediction, dict) or prediction.get('class') != "Line":
                continue

            top, bottom, left, right = _padded_roi(prediction, image.shape)
            roi = image[top:bottom, left:right]
            if roi.size == 0:
                # Box lies entirely outside the image; nothing to recognise.
                continue

            # Debug artefact: the crop is written out but not read back here.
            cv2.imwrite(f"cropped_images/cropped_{count}.jpg", roi)

            text, score = _recognize_line(roi)
            text_lines.append({"index": count, "text": text, "score": score})
            count += 1

        actual_generated_text = "\n".join(line["text"] for line in text_lines)
        session['is_image'] = False
    finally:
        # The upload is single-use; remove it even when recognition failed so
        # files do not accumulate in the working directory.
        if os.path.exists(image_path):
            os.remove(image_path)

    # Speech synthesis (gTTS / MMS-TTS) is disabled, so both audio fields are
    # returned empty to keep the response shape stable for clients.
    audio_base64 = None
    male_audio_output = None
    return jsonify({"generated_text": actual_generated_text, "audio_google": audio_base64, "text_lines": text_lines, "audio_facebook" : male_audio_output})
        
        
        
# Start Flask's built-in server on every interface, port 5000. The call blocks
# this cell until the server is interrupted.
app.run(
    host='0.0.0.0',
    port=5000,
    threaded=True,
    debug=False,
)

The class ViTFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use ViTImageProcessor instead.
The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor` instead.


loading Roboflow workspace...
loading Roboflow project...
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.57.136.24:5000
Press CTRL+C to quit
127.0.0.1 - - [06/Nov/2023 10:07:25] "OPTIONS /detection HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:07:35] "POST /detection HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:07:40] "OPTIONS /extraction HTTP/1.1" 200 -
`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.
127.0.0.1 - - [06/Nov/2023 10:07:57] "POST /extraction HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:13:45] "OPTIONS /detection HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:13:53] "POST /detection HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:13:58] "OPTIONS /extraction HTTP/1.1" 200 -
127.0.0.1 - - [06/Nov/2023 10:14:09] "POST /extraction HTTP/1.1" 200 -


In [None]:
# from transformers import VitsModel, VitsTokenizer
# model = VitsModel.from_pretrained("facebook/mms-tts-urd-script_arabic").to("cuda")
# tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-urd-script_arabic", language = "urdu")

In [None]:
# inputs = tokenizer(text="بنانے کے لیے قائداعظم کی رہنمائی میں مسلمانوں نے بہت محنت کی۔ پاکستان", return_tensors="pt").to("cuda")

In [None]:
# output = model(**inputs).waveform

In [None]:
# import torchaudio
# torchaudio.save('./save.wav', src=output.cpu().detach(), sample_rate=model.config.sampling_rate)

In [None]:
# output.cpu().detach()