In [1]:
import pandas as pd
import numpy as np

import os

In [2]:
from tqdm import tqdm

In [None]:
# High-level Hugging Face helper: build a translation pipeline backed by the
# "facebook/nllb-200-distilled-600M" checkpoint. Source and target languages
# are supplied per call (see the example below).
from transformers import pipeline

pipe = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
)
# Example call:
# pipe("ارے بھائی، آپ کیسے ہیں؟",src_lang="urd_Arab",tgt_lang="eng_Latn")[0].get('translation_text')

# Dataset

# Urdu Monolingual Corpus

Source: https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-65A9-5

In [3]:
! curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-65A9-5{/urdu-tagged-corpus.gz}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0  253M    0 1391k    0     0   697k      0  0:06:12  0:00:01  0:06:11  698k
  1  253M    1 4207k    0     0  1404k      0  0:03:05  0:00:02  0:03:03 1404k
  2  253M    2 7151k    0     0  1789k      0  0:02:25  0:00:03  0:02:22 1790k
  3  253M    3  9.8M    0     0  2027k      0  0:02:08  0:00:04  0:02:04 2028k
  5  253M    5 12.7M    0     0  2092k      0  0:02:04  0:00:06  0:01:58 2575k
  5  253M    5 13.3M    0     0  1955k      0  0:02:12  0:00:06  0:02:06 2457k
  5  253M    5 13.5M    0     0  1726k      0  0:02:30  0:00:08  0:02:22 1918k
  6  253M    6 15.4M    0     0  1755k      0  0:02:28  0:00:08  0:02:20 1728k
  7  253M    7 18.1M    0     0  1856k      0  0:02

In [None]:
! yes y | gunzip urdu-tagged-corpus.gz

In [30]:


def parse_tag_file(filepath,max_lines=1000000, translate=True,output_file="translated.txt"):
    """Parse a ``word|TAG`` corpus into tagged sentences, optionally translating each one.

    Every non-empty line of ``filepath`` is treated as one sentence made of
    whitespace-separated ``word|TAG`` items. Items without a ``|`` are ignored;
    the tag is whatever follows the *last* ``|`` of an item. Lines that yield
    no tagged item are skipped entirely.

    Args:
        filepath: Path of the UTF-8 encoded tagged corpus to read.
        max_lines: Maximum number of sentences to collect. Also passed to
            ``tqdm`` as the progress-bar total.
        translate: When true, the words of each sentence are sent through the
            module-level ``pipe`` translation pipeline (``urd_Arab`` ->
            ``eng_Latn``) and one line ``<translation>  :: <source words>`` is
            appended to ``output_file``.
        output_file: File that receives the translations. It is opened in
            append mode, so repeated runs accumulate. It is not touched at all
            when ``translate`` is false.

    Returns:
        A list of sentences, each a list of ``(word, tag)`` tuples.
    """
    sentences = []
    # Open the output once (instead of once per input line) and only when it
    # is actually needed.
    out_handle = open(output_file, "a", encoding="utf-8") if translate else None
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            for line in tqdm(file, total=max_lines):
                # Checked before doing any work so that the sentence which
                # reaches the limit has already been written out in full.
                if len(sentences) >= max_lines:
                    break

                line = line.strip()
                if not line:
                    continue  # skip empty lines

                tokens = [
                    tuple(item.rsplit('|', 1))
                    for item in line.split()
                    if '|' in item
                ]
                if not tokens:
                    continue  # nothing tagged on this line: no sentence, no output

                sentences.append(tokens)

                if translate:
                    # Same text the pipeline has always been given: every word
                    # prefixed by a single space.
                    source_text = "".join(f" {word}" for word, _ in tokens)
                    translation = pipe(
                        source_text, src_lang="urd_Arab", tgt_lang="eng_Latn"
                    )[0].get('translation_text')
                    # Write exactly one line per freshly translated sentence and
                    # flush so partial progress survives an interrupted run.
                    out_handle.write(translation + "  :: " + source_text + "\n")
                    out_handle.flush()
    finally:
        if out_handle is not None:
            out_handle.close()

    return sentences


In [None]:
# Corpus location; pick the line that matches the platform in use.
# filepath = "urmono.tag" ## in windows
filepath = 'urdu-tagged-corpus' ## in linux

# Parse the corpus (translating as it goes), capped at ten million sentences.
sentences = parse_tag_file(filepath, max_lines=10_000_000, translate=True)

# Example: show how many sentences were read and what one looks like.
print(f"Total sentences: {len(sentences)}")
print("First sentence:", sentences[0])


# Split dataset

In [None]:
import random

# 80/20 train/test split with a fixed seed.
# The shuffle is applied to a copy: `random.shuffle` works in place, so
# shuffling `sentences` directly would permute an already-shuffled list on
# every re-run of this cell and silently change the split.
random.seed(42)
shuffled_sents = list(sentences)
random.shuffle(shuffled_sents)

split_idx = int(0.8 * len(shuffled_sents))
train_sents = shuffled_sents[:split_idx]
test_sents = shuffled_sents[split_idx:]


# HMM model

In [6]:
import nltk
from nltk.tag.hmm import HiddenMarkovModelTrainer
import dill

## Train

In [None]:


# Fit a supervised HMM tagger on the training split.
# NOTE(review): no estimator is passed, so NLTK's default is used — TODO confirm
# it copes acceptably with words that never occur in the training data.
trainer = HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_sents)

# Score the tagger on the held-out split and report it as a percentage.
accuracy = tagger.accuracy(test_sents)
print("Accuracy: {:.2%}".format(accuracy))


In [None]:


# Persist the trained tagger with dill so it can be reused without retraining.
with open("urdu_hmm_tagger.pkl", mode="wb") as model_file:
    dill.dump(tagger, model_file)


## Load saved model

In [7]:

# Read the serialized tagger back from disk.
# NOTE: unpickling (dill included) can execute arbitrary code — only load
# model files that you produced yourself or otherwise trust.
with open("urdu_hmm_tagger.pkl", mode="rb") as model_file:
    tagger = dill.load(model_file)

    

# Inference

In [8]:
# Tag a pre-tokenised Urdu sentence (one list element per word) with the tagger.
urdu_sentence = "آج موسم اچھا ہے".split()
tagged = tagger.tag(urdu_sentence)
print("Tagged:", tagged)


Tagged: [('آج', 'NN'), ('موسم', 'NN'), ('اچھا', 'ADJ'), ('ہے', 'VB')]
