<a href="https://colab.research.google.com/github/urvashishr/oibsip_taskno1/blob/main/notebooks/method1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import zipfile
import os

zip_path = "/content/working.zip"   # path to your uploaded zip
extract_folder = "/content/working" # where to extract

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)

print("Extracted files to:", extract_folder)


Extracted files to: /content/working


In [6]:
# List files inside the extracted folder
for root, dirs, files in os.walk(extract_folder):
    for file in files:
        print(os.path.join(root, file))


/content/working/working/.DS_Store
/content/working/working/synthdata/.DS_Store
/content/working/working/synthdata/data.yaml
/content/working/working/synthdata/images/.DS_Store
/content/working/working/synthdata/images/val/val_0233.png
/content/working/working/synthdata/images/val/val_0703.png
/content/working/working/synthdata/images/val/val_0171.png
/content/working/working/synthdata/images/val/val_0625.png
/content/working/working/synthdata/images/val/val_0720.png
/content/working/working/synthdata/images/val/val_0503.png
/content/working/working/synthdata/images/val/val_0424.png
/content/working/working/synthdata/images/val/val_0299.png
/content/working/working/synthdata/images/val/val_0484.png
/content/working/working/synthdata/images/val/val_0364.png
/content/working/working/synthdata/images/val/val_0628.png
/content/working/working/synthdata/images/val/val_0279.png
/content/working/working/synthdata/images/val/val_0770.png
/content/working/working/synthdata/images/val/val_0050.p

In [100]:
!pip install easyocr nltk jellyfish editdistance langdetect

Collecting sympy>=1.13.3 (from torch->easyocr)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Using cached sympy-1.14.0-py3-none-any.whl (6.3 MB)
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.12
    Uninstalling sympy-1.12:
      Successfully uninstalled sympy-1.12
Successfully installed sympy-1.14.0


In [101]:
import easyocr
import nltk
import editdistance
import jellyfish
import re
from nltk.corpus import wordnet as wn, brown, stopwords
from nltk import word_tokenize, pos_tag
from langdetect import detect, DetectorFactory

DetectorFactory.seed = 0


In [102]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('brown')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [103]:
freq_dist = nltk.FreqDist(w.lower() for w in brown.words())
stop_words = set(stopwords.words("english"))


In [120]:
def soundex_code(word):
    try:
        return jellyfish.soundex(word)
    except:
        return ""

visual_confusions = {
    "qing": ["ring", "ping", "king", "zing"],
    "rn": ["m"],
    "sceen": ["screen", "scene"],
    "barkinq": ["barking"],
    "gud": ["good"],
    "0": ["o"],
    "1": ["l", "i"],
    "5": ["s"],
    "8": ["b"],
    "@": ["a"],
    "$": ["s"]
}

phonetic_confusions = {
    "qing": ["ring", "ping", "king", "zing"],
    "gud": ["good"]
}


In [121]:
def generate_candidates(word):
    candidates = set()
    w = word.lower()

    # Visual confusions
    for key, vals in visual_confusions.items():
        if key in w:
            for val in vals:
                candidates.add(w.replace(key, val))

    # Phonetic similarity
    for ph_word, ph_vals in phonetic_confusions.items():
        if soundex_code(w) == soundex_code(ph_word):
            candidates.update(ph_vals)

    candidates.add(w)
    return list(candidates)


In [122]:
def best_candidate(word, candidates, context_words, window=3, index=0):
    best_word = word
    best_score = -1
    w = word.lower()

    context = context_words[max(0, index-window): index] + context_words[index+1: index+1+window]

    for cand in candidates:
        score = 0
        score += max(0, 5 - editdistance.eval(w, cand))
        score += 0.01 * freq_dist[cand]

        for ctx in context:
            if ctx.lower() not in stop_words:
                syns1 = wn.synsets(cand)
                syns2 = wn.synsets(ctx)
                sim = syns1[0].wup_similarity(syns2[0]) if syns1 and syns2 else 0
                score += sim * 10

        if score > best_score:
            best_score = score
            best_word = cand

    return best_word


In [123]:
def filter_english_words(tokens):
    """Return only tokens that are likely English words (letters only)"""
    return [t for t in tokens if re.fullmatch(r"[A-Za-z]+", t)]

In [124]:
def correct_sentence(sentence):
    words = word_tokenize(sentence)
    tagged = pos_tag(words)
    corrected = []

    # Filter only alphabetic words for correction
    for i, (word, tag) in enumerate(tagged):
        if re.fullmatch(r"[A-Za-z]+", word):
            candidates = generate_candidates(word)
            context = [w for j, (w, _) in enumerate(tagged) if j != i]
            best = best_candidate(word, candidates, context, index=i)
            corrected.append(best)
        else:
            corrected.append(word)

    return " ".join(corrected)


In [125]:
# Number of images to process (for testing)
N = 5

# Get all images in your folder
import os
image_folder = "/content/working/working/synthdata/images/train"
image_files = [os.path.join(image_folder, f) for f in os.listdir(image_folder)
               if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

# Limit to first N images
image_files = image_files[:N]

# Initialize EasyOCR reader
import easyocr
reader = easyocr.Reader(['en'])

ocr_sentences = []

# OCR with progress print
for idx, img_path in enumerate(image_files, 1):
    result = reader.readtext(img_path, detail=0)
    ocr_sentences.extend(result)
    print(f"OCR done: {idx}/{len(image_files)} images")

print(f"Total OCR-extracted lines: {len(ocr_sentences)}")




OCR done: 1/5 images




OCR done: 2/5 images




OCR done: 3/5 images




OCR done: 4/5 images




OCR done: 5/5 images
Total OCR-extracted lines: 62


In [126]:
corrected_sentences = [correct_sentence(sent) for sent in ocr_sentences]

# Filter only English sentences
english_sentences = []
for sent in corrected_sentences:
    try:
        if detect(sent) == "en":
            english_sentences.append(sent)
    except:
        pass

print(f"Total corrected English sentences: {len(english_sentences)}")


Total corrected English sentences: 1


In [127]:
# Preview all corrected English sentences
if corrected_sentences:
    print("Corrected English outputs:\n")
    for i, sent in enumerate(corrected_sentences, 1):
        print(f"{i}. {sent}")
else:
    print("No meaningful English sentences were detected in OCR output.")


Corrected English outputs:

1. 6 1 b n
2. 9 p k
3. 3 p
4. e 9 9 b &
5. 3 f 97 l 7
6. 6 l
7. b 3 # k ( p 9 ^ p b e mkr a
8. 0 <
9. f 3p
10. m d
11. # j 4 ( €
12. 8 p w l
13. 0 `` xt h )
14. 0 m r € b 6 ]
15. n $ # d # n
16. 9 q
17. j r n 4 & ^ 5
18. p h j & n x
19. 0 b & > b > n
20. 1 ' u ] 4
21. la # [
22. l y fu
23. b < aa m <
24. p 0 &
25. ka / m6 2 /
26. 9 a ~
27. m n j l r e {
28. 0 f $ g
29. 1 6 / c
30. yn k
31. m # 9 ? /
32. 0 n 4 m $ a b
33. p j 0 0 n 1 r
34. ) _ 1 40 a t
35. 2
36. 2 &
37. 2 m j ml
38. 8 e € & k b
39. j # 8 7 &
40. 1 5 <
41. j 94 n 9 ~ 6
42. 1l # & > <
43. l ^ m k 0
44. m f 9 p
45. j ? n y
46. x m '41 <
47. 2 9 p ( <
48. mr $ & na ^
49. 94
50. 7 4 p ml a
51. cq / 0 € #
52. € 6 0
53. 9 # ni 3 k 0
54. ~198 j i
55. g }
56. b 6 & % l
57. 1 & md
58. 2 k q > 2 4
59. a k ) a 4 n 1 &
60. m a h a 9 l q
61. 8 > k # m
62. a e t
