In [1]:
import numpy as np
import pandas as pd
import cv2
import os
from scipy.fftpack import fft

# Function to compute FFT features from an image window
def compute_fft_features(window, num_features=50):
    # Convert to grayscale if the window is not already
    if len(window.shape) == 3:
        window = cv2.cvtColor(window, cv2.COLOR_BGR2GRAY)
    
    # Flatten the window to a 1D array for FFT computation
    window_flat = window.flatten()
    
    # Compute FFT and take the first `num_features` magnitudes
    fft_features = np.abs(fft(window_flat))[:num_features]
    
    return fft_features

# Load data from the Excel file
labels_df = pd.read_excel(r"C:\Users\pavan\Downloads\gt_WIndow (2) 5.xlsx")  # Adjust the path if necessary

# Parameters
window_width = 30  # Width of each sliding window in pixels
step_size = 30     # Step size of the sliding window in pixels
image_folder = r"C:\Users\pavan\Downloads\color_window_double1 (1) 10\color_window_double1 (1)\color_window_double1"# Adjust the path if necessary

# Dictionary to store FFT feature sequences for each character across all images
character_fft_sequences = {}

# Process each image (word) in the dataset
for index, row in labels_df.iterrows():
    image_name = row['image name']         # Adjust column name if necessary
    character_sequence = row['Label']      # Adjust column name if necessary
    
    # Load the corresponding image
    image_path = os.path.join(image_folder, image_name)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Image {image_name} could not be loaded.")
        continue

    image_width = image.shape[1]  # Get image width
    
    # Calculate the width of each character region based on the sequence length
    num_characters = len(character_sequence)
    character_width = image_width // num_characters
    
    # Loop through each character in the sequence and collect its FFT features
    for i, char in enumerate(character_sequence):
        # Define the region corresponding to the current character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = image[:, region_start:region_end]  # Assume height is all rows
        
        # Split the character region into windows to capture FFT features
        num_windows = (character_width - window_width) // step_size + 1
        char_fft_sequence = []
        
        for j in range(num_windows):
            # Calculate the start and end of the window within the character region
            window_start = region_start + j * step_size
            window_end = window_start + window_width
            
            # Extract the window
            window = image[:, window_start:window_end]
            
            # Compute FFT features for this window
            fft_features = compute_fft_features(window)
            char_fft_sequence.append(fft_features)
        
        # Append this character's FFT features to the global dictionary
        if char not in character_fft_sequences:
            character_fft_sequences[char] = []
        character_fft_sequences[char].append(char_fft_sequence)

# Print the number of FFT feature sequences for each character
for char, sequences in character_fft_sequences.items():
    print(f"Character '{char}' has {len(sequences)} sequences of FFT features.")

Character 'െ' has 618 sequences of FFT features.
Character 'ത' has 535 sequences of FFT features.
Character 'അ' has 103 sequences of FFT features.
Character '്' has 1202 sequences of FFT features.
Character 'ാ' has 993 sequences of FFT features.
Character 'ി' has 729 sequences of FFT features.
Character 'പ' has 339 sequences of FFT features.
Character 'റ' has 432 sequences of FFT features.
Character 'ര' has 681 sequences of FFT features.
Character 'ു' has 455 sequences of FFT features.
Character 'ള' has 78 sequences of FFT features.
Character 'ക' has 473 sequences of FFT features.
Character 'ഷ' has 105 sequences of FFT features.
Character 'മ' has 361 sequences of FFT features.
Character 'ആ' has 28 sequences of FFT features.
Character 'ഠ' has 61 sequences of FFT features.
Character 'ൽ' has 40 sequences of FFT features.
Character 'ർ' has 62 sequences of FFT features.
Character 'ഇ' has 30 sequences of FFT features.
Character 'ങ' has 37 sequences of FFT features.
Character 'ന' has 674 sequ

In [2]:

import numpy as np
from hmmlearn import hmm

# Dictionary to store character HMMs
character_hmms = {}
num_states=4
# Example: 'character_dct_sequences' contains the DCT feature sequences for each character
for char, sequences in character_fft_sequences.items():
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    if len(sequences) == 0:
        print(f"Warning: No valid sequences for character {char}. Skipping this character.")
        continue  # Skip to the next character if no valid sequences
    if len(sequences) >=15:
    # Prepare training data for the HMM
        X = np.vstack(sequences)  # Stack the sequences into a single array
        lengths = [len(seq) for seq in sequences]  # Length of each sequence
    
        # Initialize HMM for this character
        model = hmm.GaussianHMM(n_components=num_states, covariance_type="diag", n_iter=1000)
    
        # Train the HMM with the character’s DCT feature sequences
        model.fit(X, lengths)
    
        # Store the trained model
        character_hmms[char] = model
        print(f"Model trained for character: {char}")


Model is not converging.  Current: -1076975.619582473 is not greater than -1076975.4574898942. Delta is -0.16209257883019745


Model trained for character: െ


Model is not converging.  Current: -559148.298698807 is not greater than -559148.1182002898. Delta is -0.1804985172348097


Model trained for character: ത
Model trained for character: അ


Model is not converging.  Current: -861102.9126763333 is not greater than -861100.4020951531. Delta is -2.51058118022047


Model trained for character: ്
Model trained for character: ാ
Model trained for character: ി


Model is not converging.  Current: -391609.8434391378 is not greater than -391606.7785456187. Delta is -3.064893519098405


Model trained for character: പ


Model is not converging.  Current: -512129.2845176977 is not greater than -512129.1921064182. Delta is -0.09241127950372174


Model trained for character: റ
Model trained for character: ര
Model trained for character: ു
Model trained for character: ള


Model is not converging.  Current: -542184.9220411909 is not greater than -542183.1629330177. Delta is -1.7591081731952727


Model trained for character: ക


Model is not converging.  Current: -99771.95543928485 is not greater than -99771.81096386103. Delta is -0.14447542381822132


Model trained for character: ഷ
Model trained for character: മ


Model is not converging.  Current: -72355.00585443541 is not greater than -72355.00566824073. Delta is -0.00018619468028191477


Model trained for character: ആ
Model trained for character: ഠ
Model trained for character: ൽ


Model is not converging.  Current: -72922.07288578589 is not greater than -72922.0648620497. Delta is -0.008023736198083498


Model trained for character: ർ
Model trained for character: ഇ
Model trained for character: ങ
Model trained for character: ന
Model trained for character: ജ
Model trained for character: ട


Model is not converging.  Current: -361413.90383511543 is not greater than -361413.630147263. Delta is -0.27368785243015736


Model trained for character: വ


Model is not converging.  Current: -86068.29688656218 is not greater than -86067.0401561372. Delta is -1.2567304249823792


Model trained for character: ഹ
Model trained for character: ം


Model is not converging.  Current: -399929.4991726757 is not greater than -399927.7090850245. Delta is -1.7900876512285322


Model trained for character: യ


Model is not converging.  Current: -1009072.0596383412 is not greater than -1009071.7853262965. Delta is -0.2743120447266847


Model trained for character: "
Model trained for character: ധ


Model is not converging.  Current: -230806.3519049973 is not greater than -230804.5707758361. Delta is -1.7811291612160858


Model trained for character: ച
Model trained for character: എ


Model is not converging.  Current: -328152.0026404981 is not greater than -328149.4733263207. Delta is -2.5293141774018295


Model trained for character: ല


Model is not converging.  Current: -78968.50074889098 is not greater than -78968.2545993671. Delta is -0.24614952388219535


Model trained for character: ഴ
Model trained for character: ദ


Model is not converging.  Current: -107630.18448015887 is not greater than -107629.85826524875. Delta is -0.3262149101210525


Model trained for character: ഗ
Model trained for character: ഉ


Model is not converging.  Current: -169347.2588292024 is not greater than -169344.80298518113. Delta is -2.4558440212567803


Model trained for character: ണ


Model is not converging.  Current: -83472.67579523563 is not greater than -83472.58244868017. Delta is -0.09334655545535497


Model trained for character: ൂ
Model trained for character: ൻ


Model is not converging.  Current: -157932.66359520945 is not greater than -157932.57595766056. Delta is -0.08763754888786934


Model trained for character: സ


Model is not converging.  Current: -105969.20518315141 is not greater than -105968.92335504029. Delta is -0.28182811112492345


Model trained for character: ീ


Model is not converging.  Current: -85608.86495182695 is not greater than -85608.86096661972. Delta is -0.003985207236837596


Model trained for character: ഭ
Model trained for character: ഖ


Model is not converging.  Current: -109911.85375501106 is not greater than -109910.41530664629. Delta is -1.4384483647736488


Model trained for character: ശ


Model is not converging.  Current: -95631.57083801953 is not greater than -95631.30483165686. Delta is -0.26600636266812216
Model is not converging.  Current: -28819.232477388134 is not greater than -28818.246883360334. Delta is -0.9855940277993795


Model trained for character: ൈ


Model is not converging.  Current: -38187.13831142917 is not greater than -38186.289992938706. Delta is -0.8483184904616792


Model trained for character: ഥ
Model trained for character: ൾ
Model trained for character: ഘ
Model trained for character: ൃ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.
Some rows of transmat_ have zero sum because no transition from the state was ever observed.


Model trained for character: ൊ


Model is not converging.  Current: -24178.34702140592 is not greater than -24178.1700038745. Delta is -0.17701753141955123


Model trained for character: ഞ
Model trained for character: ബ


Some rows of transmat_ have zero sum because no transition from the state was ever observed.
  a -= a_lse
Model is not converging.  Current: -inf is not greater than -13937.112986046941. Delta is -inf


Model trained for character: ഃ


In [5]:
import re
import joblib
 
# Assuming your models are in a dictionary called `character_hmms`
# e.g., `character_hmms = {'െ': trained_model_1, 'ത': trained_model_2, ... }`
 
def sanitize_filename(char):
    # Replace invalid filename characters with an underscore or remove them
    return re.sub(r'[<>:"/\\|?*]', '_', char)
 
for char, model in character_hmms.items():
    # Sanitize character to create a valid filename
    sanitized_char = sanitize_filename(char)
    print(char,sanitized_char )
    # Save the trained HMM model to a file with the sanitized name
    joblib.dump(model, f"{sanitized_char}_hmm.pkl")

െ െ
ത ത
അ അ
് ്
ാ ാ
ി ി
പ പ
റ റ
ര ര
ു ു
ള ള
ക ക
ഷ ഷ
മ മ
ആ ആ
ഠ ഠ
ൽ ൽ
ർ ർ
ഇ ഇ
ങ ങ
ന ന
ജ ജ
ട ട
വ വ
ഹ ഹ
ം ം
യ യ
" _
ധ ധ
ച ച
എ എ
ല ല
ഴ ഴ
ദ ദ
ഗ ഗ
ഉ ഉ
ണ ണ
ൂ ൂ
ൻ ൻ
സ സ
ീ ീ
ഭ ഭ
ഖ ഖ
ശ ശ
ൈ ൈ
ഥ ഥ
ൾ ൾ
ഘ ഘ
ൃ ൃ
ൊ ൊ
ഞ ഞ
ബ ബ
ഃ ഃ


In [6]:
# Extract the character names (keys) from the character_hmms dictionary
char_names = list(character_hmms.keys())

# Print the character names
print("Character names:", char_names)


Character names: ['െ', 'ത', 'അ', '്', 'ാ', 'ി', 'പ', 'റ', 'ര', 'ു', 'ള', 'ക', 'ഷ', 'മ', 'ആ', 'ഠ', 'ൽ', 'ർ', 'ഇ', 'ങ', 'ന', 'ജ', 'ട', 'വ', 'ഹ', 'ം', 'യ', '"', 'ധ', 'ച', 'എ', 'ല', 'ഴ', 'ദ', 'ഗ', 'ഉ', 'ണ', 'ൂ', 'ൻ', 'സ', 'ീ', 'ഭ', 'ഖ', 'ശ', 'ൈ', 'ഥ', 'ൾ', 'ഘ', 'ൃ', 'ൊ', 'ഞ', 'ബ', 'ഃ']


In [7]:
ground_truth =char_names

In [8]:
print("Ground truth list:", ground_truth)

Ground truth list: ['െ', 'ത', 'അ', '്', 'ാ', 'ി', 'പ', 'റ', 'ര', 'ു', 'ള', 'ക', 'ഷ', 'മ', 'ആ', 'ഠ', 'ൽ', 'ർ', 'ഇ', 'ങ', 'ന', 'ജ', 'ട', 'വ', 'ഹ', 'ം', 'യ', '"', 'ധ', 'ച', 'എ', 'ല', 'ഴ', 'ദ', 'ഗ', 'ഉ', 'ണ', 'ൂ', 'ൻ', 'സ', 'ീ', 'ഭ', 'ഖ', 'ശ', 'ൈ', 'ഥ', 'ൾ', 'ഘ', 'ൃ', 'ൊ', 'ഞ', 'ബ', 'ഃ']


In [9]:
unique_chars = sorted(set("".join(ground_truth)))
char_to_state = {char: idx for idx, char in enumerate(unique_chars)}
state_to_char = {idx: char for char, idx in char_to_state.items()}

In [10]:
print(state_to_char)

{0: '"', 1: 'ം', 2: 'ഃ', 3: 'അ', 4: 'ആ', 5: 'ഇ', 6: 'ഉ', 7: 'എ', 8: 'ക', 9: 'ഖ', 10: 'ഗ', 11: 'ഘ', 12: 'ങ', 13: 'ച', 14: 'ജ', 15: 'ഞ', 16: 'ട', 17: 'ഠ', 18: 'ണ', 19: 'ത', 20: 'ഥ', 21: 'ദ', 22: 'ധ', 23: 'ന', 24: 'പ', 25: 'ബ', 26: 'ഭ', 27: 'മ', 28: 'യ', 29: 'ര', 30: 'റ', 31: 'ല', 32: 'ള', 33: 'ഴ', 34: 'വ', 35: 'ശ', 36: 'ഷ', 37: 'സ', 38: 'ഹ', 39: 'ാ', 40: 'ി', 41: 'ീ', 42: 'ു', 43: 'ൂ', 44: 'ൃ', 45: 'െ', 46: 'ൈ', 47: 'ൊ', 48: '്', 49: 'ൻ', 50: 'ർ', 51: 'ൽ', 52: 'ൾ'}


In [11]:
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
 
# Calculate BLEU score for predictions
for char, model in character_hmms.items():
    try:
        sequences = character_fft_sequences[char]
        # Remove empty sequences
        sequences = [seq for seq in sequences if len(seq) > 0]
        # Check if we have any sequences left
        if len(sequences) >= 15:
            X = np.vstack(sequences)
            lengths = [len(seq) for seq in sequences]
            # Check for valid transition matrix
            row_sums = model.transmat_.sum(axis=1)
            if not np.allclose(row_sums, 1):
                print(f"Problem with transition matrix for character '{char}': row sums = {row_sums}")
                continue  # Skip this model and move to the next one
            # Get predicted states
            predicted_states = model.predict(X)
            mapped_predictions = [state_to_char[state] for state in predicted_states]
            # Calculate BLEU score
            bleu_score = sentence_bleu([ground_truth], mapped_predictions)
            print(f"BLEU Score for character '{char}': {bleu_score}")
    except ValueError as e:
        print(f"Error with character '{char}': {e}")

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

BLEU Score for character 'െ': 2.591358246128121e-232
BLEU Score for character 'ത': 3.0566343182720105e-232
BLEU Score for character 'അ': 3.848862482290111e-232
BLEU Score for character '്': 2.7497163428860544e-232
BLEU Score for character 'ാ': 2.5358570520686987e-232
BLEU Score for character 'ി': 2.8520344303071392e-232
BLEU Score for character 'പ': 3.3468380251266696e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

BLEU Score for character 'റ': 3.1156932668474443e-232
BLEU Score for character 'ര': 2.8456468550938538e-232
BLEU Score for character 'ു': 3.277234217472267e-232
BLEU Score for character 'ള': 4.619069342947944e-232
BLEU Score for character 'ക': 3.0788299741682e-232
BLEU Score for character 'ഷ': 4.7091911478183385e-232
BLEU Score for character 'മ': 3.287610814479087e-232
BLEU Score for character 'ആ': 5.0603552106552585e-232
BLEU Score for character 'ഠ': 4.68839950919502e-232
BLEU Score for character 'ൽ': 5.1385909989173886e-232
BLEU Score for character 'ർ': 5.067913641293489e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

BLEU Score for character 'ഇ': 5.385059418481704e-232
BLEU Score for character 'ങ': 6.017810420952586e-232
BLEU Score for character 'ന': 2.8942324630145362e-232
BLEU Score for character 'ജ': 4.096982738799751e-232
BLEU Score for character 'ട': 3.475534353923017e-232
BLEU Score for character 'വ': 3.40763295341997e-232
BLEU Score for character 'ഹ': 4.881551675691592e-232
BLEU Score for character 'ം': 3.8431335716238256e-232
BLEU Score for character 'യ': 3.318844170077594e-232
BLEU Score for character '"': 2.5537794183414283e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

BLEU Score for character 'ധ': 4.668058901567373e-232
BLEU Score for character 'ച': 3.815113718414687e-232
BLEU Score for character 'എ': 4.791502863533838e-232
BLEU Score for character 'ല': 3.498939947989373e-232
BLEU Score for character 'ഴ': 4.966937324201032e-232
BLEU Score for character 'ദ': 3.9892018274641066e-232
BLEU Score for character 'ഗ': 4.623853470895728e-232
BLEU Score for character 'ഉ': 5.775632145762076e-232
BLEU Score for character 'ണ': 4.1075284701416626e-232
BLEU Score for character 'ൂ': 4.913521633378757e-232
BLEU Score for character 'ൻ': 4.7091911478183385e-232
BLEU Score for character 'സ': 4.194168015960872e-232
BLEU Score for character 'ീ': 4.6334965961998555e-232
BLEU Score for character 'ഭ': 4.907043673832013e-232
BLEU Score for character 'ഖ': 6.124096286752193e-232
BLEU Score for character 'ശ': 4.590874316724141e-232
BLEU Score for character 'ൈ': 4.763266496573224e-232
BLEU Score for character 'ഥ': 6.3917876705550364e-232
BLEU Score for character 'ൾ': 5.948228664

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

In [15]:
# List of characters to remove from the character_hmms dictionary
characters_to_remove = ['ഞ','ഃ']
 
# Remove characters if they exist in the dictionary
for char in characters_to_remove:
    print (char)
    if char in character_hmms:
        del character_hmms[char]
        print(f"Deleted HMM model for character: {char}")
    else:
        print(f"Character {char} not found in character_hmms.")

ഞ
Character ഞ not found in character_hmms.
ഃ
Deleted HMM model for character: ഃ


In [16]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu
import numpy as np

# List to store results
results = []

# Loop through each character and its model
for char, model in character_hmms.items():
    sequences = character_fft_sequences[char]
    
    # Remove empty sequences
    sequences = [seq for seq in sequences if len(seq) > 0]
    
    # Check if we have enough sequences to proceed
    if len(sequences) >= 15:
        X = np.vstack(sequences)
        lengths = [len(seq) for seq in sequences]
        
        # Get predicted states and map to characters
        predicted_states = model.predict(X)
        mapped_predictions = [state_to_char[state] for state in predicted_states]
        
        # Calculate BLEU score
        bleu_score = sentence_bleu([ground_truth], mapped_predictions)
        
        # Store the results
        results.append({
            'Character': char,
            'Mapped Predictions': ''.join(mapped_predictions),
            'BLEU Score': bleu_score
        })

# Create a DataFrame to store results
results_df = pd.DataFrame(results)

# Save the results to an Excel file
output_path = r'C:\Users\pavan\Downloads\line_gt_7_dct.xlsx'
results_df.to_excel(output_path, index=False)

print(f"Results saved to {output_path}")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

Results saved to C:\Users\pavan\Downloads\line_gt_7_dct.xlsx


In [17]:
for char, model in character_hmms.items():
    print(char,model)

െ GaussianHMM(n_components=4, n_iter=1000)
ത GaussianHMM(n_components=4, n_iter=1000)
അ GaussianHMM(n_components=4, n_iter=1000)
് GaussianHMM(n_components=4, n_iter=1000)
ാ GaussianHMM(n_components=4, n_iter=1000)
ി GaussianHMM(n_components=4, n_iter=1000)
പ GaussianHMM(n_components=4, n_iter=1000)
റ GaussianHMM(n_components=4, n_iter=1000)
ര GaussianHMM(n_components=4, n_iter=1000)
ു GaussianHMM(n_components=4, n_iter=1000)
ള GaussianHMM(n_components=4, n_iter=1000)
ക GaussianHMM(n_components=4, n_iter=1000)
ഷ GaussianHMM(n_components=4, n_iter=1000)
മ GaussianHMM(n_components=4, n_iter=1000)
ആ GaussianHMM(n_components=4, n_iter=1000)
ഠ GaussianHMM(n_components=4, n_iter=1000)
ൽ GaussianHMM(n_components=4, n_iter=1000)
ർ GaussianHMM(n_components=4, n_iter=1000)
ഇ GaussianHMM(n_components=4, n_iter=1000)
ങ GaussianHMM(n_components=4, n_iter=1000)
ന GaussianHMM(n_components=4, n_iter=1000)
ജ GaussianHMM(n_components=4, n_iter=1000)
ട GaussianHMM(n_components=4, n_iter=1000)
വ GaussianH

In [18]:
import joblib
import re

# Assuming your models are in a dictionary called `character_hmms`
# e.g., `character_hmms = {'െ': trained_model_1, 'ത': trained_model_2, ... }`

def sanitize_filename(char):
    # Replace invalid filename characters with an underscore or remove them
    return re.sub(r'[<>:"/\\|?*]', '_', char)

for char, model in character_hmms.items():
    # Sanitize character to create a valid filename
    sanitized_char = sanitize_filename(char)
    print(char,sanitized_char )
    # Save the trained HMM model to a file with the sanitized name
    joblib.dump(model, f"{sanitized_char}_hmm.pkl")

െ െ
ത ത
അ അ
് ്
ാ ാ
ി ി
പ പ
റ റ
ര ര
ു ു
ള ള
ക ക
ഷ ഷ
മ മ
ആ ആ
ഠ ഠ
ൽ ൽ
ർ ർ
ഇ ഇ
ങ ങ
ന ന
ജ ജ
ട ട
വ വ
ഹ ഹ
ം ം
യ യ
" _
ധ ധ
ച ച
എ എ
ല ല
ഴ ഴ
ദ ദ
ഗ ഗ
ഉ ഉ
ണ ണ
ൂ ൂ
ൻ ൻ
സ സ
ീ ീ
ഭ ഭ
ഖ ഖ
ശ ശ
ൈ ൈ
ഥ ഥ
ൾ ൾ
ഘ ഘ
ൃ ൃ
ൊ ൊ
ബ ബ


In [19]:
results_df 

Unnamed: 0,Character,Mapped Predictions,BLEU Score
0,െ,ംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംം...,2.591358e-232
1,ത,"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",3.056634e-232
2,അ,ംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംം...,3.848862e-232
3,്,ഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃ...,2.7497159999999997e-232
4,ാ,അഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅഅ...,2.535857e-232
5,ി,ംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംം...,2.8520339999999998e-232
6,പ,ഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃഃ...,3.346838e-232
7,റ,"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""...",3.115693e-232
8,ര,ംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംം...,2.845647e-232
9,ു,ംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംംം...,3.277234e-232


In [20]:
import pandas as pd
import numpy as np

# Read line sequences from Excel

# Assuming the line sequences are in a column named 'line_sequence' or similar
line_sequences = labels_df['Label'].tolist()

# Example line sequence (you can iterate through all lines in your dataset)
# line_sequence = ['ര', 'വ', 'ി', 'ട', 'ു']  # Example line sequence


In [21]:
# Get indices for each individual character, but skip characters not in char_to_index
line_sequence_indices = [char_to_state [individual_char] 
                         for char_group in line_sequences 
                         for individual_char in char_group 
                         if individual_char in char_to_state ]
print("Line sequence indices:", line_sequence_indices)

Line sequence indices: [45, 19, 45, 3, 19, 48, 19, 19, 39, 40, 24, 24, 30, 29, 42, 29, 32, 39, 32, 8, 45, 45, 36, 27, 39, 40, 19, 48, 19, 40, 19, 48, 19, 40, 4, 19, 48, 29, 45, 45, 17, 8, 8, 39, 51, 51, 50, 5, 5, 5, 45, 39, 45, 8, 45, 12, 48, 8, 12, 48, 8, 39, 45, 45, 23, 45, 23, 30, 45, 45, 14, 45, 8, 30, 8, 16, 48, 29, 16, 48, 29, 40, 16, 48, 29, 40, 16, 16, 48, 16, 42, 16, 48, 16, 42, 17, 42, 17, 17, 34, 38, 38, 30, 40, 29, 40, 29, 40, 45, 19, 45, 19, 19, 1, 45, 23, 45, 23, 48, 23, 23, 1, 28, 28, 28, 45, 28, 28, 39, 40, 39, 39, 0, 0, 24, 24, 8, 4, 4, 22, 40, 13, 48, 13, 40, 13, 48, 13, 13, 48, 13, 13, 40, 45, 30, 45, 19, 19, 23, 48, 23, 48, 23, 40, 23, 48, 23, 40, 30, 40, 16, 48, 16, 42, 16, 48, 16, 42, 30, 3, 3, 32, 45, 32, 5, 7, 7, 31, 48, 31, 31, 48, 31, 5, 39, 39, 45, 30, 45, 19, 45, 19, 30, 45, 45, 8, 45, 39, 39, 7, 7, 16, 13, 48, 13, 13, 48, 13, 42, 24, 48, 24, 42, 3, 19, 19, 23, 23, 48, 23, 42, 23, 48, 23, 42, 29, 45, 45, 45, 0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 8, 8, 42, 8, 42, 2

In [22]:
import joblib
from hmmlearn import hmm
import cv2
# Load character HMM models
character_hmms = {}
for char in char_to_state.keys():
    sanitized_char = sanitize_filename(char)
    try:
        model = joblib.load(f"{sanitized_char}_hmm.pkl")
        character_hmms[char] = model
    except FileNotFoundError:
        print(f"Model for character '{char}' not found.")
        continue

# Function to predict line sequence based on HMMs
def predict_line_sequence(line_image, window_width=30, step_size=30):
    # Divide the line image into character-width segments
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1
    
    # Loop through each window and predict the character
    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]
        
        # Compute DCT features for the window
        fft_features = compute_fft_features(window)
        fft_features = fft_features.reshape(1, -1)  # Reshape for HMM input
        
        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')
        
        for char, model in character_hmms.items():
            try:
                score = model.score(fft_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except:
                pass  # Ignore errors for invalid model scoring
        
        if best_char is not None:
            predictions.append(best_char)
    
    return ''.join(predictions)

# Load a test line image
test_image_path = r"C:\Users\pavan\Downloads\color_window_double1 (1) 10\color_window_double1 (1)\color_window_double1\mal286_010_9.jpg_window_0.jpg"
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Predict the character sequence in the test line image
predicted_sequence = predict_line_sequence(test_image)
print("Predicted line sequence:", predicted_sequence)

Predicted line sequence: ചതിിെയചമടി


In [23]:
import joblib
from hmmlearn import hmm
import cv2
import numpy as np
import pandas as pd

# Load character HMM models
character_hmms = {}
for char in char_to_state.keys():
    sanitized_char = sanitize_filename(char)
    try:
        model = joblib.load(f"{sanitized_char}_hmm.pkl")
        character_hmms[char] = model
    except FileNotFoundError:
        print(f"Model for character '{char}' not found.")
        continue

# Function to compute the actual sequence based on window mapping
# Function to extract the actual sequence for a specific image from the DataFrame
def get_actual_sequence_from_df(image_name, line_image, label_df, window_width=30, step_size=30):
    # Find the corresponding row in the DataFrame
    row = label_df[label_df['image name'] == image_name]
    
    if row.empty:
        raise ValueError(f"Image name '{image_name}' not found in the labels DataFrame.")
    
    # Extract the ground truth character sequence
    character_sequence = row['Label'].values[0]  # Adjust column name if necessary
    
    # Compute the actual sequence based on ground truth and image dimensions
    actual_sequence = []
    image_width = line_image.shape[1]
    num_characters = len(character_sequence)
    character_width = image_width // num_characters

    for i, char in enumerate(character_sequence):
        # Define the region corresponding to this character
        region_start = i * character_width
        region_end = region_start + character_width
        character_region = line_image[:, region_start:region_end]

        # Divide the character region into windows
        num_windows = (character_width - window_width) // step_size + 1
        for _ in range(num_windows):
            actual_sequence.append(char)  # Map each window to the current character
    
    return actual_sequence
# Function to predict the sequence based on HMM models
def predict_line_sequence(line_image, window_width=30, step_size=10):
    predictions = []
    image_width = line_image.shape[1]
    num_windows = (image_width - window_width) // step_size + 1

    for i in range(num_windows):
        window_start = i * step_size
        window_end = window_start + window_width
        window = line_image[:, window_start:window_end]

        # Compute DCT features for the window
        fft_features = compute_fft_features(window)
        fft_features = fft_features.reshape(1, -1)

        # Calculate likelihoods for each character model
        best_char = None
        best_score = float('-inf')

        for char, model in character_hmms.items():
            try:
                score = model.score(fft_features)
                if score > best_score:
                    best_score = score
                    best_char = char
            except:
                pass  # Ignore errors for invalid model scoring

        if best_char is not None:
            predictions.append(best_char)

    return ''.join(predictions)

# Test image details
# Load the labels DataFrame
label_file = r"C:\Users\pavan\Downloads\gt_WIndow (2) 5.xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path = r"C:\Users\pavan\Downloads\color_window_double1 (1) 10\color_window_double1 (1)\color_window_double1\MaI12_Page100_line_1.jpg_window_0.jpg"
test_image_name = test_image_path.split("\\")[-1]  # Extract the image name

# Load the test image
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Extract the actual sequence from the DataFrame
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Print the sequences for comparison
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")


Actual sequence: െെെെെെെെെെ
Predicted sequence: അഅ"അധർ"ർർർഎഎഎയഅഅെഴജമമമആആമധഎഎഎ"


In [24]:
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    # BLEU expects the reference as a list of lists and hypothesis as a list
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Load the labels DataFrame
label_file = r"C:\Users\pavan\Downloads\gt_WIndow (2) 5.xlsx"
label_df = pd.read_excel(label_file)

# Test image details
test_image_path =r"C:\Users\pavan\Downloads\color_window_double1 (1) 10\color_window_double1 (1)\color_window_double1\MaI12_Page100_line_1.jpg_window_0.jpg"
test_image_name = test_image_path.split("\\")[-1]  # Extract the image name

# Load the test image
test_image = cv2.imread(test_image_path, cv2.IMREAD_GRAYSCALE)

# Extract the actual sequence from the DataFrame
actual_sequence = get_actual_sequence_from_df(test_image_name, test_image, label_df)

# Predict the sequence using the HMM models
predicted_sequence = predict_line_sequence(test_image)

# Calculate BLEU score
bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)

# Print the results
print(f"Actual sequence: {''.join(actual_sequence)}")
print(f"Predicted sequence: {predicted_sequence}")
print(f"BLEU Score: {bleu_score}")


Actual sequence: െെെെെെെെെെ
Predicted sequence: അഅ"അധർ"ർർർഎഎഎയഅഅെഴജമമമആആമധഎഎഎ"
BLEU Score: 7.784451369270533e-232


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [25]:
import os
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu

# Function to calculate BLEU score
def calculate_bleu_score(actual_sequence, predicted_sequence):
    reference = [list(actual_sequence)]  # Wrap in another list for multiple references
    hypothesis = list(predicted_sequence)
    bleu_score = sentence_bleu(reference, hypothesis)
    return bleu_score

# Folder paths and files
image_folder = r"C:\Users\pavan\Downloads\color_window_double1 (1) 10\color_window_double1 (1)\color_window_double1\"
label_file = r"C:\Users\pavan\Downloads\gt_WIndow (2) 5.xlsx"
output_file = r"C:\Users\pavan\Downloads\line_gt_7_dct.xlsx"

# Load the labels DataFrame
label_df = pd.read_excel(label_file)

# List all image files in the folder
image_files = [f for f in os.listdir(image_folder) if f.endswith(('.jpg', '.png'))]

# Initialize a results list
results = []

# Process each image
for image_name in image_files:
    image_path = os.path.join(image_folder, image_name)
    
    # Load the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error loading image: {image_name}")
        continue

    # Extract the actual sequence from the DataFrame
    actual_sequence = get_actual_sequence_from_df(image_name, image, label_df)
    if actual_sequence is None:
        print(f"No label found for image: {image_name}")
        continue

    # Predict the sequence using the HMM models
    predicted_sequence = predict_line_sequence(image)
    
    # Calculate BLEU score
    bleu_score = calculate_bleu_score(actual_sequence, predicted_sequence)
    
    # Append results
    results.append({
        "Image Name": image_name,
        "Actual Sequence": ''.join(actual_sequence),
        "Predicted Sequence": predicted_sequence,
        "BLEU Score": bleu_score
    })

# Save the results to an Excel sheet
results_df = pd.DataFrame(results)
results_df.to_excel(output_file, index=False)

print(f"Results saved to {output_file}")


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, indepe

ValueError: Image name 'maI16_01_10.jpg_window_0.jpg' not found in the labels DataFrame.