In [139]:
import pandas as pd
import numpy as np
import pickle 
import os

In [140]:

# Step 1: Open the file in read-binary mode
with open('data.pkl', 'rb') as file:
    # Step 2: Load the data from the file
    data = pickle.load(file)

# Now, `data` contains the deserialized Python object
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [141]:
data = data.str.lower()
print(data)

0        jan jagran times
1        jagran city plus
2         sampurna jagran
3           dainik jagran
4           vishwa jagran
               ...       
21394        kaiwart awaz
21395     sarbaharar awaz
21396      shramiker awaz
21397          sobar awaz
21398        awaz aap tak
Name: Title Name, Length: 10790, dtype: object


In [142]:
data = pd.DataFrame(data)
data = data.drop_duplicates().dropna()

In [127]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Download stopwords if not already done
import nltk
nltk.download('stopwords')
nltk.download('punkt')

# Get English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean text by removing stopwords and punctuation
def clean_text(text):
    if pd.isnull(text):
        return ""
    # Tokenize text into words
    tokens = word_tokenize(text)
    # Convert to lowercase and remove punctuation and stopwords
    cleaned = [
        word.lower() for word in tokens
        if word.lower() not in stop_words and word not in string.punctuation
    ]
    return "".join(cleaned)  # Return as a single string for n-gram generation

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Debanjan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Debanjan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [143]:
# Rename column
from metaphone import doublemetaphone

data.rename(columns={'Title Name': 'title'}, inplace=True)
data['metaphoneA'], data['metaphoneB'] = zip(*data['title'].apply(doublemetaphone))


In [144]:
data.to_csv('phonatics.csv', index=False)

In [130]:
# Example input query
query ="The Hindu Chronics"
query =   clean_text(query)
query_metaphoneA, query_metaphoneB = doublemetaphone(query)
print(query_metaphoneA, query_metaphoneB)

HNTKRNKS 


In [131]:
import pandas as pd
from rapidfuzz import fuzz  # Faster alternative to fuzzywuzzy


# Threshold for fuzzy match
similarity_threshold = 70

# Function to find matches for double metaphones
def find_double_metaphone_matches(input_metaphoneA, input_metaphoneB, data, threshold=80):
    matches = []
    for _, row in data.iterrows():
        # Calculate similarity scores for both metaphones
        score_A = fuzz.ratio(input_metaphoneA, row['metaphoneA']) if row['metaphoneA'] else 0
        score_B = fuzz.ratio(input_metaphoneB, row['metaphoneB']) if row['metaphoneB'] else 0
        
        # Check if either score exceeds the threshold
        if score_A >= threshold or score_B >= threshold:
            matches.append({
                "title": row['title'],
                "metaphoneA": row['metaphoneA'],
                "metaphoneB": row['metaphoneB'],
                "similarity_score_A": score_A,
                "similarity_score_B": score_B
            })
    
    return matches

# Find matches
matches = find_double_metaphone_matches(query_metaphoneA, query_metaphoneB, data, similarity_threshold)

# Sort results by the highest similarity score (from either metaphone column)
matches = sorted(matches, key=lambda x: max(x['similarity_score_A'], x['similarity_score_B']), reverse=True)

# Display results



In [133]:
print(matches)                          

[{'title': 'india grains', 'metaphoneA': 'ANTKRNS', 'metaphoneB': '', 'similarity_score_A': 80.0, 'similarity_score_B': 0}, {'title': 'india core news', 'metaphoneA': 'ANTKRNS', 'metaphoneB': '', 'similarity_score_A': 80.0, 'similarity_score_B': 0}, {'title': 'mahanagar ki awaz', 'metaphoneA': 'MHNKRKS', 'metaphoneB': 'MHNKRKTS', 'similarity_score_A': 80.0, 'similarity_score_B': 0.0}, {'title': 'mahanagar ki aawaz', 'metaphoneA': 'MHNKRKS', 'metaphoneB': 'MHNKRKTS', 'similarity_score_A': 80.0, 'similarity_score_B': 0.0}, {'title': 'haryana ki aawaz', 'metaphoneA': 'HRNKS', 'metaphoneB': 'HRNKTS', 'similarity_score_A': 76.92307692307692, 'similarity_score_B': 0.0}, {'title': 'narwana ki awaz', 'metaphoneA': 'NRNKS', 'metaphoneB': 'NRNKTS', 'similarity_score_A': 76.92307692307692, 'similarity_score_B': 0.0}, {'title': 'tarunai ki awaz', 'metaphoneA': 'TRNKS', 'metaphoneB': 'TRNKTS', 'similarity_score_A': 76.92307692307692, 'similarity_score_B': 0.0}, {'title': 'nagaur ki awaz', 'metaphon

In [134]:
import pandas as pd
from nltk.util import ngrams


# Generate n-grams for a given column
def generate_ngrams_range(column, n_range=(3,3)):
    def create_ngrams(text):
        if pd.isnull(text):
            return []
        combined_ngrams = []
        for n in range(n_range[0], n_range[1] + 1):
            combined_ngrams.extend(list(ngrams(text, n)))
        return combined_ngrams
    
    return column.apply(create_ngrams)

# Function to calculate n-gram similarity


In [135]:
print (data)

                  title metaphoneA metaphoneB
0      jan jagran times  JNJKRNTMS  ANJKRNTMS
1      jagran city plus  JKRNSTPLS  AKRNSTPLS
2       sampurna jagran  SMPRNJKRN           
3         dainik jagran    TNKJKRN           
4         vishwa jagran     FXJKRN           
...                 ...        ...        ...
21394      kaiwart awaz       KRTS      KRTTS
21395   sarbaharar awaz    SRPHRRS   SRPHRRTS
21396    shramiker awaz     XRMKRS    XRMKRTS
21397        sobar awaz       SPRS      SPRTS
21398      awaz aap tak      ASPTK     ATSPTK

[10790 rows x 3 columns]


In [137]:
data.to_csv('n_grams.csv', index=False)

In [136]:
import pandas as pd
from nltk.util import ngrams

# Function to calculate n-gram similarity
def calculate_ngram_similarity(input_ngrams, target_ngrams):
    if not input_ngrams or not target_ngrams:
        return 0
    overlap = set(input_ngrams).intersection(set(target_ngrams))
    return (len(overlap) / len(input_ngrams)) * 100

# Function to generate n-grams for a range of n values
def generate_ngrams_range(text, n_range=(3,3)):
    if not text:
        return []
    combined_ngrams = []
    for n in range(n_range[0], n_range[1] + 1):
        combined_ngrams.extend(list(ngrams(text, n)))
    return combined_ngrams

# Function to find matches based on n-gram similarity
def find_ngram_matches(input_metaphone_a, input_metaphone_b, threshold=20, n_range=(2, 4)):
    # Generate n-grams for input metaphones
    input_ngram_a = generate_ngrams_range(input_metaphone_a, n_range)
    input_ngram_b = generate_ngrams_range(input_metaphone_b, n_range)
    
    results = []
    for _, row in data.iterrows():
        sim_a = calculate_ngram_similarity(input_ngram_a, row['ngram_a'])
        sim_b = calculate_ngram_similarity(input_ngram_b, row['ngram_b'])
        
        if sim_a > threshold or sim_b > threshold:
            results.append({
                'title': row['title'],
                'metaphoneA': row['metaphoneA'],
                'metaphoneB': row['metaphoneB'],
                'similarity_a': sim_a,
                'similarity_b': sim_b
            })
    
    return pd.DataFrame(results)


# Generate n-grams for the data
data['ngram_a'] = data['metaphoneA'].apply(lambda x: generate_ngrams_range(x, n_range=(3, 3)))
data['ngram_b'] = data['metaphoneB'].apply(lambda x: generate_ngrams_range(x, n_range=(3, 3)))

# Find matches
matches = find_ngram_matches(query_metaphoneA, query_metaphoneB, threshold=50)
print(matches)


Empty DataFrame
Columns: []
Index: []
