In [1]:
import pandas as pd
import numpy as np
import pickle
import os

In [2]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

# Read the API keys from the process environment (populated by the call above).
api_key = os.getenv("OPENAI_API_KEY")
langsmith_api_key = os.getenv("langsmith_api_key")

# Set the LangChain tracing variables programmatically for this session.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "SLIFTEX"

# Export the key's *value*. The earlier version assigned the literal text
# "langsmith_api_key", so the real key read above was never used.
# os.environ only accepts strings, so skip the assignment when the lookup
# returned None instead of raising a TypeError.
if langsmith_api_key:
    os.environ["LANGCHAIN_API_KEY"] = langsmith_api_key


In [3]:
from langchain.chat_models import ChatOpenAI
# GPT-4 chat model client, authenticated with the key read from the environment.
llm = ChatOpenAI(
    model="gpt-4",
    openai_api_key=api_key,
)


  llm = ChatOpenAI(model="gpt-4", openai_api_key=api_key)


In [4]:
# Data preprocessing: restore the Python object serialized in data.pkl.
# NOTE(review): unpickling can execute arbitrary code, so data.pkl must come
# from a trusted source.
with open('data.pkl', 'rb') as pickle_file:
    data = pickle.load(pickle_file)

# Echo the deserialized object so the cell output shows what was loaded.
print(data)


0        JAN JAGRAN TIMES
1        JAGRAN CITY PLUS
2         SAMPURNA JAGRAN
3           DAINIK JAGRAN
4           VISHWA JAGRAN
               ...       
21394        KAIWART AWAZ
21395     SARBAHARAR AWAZ
21396      SHRAMIKER AWAZ
21397          SOBAR AWAZ
21398        AWAZ AAP TAK
Name: Title Name, Length: 10790, dtype: object


In [5]:
# Wrap the loaded titles in a DataFrame and preview the first rows.
df = pd.DataFrame(data)
print(df.head())


         Title Name
0  JAN JAGRAN TIMES
1  JAGRAN CITY PLUS
2   SAMPURNA JAGRAN
3     DAINIK JAGRAN
4     VISHWA JAGRAN


In [6]:
# Drop repeated rows, then show the frame and its dimensions.
df = df.drop_duplicates()
print(df)
print(df.shape)

             Title Name
0      JAN JAGRAN TIMES
1      JAGRAN CITY PLUS
2       SAMPURNA JAGRAN
3         DAINIK JAGRAN
4         VISHWA JAGRAN
...                 ...
21394      KAIWART AWAZ
21395   SARBAHARAR AWAZ
21396    SHRAMIKER AWAZ
21397        SOBAR AWAZ
21398      AWAZ AAP TAK

[10790 rows x 1 columns]
(10790, 1)


In [7]:
# Data ingestion
from langchain.document_loaders import DataFrameLoader

# Rename the title column to 'text' before handing the frame to the loader.
df = df.rename(columns={'Title Name': 'text'})
loader = DataFrameLoader(df)


In [8]:
# Load the DataFrame contents as LangChain documents.
documents = loader.load()

# Report the count and a short preview instead of printing every Document,
# which floods the cell output for a frame of this size.
print(f"Loaded {len(documents)} documents")
print(documents[:5])



In [9]:
# Transform: split the loaded documents into chunks of at most 100 characters
# with a 20-character overlap.
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
)
# NOTE(review): the vector index in the next cell is built from `documents`,
# not from `split_docs` - confirm which one is intended.
split_docs = text_splitter.split_documents(documents)


In [10]:
# Vector embedding
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OpenAIEmbeddings

# Number of leading documents to embed. This used to be a bare `[:10]` slice,
# which silently restricts every later search to the first ten titles.
# Set to None to index every loaded document (one embedding request per title).
MAX_INDEXED_DOCS = 10

# Build the FAISS index from the selected documents using OpenAI embeddings.
db = FAISS.from_documents(documents[:MAX_INDEXED_DOCS], OpenAIEmbeddings())
print(db)


  db = FAISS.from_documents(documents[:10], OpenAIEmbeddings())


<langchain_community.vectorstores.faiss.FAISS object at 0x0000019712E23170>


In [11]:
# Smoke test: run a similarity search against the vector store and print the hits.
query = "JAGRAN"
res = db.similarity_search(query)
print(res)

[Document(metadata={}, page_content='JAN JAGRAN'), Document(metadata={}, page_content='JAGRAN CITY PLUS'), Document(metadata={}, page_content='JAN JAGRAN TIMES'), Document(metadata={}, page_content='DEV JAGRAN NEWS')]


In [12]:
# Chat Prompt
# NOTE(review): this binds `context` to the FAISS vector store object itself,
# not to any title text. The only later use visible in this notebook is the
# alias `existing_titles = context`; confirm whether either name is still needed.
context = db 


In [13]:
from langchain_core.prompts import ChatPromptTemplate
# Instruction prompt for the title-verification assistant.
# It contains two placeholders, {input} (the submitted title) and {context}
# (the existing titles). The text is sent to the model as written, so the
# wording and whitespace below are part of the program's behavior.
prompt_template = """You are a title verification assistant for the Press Registrar General of India. Your task is to evaluate new title submissions based on similarity with existing titles, compliance with disallowed words, prefixes/suffixes, and other guidelines.

**Requirements**:
1. Calculate and return the similarity score between the input title and a list of provided existing titles. The similarity should account for:
   - Phonetic similarity (e.g., Soundex or Metaphone).
   - Common prefixes/suffixes (e.g., "The," "India," "News").
   - Spelling variations or slight modifications.
   - Semantic similarity, including translations or similar meanings in other languages.
2. If the input title violates any of the following guidelines, provide a clear reason for rejection:
   - Contains disallowed words (e.g., Police, Crime, Corruption, CBI, Army).
   - Combines existing titles (e.g., "Hindu" and "Indian Express" forming "Hindu Indian Express").
   - Adds periodicity (e.g., "Daily," "Weekly," "Monthly") to an existing title.
3. Provide a probability score for verification using the formula:  
   `Verification Probability = 100% - Similarity Score`.
4. Include actionable feedback for users to modify and resubmit their titles if rejected.

**Example Input**:  
- Input Title: "Daily Jagran News"  
- Existing Titles: ["Jagran News", "Daily Samachar", "Morning Express"]  

**Example Output**:  
- Similarity Score: 85%  
- Verification Probability: 15%  
- Rejection Reasons:  
  1. Similar to "Jagran News" (phonetic similarity).  
  2. Contains a disallowed prefix ("Daily").  
- Feedback: Remove the prefix "Daily" and ensure the title is unique.

Now, evaluate the following:

**Input Title**: {input}  
**Existing Titles**: {context}  
**Disallowed Words**: ["Police", "Crime", "Corruption", "CBI", "Army"]  
**Disallowed Prefixes/Suffixes**: ["Daily", "Weekly", "Monthly", "The", "India", "News"]

"""

In [16]:
from langchain.prompts import PromptTemplate  # Add this import

# Declare the placeholders that actually appear in prompt_template: {context}
# and {input}. The earlier list named "title_to_verify", which the template
# never uses, and omitted "input", the key the chain is invoked with.
prompt = PromptTemplate(
    input_variables=["context", "input"],
    template=prompt_template,
)

# Alias of `context`, which an earlier cell binds to the FAISS store.
# No later cell in this notebook reads `existing_titles`.
existing_titles = context


In [17]:
# Instantiate the GPT-4 chat model (same arguments as the earlier `llm` cell).
llm = ChatOpenAI(
    model="gpt-4",
    openai_api_key=api_key,
)


In [18]:
from langchain.chains.combine_documents import create_stuff_documents_chain
# Combine the chat model and the verification prompt into a document chain;
# it is passed to create_retrieval_chain in a later cell.
document_chain = create_stuff_documents_chain(llm, prompt)



In [19]:
# Reload the documents from the DataFrame loader (repeats the earlier load cell).
documents = loader.load()

# Report the count and a short preview instead of printing every Document,
# which floods the cell output for a frame of this size.
print(f"Loaded {len(documents)} documents")
print(documents[:5])



In [20]:
# Retriever
from langchain.chains import create_retrieval_chain

# Expose the FAISS store as a retriever and connect it to the document chain.
retriever = db.as_retriever()
retriever_chain = create_retrieval_chain(retriever, document_chain)

In [21]:
# Run the retrieval chain on a sample title submission.
response = retriever_chain.invoke({"input": "Daily Jagran News"})

In [22]:
# Print the raw response dict (keys: 'input', 'context', 'answer').
print(response)

{'input': 'Daily Jagran News', 'context': [Document(metadata={}, page_content='DAINIK JAGRAN'), Document(metadata={}, page_content='DAINIK JAGRAN RASHTRIYA SANSKARAN'), Document(metadata={}, page_content='DEV JAGRAN NEWS'), Document(metadata={}, page_content='SAMPURNA JAGRAN')], 'answer': '**Similarity Score**: 95%  \n**Verification Probability**: 5%  \n**Rejection Reasons**:  \n  1. Similar to "DAINIK JAGRAN" (phonetic similarity).\n  2. Contains a disallowed prefix ("Daily").\n  3. Similar to "DEV JAGRAN NEWS" (common suffix "News").\n  \n**Feedback**: The title "Daily Jagran News" is too similar to existing titles. Remove the prefix "Daily" and the suffix "News" to make the title unique. Ensure that the title does not resemble or phonetically match any existing titles.'}


In [23]:
from pprint import pprint
# Pretty-print the same response so the multi-line answer is easier to read.
pprint(response)

{'answer': '**Similarity Score**: 95%  \n'
           '**Verification Probability**: 5%  \n'
           '**Rejection Reasons**:  \n'
           '  1. Similar to "DAINIK JAGRAN" (phonetic similarity).\n'
           '  2. Contains a disallowed prefix ("Daily").\n'
           '  3. Similar to "DEV JAGRAN NEWS" (common suffix "News").\n'
           '  \n'
           '**Feedback**: The title "Daily Jagran News" is too similar to '
           'existing titles. Remove the prefix "Daily" and the suffix "News" '
           'to make the title unique. Ensure that the title does not resemble '
           'or phonetically match any existing titles.',
 'context': [Document(metadata={}, page_content='DAINIK JAGRAN'),
             Document(metadata={}, page_content='DAINIK JAGRAN RASHTRIYA SANSKARAN'),
             Document(metadata={}, page_content='DEV JAGRAN NEWS'),
             Document(metadata={}, page_content='SAMPURNA JAGRAN')],
 'input': 'Daily Jagran News'}
