In [None]:
import os
import re
import docx
from tqdm import tqdm
from collections import defaultdict
from langchain_text_splitters import RecursiveCharacterTextSplitter
import pandas as pd

# Initialize text splitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len, is_separator_regex=False)
# Define functions for processing
# def get_docx_files_dict(folder_path):
#     """Get a dictionary of .docx files with their 3GPP spec numbers."""
#     result = {}
#     count = 0
#     for root, dirs, files in os.walk(folder_path):
#         for file in files:
#             if file.endswith('.docx'):
#                 file_path = os.path.join(root, file)
#                 match = re.search(r'(\d{5})-', file)
#                 if match:
#                     spec_number = match.group(1)
#                     series = spec_number[:2]
#                     spec_number = spec_number[2:]
#                     key = f"3GPP TS {series}.{spec_number}"
#                     result[count] = [key, file_path]
#                     count += 1
#     return result



def get_docx_files_dict(folder_path):
    """Get a dictionary of .docx files with their 3GPP spec numbers."""
    result = {}
    count = 0
    
    # Check if the folder path exists
    if not os.path.exists(folder_path):
        raise ValueError(f"The specified folder path does not exist: {folder_path}")
    
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.docx'):
                file_path = os.path.join(root, file)
                match = re.search(r'(\d{5})-', file)
                
                if match:
                    spec_number = match.group(1)
                    series = spec_number[:2]
                    spec_number = spec_number[2:]
                    key = f"3GPP TS {series}.{spec_number}"
                    result[count] = [key, file_path]
                    count += 1
    
    return result

def extract_abb(docpath, pattern):
    """Extract abbreviations from the provided docx file."""
    doc = docx.Document(docpath)
    abbreviation_section_started = False
    abbreviations = []
    count = 0
    max_lines = 200
    cnt = 0
    for paragraph in doc.paragraphs:
        if 'Abbreviations' in paragraph.text:
            abbreviation_section_started = True
            cnt += 1
            continue
        if cnt < 2:
            continue
        if abbreviation_section_started and count < max_lines:
            if paragraph.text.strip():
                abbreviations.append(paragraph.text.strip())
            count += 1
        if count >= max_lines:
            break
    abbreviation_dict = {}
    for line in abbreviations:
        parts = line.split('\t')
        if len(parts) != 2:
            continue
        if re.match(pattern, parts[0]):
            abbreviation_dict[parts[0]] = parts[1]
    return abbreviation_dict

def extract_sections_from_docx(docx_file):
    """Extract sections from a .docx file."""
    try:
        doc = docx.Document(docx_file)
    except Exception as e:
        print(f"Error opening {docx_file}: {e}")
        return {}
    sections = {}
    current_title = None
    current_content = []
    for para in doc.paragraphs:
        if para.style.name.startswith('Heading'):
            if current_title:
                sections[current_title] = "\n".join(current_content)
            current_title = para.text.strip()
            current_content = []
        else:
            if para.text.strip():
                current_content.append(para.text.strip())
    if current_title:
        sections[current_title] = "\n".join(current_content)
    return sections

def get_context_metadata(folder_path, pattern, skip_keywords, text_splitter):
    """Create a metadata mapping from abbreviations to sections."""
    docs = get_docx_files_dict(folder_path)
    context_to_metadata = defaultdict(list)
    for key, val in tqdm(docs.items()):
        doc_number, doc_path = val[0], val[1]
        sections = extract_sections_from_docx(doc_path)
        main_title = ""
        for section_title, content in sections.items():
            title = re.sub(r'\s+', ' ', section_title.strip())
            content = re.sub(r'\s+', ' ', content.strip())
            if any(keyword.lower() in title.lower() for keyword in skip_keywords):
                if 'Abbreviations'.lower() in title.lower():
                    continue
            if content == "":
                main_title = title
                continue
            if "general" in title.lower():
                title = f"{main_title} {title}"
            texts = text_splitter.create_documents([content])
            chunks = [chunk.page_content for chunk in texts]
            for text in chunks:
                context_to_metadata[text] = [section_title, doc_number]
        print(len(context_to_metadata))
    return context_to_metadata

def get_exact_count(abbreviation, list_content):
    """Get the exact frequency of an abbreviation in the content."""
    freq_count = 0
    best_context = ""
    for content in list_content:
        count = content.lower().split().count(abbreviation.lower())
        if count > freq_count:
            freq_count = count
            best_context = content
    print(f"Best context for '{abbreviation}' (appeared {freq_count} times): {best_context[:100]}") 
    return best_context, freq_count


def process_abbreviations(folder_path, pattern):
    """Extract abbreviations from all docx files and build the abbreviation list."""
    all_abbreviations = []
    processed_abbreviation_set = set()
    docx_files = get_docx_files_dict(folder_path)
    for _, (key, file_path) in tqdm(docx_files.items()):
        processed_abbreviations = extract_abb(file_path, pattern)
        print(f"Extracted abbreviations from {file_path}: {processed_abbreviations}")
        for abb, full_form in processed_abbreviations.items():
            if abb not in processed_abbreviation_set:
                processed_abbreviation_set.add(abb)
                all_abbreviations.append({"name": abb, "full_form": full_form})
    print(f"Total unique abbreviations extracted: {len(all_abbreviations)}")
    return all_abbreviations

def collected_data(output_path, all_abbreviations, meta_data):
    """Save the processed abbreviations to a CSV file with relevant metadata."""
    list_context = list(meta_data.keys())
    univ = []
    for entry in all_abbreviations:
        name = entry['name']
        full_form = entry['full_form']
        rich_context, freq = get_exact_count(name, list_context)
        print(rich_context)
        if freq <= 1:
            continue
        metadata_list = meta_data.get(rich_context, [])
        if len(metadata_list) < 1:
            print(f"Skipping entry for {name}: Incomplete metadata for context.")
            continue
        rec = {
            "name": name,
            "full_form": full_form,
            "context": rich_context,
            "section_title": metadata_list[0],
            "doc_number": metadata_list[1]
        }
        print(rec)
        univ.append(rec)
    if univ:
        data = pd.DataFrame(univ)
        data.to_csv(output_path, index=False)
        print(f"Successfully saved {len(univ)} entries to output file: {output_path}")
    else:
        print("No valid data to save.")

# Example usage
folder_path = '/home/sunny/Desktop/SunnyKG/specs_22_31_33'
pattern = r'\b(?!NOTE\b)(?!NOTE\d)(?!\d+$)(?!.*[.)])(?![A-Z0-9]{1}\b)(?!\d+[>])(?<![^\s])(?<![\d])(?<!\d\s)[A-Z0-9]+(?:-[A-Z0-9]+)*\b'
skip_keywords = [
    "Foreword", 
    "Scope", 
    "References", 
    "Definitions", 
    "Abbreviations", 
    "Definitions and abbreviations", 
    "Definitions, symbols and abbreviations"
]

meta_data = get_context_metadata(folder_path, pattern, skip_keywords, text_splitter)

all_abbreviations = process_abbreviations(folder_path, pattern)
output_path = "Ran_collected_data_22_31_33.csv"
collected_data(output_path, all_abbreviations, meta_data)


# above code for automation data collection preprocessing

## below code for mapping to working group

In [None]:
speci=pd.read_csv("/home/sunny/Desktop/SunnyKG/prev_method/specifications.csv")
speci_wkg={}
for _,row in speci.iterrows():
    wkg=row['name']
    doc_title=row['spec_detail']
    doc_number=row['spec_number']
    speci_wkg[doc_number]=[wkg,doc_title]
import re
pattern = r'\bTS \d{2}\.\d{3}\b'
def extract_ts_pattern(text):
    match = re.search(pattern, text)
    if match:
        return match.group(0)  
    return None
data=pd.read_csv("/home/sunny/Desktop/SunnyKG/prev_method/Ran_collected_data_32_38_34.csv")

data['doc_title'] = (
    data['doc_number']
    .apply(lambda x: extract_ts_pattern(x))
    .apply(lambda x: speci_wkg.get(x, [None, None])[1])  
)

data['working_Group'] = (
    data['doc_number']
    .apply(lambda x: extract_ts_pattern(x))
    .apply(lambda x: speci_wkg.get(x, [None, None])[0]) 
)

# Display the DataFrame to verify
print(data.head())


In [None]:
data.head()

In [None]:
data.to_csv("Ran_data_wkg.csv",index=False)

##context evaluataion

In [19]:
import re
import random
import pandas as pd
from langchain_ollama import ChatOllama
from tqdm import tqdm
import json
system_prompt = """
You are a context evaluator specializing in telecommunications specifications. Your task is to interpret the provided context and accurately fill in the missing term marked as "<fill_here>" by selecting the most appropriate word from four given options. Your response should provide only the selected term, which best completes the sentence technically and contextually.

### Instructions:
1. **Contextual Understanding**: Analyze the context sentence carefully to understand the meaning and technical requirements of the missing term.
2. **Selection Criteria**:
   - **Technical Relevance**: The word chosen should be accurate in the 3GPP telecommunications domain.
   - **Contextual Fit**: Select the option that makes the most sense within the sentence provided, ensuring coherence and relevance.
3. **Strict Output Requirement**: 
   - Your output must strictly follow this format: **"Selected word: <Chosen Option>"**.
   - Do not include additional information, explanations, symbols, numbers, or line breaks.
   - Only return one word from the provided options, exactly as instructed.

### Input Format:
{
  "context": "Sentence containing <fill_here> placeholders",
  "options": ["Option1", "Option2", "Option3", "Option4"]
}

### Output Format:
Selected word: <Chosen Option>

### Example Input:
{
  "context": "The primary role of the <fill_here> function is to manage network connectivity for user equipment.",
  "options": ["AMF", "RRC", "IMSI", "QoS"]
}

### Expected Output:
Selected word: AMF

### Note:
You are a **context evaluator**. Choose only one word from the provided options that best completes the "<fill_here>" term in the context sentence.

"""
models=["nemotron:latest","llama3.1:70b","gemma2:9b"]
def predicted(model,input_string):
   
  llm=llm = ChatOllama(model=model, temperature=0, format='json')
  message=[
  ("system",system_prompt),("human",f"context:{input_string}")
]
  ai_msg=llm.invoke(message)
  print("AI Message Content:", ai_msg.content.strip())
  return ai_msg.content.strip()

def mask_term(context, term):
    pattern = rf'\b{re.escape(term)}\b'  # Match only the whole word
    masked_context = re.sub(pattern, "<fill_here>", context)
    return masked_context

def get_random_options(terms_list, correct_term):
    if len(terms_list) < 4:
        raise ValueError("The list must contain at least four terms.")
    options = random.sample(terms_list, 4)
    if correct_term not in options:
        options[random.randint(0, 3)] = correct_term
    return options


data = pd.read_csv("/home/sunny/Desktop/SunnyKG/prev_method/data_25_37_36.csv")
all_abb = dict(zip(data['name'], data['full_form']))
vocab = list(data['name'].unique())
masked_data = {}
accuracy={}
for model in models:
  corrected=0
  total=len(vocab)
  for index, row in data.iterrows():
      name = row['name']
      content = row['context']
      masked_context = mask_term(content, name)
      options = get_random_options(vocab, name)
      input_string=str({"masked_context":masked_context,"options":options
          
      })
      
      corrected+=(json.loads(predicted(model,input_string))['Selected word'].lower())==name.lower()
  print(f"model {model} given this much accuracy for context")
  accuracy[model]=corrected/total
  
  
  



AI Message Content: {  
  "Selected word": "ACLR"
}
AI Message Content: {  
  "Selected word": "ACS"
}
AI Message Content: {  
  "Selected word": "AWGN"
}
AI Message Content: {
  "Selected word": "BW"
}
AI Message Content: {  
  "Selected word": "CW"
}
AI Message Content: {  
  "Selected word": "DL"
}
AI Message Content: {
  "Selected word": 
    -1.2 
      












    }


AttributeError: 'float' object has no attribute 'lower'