In [13]:
import pandas as pd
import numpy as np
import random

In [14]:
# Load the raw shipment export; its sender/recipient address columns feed the synthetic data below.
df = pd.read_csv('data/Dataset_Generator_for_DTDC.csv')

In [15]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49639 entries, 0 to 49638
Data columns (total 42 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Origin                 49639 non-null  object 
 1   Destination            49639 non-null  object 
 2   Pouch No               49639 non-null  object 
 3   Date                   49639 non-null  object 
 4   Sender's Name          49639 non-null  object 
 5   Sender Phone           49639 non-null  int64  
 6   Sender Address         49639 non-null  object 
 7   Sender City            49639 non-null  object 
 8   Sender State           49639 non-null  object 
 9   Sender Pincode         49639 non-null  int64  
 10  Sender GSTIN           26778 non-null  object 
 11  Total Pieces           49639 non-null  int64  
 12  Actual Wt              49639 non-null  float64
 13  Volumetric Wt          49639 non-null  float64
 14  Chargeable Wt          49639 non-null  float64
 15  Pa

In [16]:
# Pool recipient and sender addresses into a single Series with a fresh 0..n-1 index.
all_addresses = pd.concat([df['Recipient Address'], df['Sender Address']]).reset_index(drop=True)
n = len(all_addresses)

# label == 1 marks rows that will be turned into "incomplete" addresses; 0 = left intact.
labels = np.zeros(n, dtype=int)

# Draw the 20% of rows to corrupt from a seeded generator so that a fresh
# Restart & Run All selects the same rows every time (42 matches the
# random_state already used for the sampling step later in the notebook).
label_rng = np.random.default_rng(42)
incomplete_indices = label_rng.choice(n, size=int(0.2 * n), replace=False)
labels[incomplete_indices] = 1

In [17]:
# Working frame: 'complete_address' keeps the untouched text, 'address' starts
# out as an identical copy, and 'label' carries the 0/1 flags built above.
address_df = pd.DataFrame(
    dict(
        complete_address=all_addresses,
        address=all_addresses,
        label=labels,
    )
)

In [18]:
def truncate_address(address, rng=None):
    """Drop a random half of an address's words to simulate an incomplete address.

    The input is converted with ``str()`` and split on whitespace. Exactly
    ``len(words) // 2`` randomly chosen words are removed and the remaining
    words are re-joined with single spaces in their original order. A one-word
    address is therefore returned unchanged, and an empty or whitespace-only
    input yields ``""``.

    Args:
        address: Address text (any object; it is passed through ``str()``).
        rng: Optional ``random.Random`` instance used for the draw. When
            omitted, the module-level ``random`` generator is used.

    Returns:
        str: The shortened address.
    """
    words = str(address).split()
    if not words:
        return ""
    sampler = random if rng is None else rng
    num_to_remove = len(words) // 2
    # A set gives O(1) membership checks in the filter below (the list used
    # previously made the filter quadratic for long addresses).
    indices_to_remove = set(sampler.sample(range(len(words)), k=num_to_remove))
    return ' '.join(word for i, word in enumerate(words) if i not in indices_to_remove)

# Rows flagged as incomplete get half of their words dropped. The text is always
# re-derived from 'complete_address', so re-running this cell does not truncate
# an already-truncated address a second time.
incomplete_mask = address_df['label'] == 1
random.seed(42)  # truncate_address draws from the global `random` generator
address_df.loc[incomplete_mask, 'address'] = (
    address_df.loc[incomplete_mask, 'complete_address'].apply(truncate_address)
)

In [19]:
# Separate by label
df_complete = address_df[address_df['label'] == 0]
df_incomplete = address_df[address_df['label'] == 1]

# Sample 800 complete and 200 incomplete addresses
sample_complete = df_complete.sample(n=800, random_state=42)
sample_incomplete = df_incomplete.sample(n=200, random_state=42)

# Combine and shuffle. The shuffle is seeded like the two draws above so the
# row order (and the CSV written from it) is the same on every run.
# NOTE: this rebinds address_df to the 1,000-row subset.
address_df = (
    pd.concat([sample_complete, sample_incomplete])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [20]:
# Preview the first rows of the sampled and shuffled subset.
print(address_df.head())

                                    complete_address  \
0  62, Guha Zila, Opposite Mall, Pune, Maharashtr...   
1  41/935 Sha Path, Opposite Mall, Ahmedabad, Guj...   
2  26/36 Soman Marg, Behind School, Hyderabad, Te...   
3  525, Khalsa Nagar, Behind School, Varanasi, Ut...   
4  H.No. 51 Srinivas Path, Behind School, Hyderab...   

                                             address  label  
0  62, Guha Zila, Opposite Mall, Pune, Maharashtr...      0  
1            41/935 Opposite Mall, Ahmedabad, 783648      1  
2  26/36 Soman Marg, Behind School, Hyderabad, Te...      0  
3  525, Khalsa Nagar, Behind School, Varanasi, Ut...      0  
4  H.No. 51 Srinivas Path, Behind School, Hyderab...      0  


In [21]:
# Take the first row of each class from the 'address' column as a quick visual check.
incomplete_address = address_df.loc[address_df['label'] == 1, 'address'].iloc[0]
complete_address = address_df.loc[address_df['label'] == 0, 'address'].iloc[0]

print('Incomplete Address Example:', incomplete_address)
print('Complete Address Example:', complete_address)

Incomplete Address Example: 41/935 Opposite Mall, Ahmedabad, 783648
Complete Address Example: 62, Guha Zila, Opposite Mall, Pune, Maharashtra - 515733


In [22]:
# Save the synthetic data (the labelled sample built above) to CSV, omitting the index column.
address_df.to_csv('data/synthetic_address_data.csv',index=False)

In [23]:
# Class balance of the sample (0 = complete address, 1 = truncated address).
address_df['label'].value_counts()

label
0    800
1    200
Name: count, dtype: int64

In [24]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Model identifier and its matching tokenizer
model_name = "shiprocket-ai/open-llama-1b-address-completion"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the merged model (no need for PEFT since weights are already merged).
# Weights are requested in half precision; device placement is delegated
# to the library through device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
)

def extract_address_components(address, max_new_tokens=150):
    """Ask the loaded model to break ``address`` into components.

    Relies on the module-level ``model`` and ``tokenizer``. The address is
    wrapped in a single-turn user/assistant prompt, a completion is sampled,
    and only the newly generated text is returned (prompt tokens sliced off,
    special tokens skipped, surrounding whitespace stripped).

    Args:
        address: Address text interpolated into the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        str: The decoded model reply.
    """
    # Prompt layout for Llama 3.2-1B-Instruct (user turn, then an open assistant turn)
    prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Extract address components from: {address}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

"""

    # Encode the prompt (capped at 512 tokens) and place every tensor on the
    # device that holds the model's parameters.
    target_device = next(model.parameters()).device
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {name: tensor.to(target_device) for name, tensor in encoded.items()}
    prompt_length = inputs['input_ids'].shape[1]

    # Sampling configuration passed to generate()
    sampling_options = dict(
        max_new_tokens=max_new_tokens,
        temperature=0.1,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.05,
    )

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model.generate(**inputs, **sampling_options)

    # Everything after the prompt is the model's reply
    reply_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(reply_tokens, skip_special_tokens=True).strip()

# Smoke test: run the extractor on a few hand-written addresses and print the replies.
test_addresses = [
    "C-704, Gayatri Shivam, Thakur Complex, Kandivali East, 400101",
    "Villa 141, Geown Oasis, V Kallahalli, Off Sarjapur, Bengaluru, Karnataka, 562125",
    "E401 Supertech Icon Indrapam 201301 UP",
]

print("🏠 ADDRESS EXTRACTION EXAMPLES")
print("=" * 50)

for i, address in enumerate(test_addresses, start=1):
    # Echo the input first so it is visible while the model is still generating.
    print(f"\n📍 Example {i}: {address}")
    result = extract_address_components(address)
    print(f"🤖 Extracted: {result}")


🏠 ADDRESS EXTRACTION EXAMPLES

📍 Example 1: C-704, Gayatri Shivam, Thakur Complex, Kandivali East, 400101
🤖 Extracted: {"building": "Gayatri Shivam", "house_details": "C-704", "locality": "Thakur Complex", "pincode": "400101"}

📍 Example 2: Villa 141, Geown Oasis, V Kallahalli, Off Sarjapur, Bengaluru, Karnataka, 562125
🤖 Extracted: {"building": "Geown Oasis", "house_details": "Villa 141", "locality": "V Kallahalli", "city": "Bengaluru", "state": "Karnataka", "pincode": "562125"}

📍 Example 3: E401 Supertech Icon Indrapam 201301 UP
🤖 Extracted: {"building": "Supertech Icon", "house_details": "E401", "city": "Noida", "state": "UP", "pincode": "201301"}


In [25]:
import json

def parse_extracted_address(extracted_str):
    """Return the first JSON object embedded in a model reply as a dict.

    The reply may carry text before or after the JSON, or a second (possibly
    cut-off) object. Each ``{`` is tried in turn as the start of an object and
    the first one that decodes to a dict wins; anything after it is ignored.

    Args:
        extracted_str: Raw text returned by the model. Non-string values
            (e.g. ``None`` or NaN) are treated as "nothing to parse".

    Returns:
        dict: The decoded object, or ``{}`` when no JSON object can be decoded.
    """
    if not isinstance(extracted_str, str):
        return {}

    decoder = json.JSONDecoder()
    start = extracted_str.find('{')
    while start != -1:
        try:
            # raw_decode stops at the end of the first complete JSON value,
            # so trailing text or extra braces do not invalidate the parse.
            parsed, _ = decoder.raw_decode(extracted_str, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        # This '{' did not open a valid object; try the next candidate.
        start = extracted_str.find('{', start + 1)
    return {}

# Extract components for all addresses (one model.generate() call per row, so this is slow),
# then parse each reply into a dict. Both intermediates stay in local Series
# instead of being added to address_df and dropped again.
raw_replies = address_df['address'].apply(extract_address_components)
parsed_replies = raw_replies.apply(parse_extracted_address)

# One column per key found in the parsed dicts, aligned on address_df's index.
components_df = pd.DataFrame(parsed_replies.tolist(), index=address_df.index)

# Define the desired columns
entity_columns = ['building', 'house_details', 'locality', 'pincode', 'city', 'state']

# Ensure all desired columns exist, filling with NaN if they don't
for col in entity_columns:
    if col not in components_df.columns:
        components_df[col] = np.nan

# Drop entity columns left over from an earlier run of this cell so that
# re-running it does not produce duplicate column names, then attach the fresh ones.
address_df = pd.concat(
    [address_df.drop(columns=entity_columns, errors='ignore'), components_df[entity_columns]],
    axis=1,
)

# Display the result
print(address_df.head())

                                    complete_address  \
0  62, Guha Zila, Opposite Mall, Pune, Maharashtr...   
1  41/935 Sha Path, Opposite Mall, Ahmedabad, Guj...   
2  26/36 Soman Marg, Behind School, Hyderabad, Te...   
3  525, Khalsa Nagar, Behind School, Varanasi, Ut...   
4  H.No. 51 Srinivas Path, Behind School, Hyderab...   

                                             address  label building  \
0  62, Guha Zila, Opposite Mall, Pune, Maharashtr...      0      NaN   
1            41/935 Opposite Mall, Ahmedabad, 783648      1      NaN   
2  26/36 Soman Marg, Behind School, Hyderabad, Te...      0      NaN   
3  525, Khalsa Nagar, Behind School, Varanasi, Ut...      0      NaN   
4  H.No. 51 Srinivas Path, Behind School, Hyderab...      0      NaN   

  house_details      locality pincode       city          state  
0            62    Guha Zilla  411533       Pune    Maharashtra  
1        41/935           NaN  382648  Ahmedabad            NaN  
2         26/36           NaN  5

In [26]:
# Persist the subset together with the extracted component columns (index column omitted).
address_df.to_csv('data/synthetic_address_subset_data.csv',index=False)

In [None]:
# Show the first rows of the final frame, including the extracted component columns.
address_df.head()

Unnamed: 0,complete_address,address,label,building,house_details,locality,pincode,city,state
0,"62, Guha Zila, Opposite Mall, Pune, Maharashtr...","62, Guha Zila, Opposite Mall, Pune, Maharashtr...",0,,62,Guha Zilla,411533,Pune,Maharashtra
1,"41/935 Sha Path, Opposite Mall, Ahmedabad, Guj...","41/935 Opposite Mall, Ahmedabad, 783648",1,,41/935,,382648,Ahmedabad,
2,"26/36 Soman Marg, Behind School, Hyderabad, Te...","26/36 Soman Marg, Behind School, Hyderabad, Te...",0,,26/36,,500006,Hyderabad,Telangana
3,"525, Khalsa Nagar, Behind School, Varanasi, Ut...","525, Khalsa Nagar, Behind School, Varanasi, Ut...",0,,525,Khalsa Nagar,201489,Varanasi,Uttar Pradesh
4,"H.No. 51 Srinivas Path, Behind School, Hyderab...","H.No. 51 Srinivas Path, Behind School, Hyderab...",0,,H.No. 51,,500095,Hyderabad,Telangana
