In [1]:
# Import necessary libraries
import pandas as pd
from google.colab import drive
import os

# Make Google Drive available under /content/drive
drive.mount('/content/drive')


# Location of the preprocessed Telegram export on the mounted drive
csv_file_path = '/content/drive/MyDrive/preprocessed_telegram_data.csv'

# Report whether the CSV is present before later cells try to read it
if os.path.exists(csv_file_path):
    print(f"CSV file found at {csv_file_path}")
else:
    print(f"Error: CSV file not found at {csv_file_path}")
    print("Please ensure the CSV file is in the specified Google Drive path and the path is correct.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
CSV file found at /content/drive/MyDrive/preprocessed_telegram_data.csv


In [2]:
# Read the preprocessed CSV into `df` and print a short preview.
# Any failure is reported as a one-line message instead of a traceback.
try:
    df = pd.read_csv(csv_file_path)
    print(
        "DataFrame loaded successfully!",
        f"Total messages: {len(df)}",
        "First 5 rows of the DataFrame:",
        df.head(),
        sep="\n",
    )
except Exception as e:
    print(f"Error loading CSV: {e}")

DataFrame loaded successfully!
Total messages: 1500
First 5 rows of the DataFrame:
    Channel Title Channel Username  Message ID  \
0  Zemen Express®    @ZemenExpress        7004   
1  Zemen Express®    @ZemenExpress        7003   
2  Zemen Express®    @ZemenExpress        7002   
3  Zemen Express®    @ZemenExpress        7001   
4  Zemen Express®    @ZemenExpress        7000   

                                             Message  \
0  Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን አ...   
1                                                NaN   
2                                                NaN   
3                                                NaN   
4  Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን አ...   

                        Date                 Media Path  
0  2025-06-23 14:55:46+00:00  Photo attached (ID: 7004)  
1  2025-06-23 14:55:40+00:00  Photo attached (ID: 7003)  
2  2025-06-23 14:55:40+00:00  Photo attached (ID: 7002)  
3  2025-06-23 14:55:40+00:00  Photo attached (I

In [4]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

# --- NLTK Data Downloads ---
# word_tokenize needs the 'punkt' tokenizer data; 'punkt_tab' is fetched as
# well because word_tokenize sometimes requires it.
# nltk.download() reports a failed download by returning False rather than by
# raising, so the return value is checked before announcing success.
for nltk_resource in ('punkt', 'punkt_tab'):
    try:
        download_ok = nltk.download(nltk_resource, quiet=True)
    except Exception as e:
        print(f"Error downloading NLTK '{nltk_resource}' data: {e}. Please ensure you have an internet connection.")
    else:
        if download_ok:
            print(f"NLTK '{nltk_resource}' data downloaded successfully.")
        else:
            print(f"Error downloading NLTK '{nltk_resource}' data: download failed. Please ensure you have an internet connection.")


# Collects the output lines: one "token<TAB>label" entry per labeled token and
# an empty string after each message.
labeled_data_conll = list()

# How many rows from the top of the DataFrame are offered for labeling
num_messages_to_label = 50

# Banner and instructions for the annotator, emitted in a single call
print(
    f"\n--- Starting Manual CoNLL Labeling for {num_messages_to_label} Messages ---",
    "Instructions: For each token, provide its CoNLL label (B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O).",
    "Type 'done' when you are finished labeling a message. Type 'skip' to move to the next message.",
    "Example: 'ይህ B-Product ስልክ I-Product 5000 B-PRICE ብር I-PRICE ነው። O'",
    sep="\n",
)


# Reload the CSV and verify that the two columns used by the labeling loop
# ('Message' and 'Message ID') are present.
try:
    df = pd.read_csv(csv_file_path)
    if 'Message' not in df.columns or 'Message ID' not in df.columns:
        raise ValueError("DataFrame must contain 'Message' and 'Message ID' columns.")
except FileNotFoundError:
    print(f"Error: CSV file not found at '{csv_file_path}'. Please check the path and try again.")
    # Re-raise instead of calling exit(): inside an IPython/Colab kernel exit()
    # asks the kernel to shut down rather than stopping at this line, so the
    # code below could still start without usable data. Raising halts the cell
    # immediately in a notebook and in a plain script alike.
    raise
except Exception as e:
    print(f"Error loading CSV or processing DataFrame: {e}")
    raise

# Interactive labeling loop.
# For each of the first `num_messages_to_label` rows: show the message,
# tokenize it, prompt for one label per token, and append the resulting
# "token<TAB>label" lines (followed by a blank separator) to labeled_data_conll.

# Entity tags accepted at the prompt, plus the two control commands.
ENTITY_LABELS = ('B-Product', 'I-Product', 'B-LOC', 'I-LOC', 'B-PRICE', 'I-PRICE', 'O')
CONTROL_COMMANDS = ('skip', 'done')
ACCEPTED_INPUTS = ENTITY_LABELS + CONTROL_COMMANDS

for i, message_row in df.head(num_messages_to_label).iterrows():
    raw_message = message_row['Message']
    # Rows without text hold NaN in 'Message'. str(NaN) is the non-empty string
    # 'nan', so missing values must be detected before the conversion;
    # otherwise the empty-text check below never fires and 'nan' is offered
    # for labeling.
    message_text = '' if pd.isna(raw_message) else str(raw_message).strip()
    # Fall back to a synthetic ID if the row has no 'Message ID' entry
    message_id = message_row.get('Message ID', f"No_ID_{i+1}")

    if not message_text:
        print(f"\nMessage {i+1} (ID: {message_id}): [No text content - Skipping]")
        continue

    print(f"\n--- Message {i+1} (ID: {message_id}) ---")
    print(f"Original Message: {message_text}")

    # Tokenize the message (using NLTK's word_tokenize)
    tokens = word_tokenize(message_text)

    current_labeled_message = []
    print("\nTokens and their labels:")
    for token in tokens:
        label = input(f"Token: '{token}' -> Label: ").strip()
        # Keep prompting until the annotator enters a known label or command
        while label not in ACCEPTED_INPUTS:
            print(f"Invalid label. Please use one of: {', '.join(ACCEPTED_INPUTS)}")
            label = input(f"Token: '{token}' -> Label: ").strip()

        if label == 'skip':
            # Discard any partial labels for this message
            current_labeled_message = []
            break
        elif label == 'done':
            # Stop labeling this message; tokens labeled so far are kept and
            # the remaining tokens are left out of the output.
            break
        else:
            current_labeled_message.append(f"{token}\t{label}")

    if current_labeled_message:
        labeled_data_conll.extend(current_labeled_message)
        labeled_data_conll.append("")  # Blank line separates messages in CoNLL

# Name of the plain-text file that receives the labeled data
output_filename = "labeled_amharic_ner_data.conll"

print("\n--- Manual Labeling Complete ---")
print(f"Total lines in CoNLL format (including blank lines): {len(labeled_data_conll)}")

# Write every collected entry on its own line (UTF-8)
with open(output_filename, "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in labeled_data_conll)

print(f"\nLabeled data saved to {output_filename}")

NLTK 'punkt' data downloaded successfully.
NLTK 'punkt_tab' data downloaded successfully.

--- Starting Manual CoNLL Labeling for 50 Messages ---
Instructions: For each token, provide its CoNLL label (B-Product, I-Product, B-LOC, I-LOC, B-PRICE, I-PRICE, O).
Type 'done' when you are finished labeling a message. Type 'skip' to move to the next message.
Example: 'ይህ B-Product ስልክ I-Product 5000 B-PRICE ብር I-PRICE ነው። O'
DataFrame loaded successfully!
Total messages: 1500
First 5 rows of the DataFrame:
    Channel Title Channel Username  Message ID  \
0  Zemen Express®    @ZemenExpress        7004   
1  Zemen Express®    @ZemenExpress        7003   
2  Zemen Express®    @ZemenExpress        7002   
3  Zemen Express®    @ZemenExpress        7001   
4  Zemen Express®    @ZemenExpress        7000   

                                             Message  \
0  Electric Charcoal Burner በቀላሉ ከሰል ለማያያዝ የሚሆን አ...   
1                                                NaN   
2                         