# Fine-tuning

In [1]:
# Install Hugging Face Transformers, Datasets, and PEFT (for parameter-efficient fine-tuning)
!pip install transformers




# Load the dataset.

In [2]:
import re
import pandas as pd
from transformers import AutoTokenizer

# Fetch the pretrained tokenizer for the checkpoint named below
checkpoint_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

# Step 1: Cleaning functions
# Compiled once; the character class is the union of the code-point ranges
# that are stripped from messages.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (e.g. U+1F9F3 luggage)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002700-\U000027BF"  # dingbats
    # Wide catch-all range kept from the original. It also spans non-emoji
    # blocks (for example CJK ideographs at U+4E00), but not Ethiopic or ASCII.
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with every run of emoji/pictograph characters deleted.

    Matched characters are removed outright (not replaced by a space), so the
    surrounding whitespace of the input is left exactly as it was.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub('', text)

def remove_punc_and_special_chars(text):
    """Delete every character listed in the punctuation class below.

    The class holds ASCII punctuation, typographic quotes/guillemets and
    Ethiopic punctuation marks. Matches are removed rather than replaced by a
    space, so the text on either side is joined. Underscores, digits and
    whitespace are not in the class and are left alone.
    """
    punctuation_class = re.compile(
        r'[\!\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}\;\“\”\›\’\‘\"\'\:\,\.\‹\/\<\>\?\\\\|\`\´\~\-\=\+\፡\።\፤\;\፦\፥\፧\፨\፠\፣]'
    )
    return punctuation_class.sub('', text)

def clean_text(text):
    """Strip emoji, then punctuation, then leading/trailing whitespace from ``text``."""
    without_emoji = remove_emoji(text)
    return remove_punc_and_special_chars(without_emoji).strip()

# Step 2: Labeling function
def label_tokens(text):
    """Assign rule-based BIO labels to the whitespace-separated tokens of ``text``.

    Rules are applied line by line, in this order (a later rule overwrites a
    label written by an earlier one):

    1. Every token on the first non-empty line is a product name
       (``B-Product`` for the first token, ``I-Product`` for the rest).
    2. A token containing "ዋጋ" or "price" (case-insensitive) is ``B-PRICE``.
       Each of the next two tokens on the same line becomes ``I-PRICE`` if it
       is all digits or contains "ብር".
    3. When a line contains "አድራሻ", the tokens of the next non-empty line are
       labelled ``B-LOC`` / ``I-LOC``.

    Labels are written by token position, so a word that occurs several times
    is labelled at the occurrence the rule actually matched.

    Args:
        text: Message text whose lines are separated by newline characters.

    Returns:
        A list of ``(token, label)`` tuples in reading order; empty for text
        without tokens.
    """
    # Tokenise per line so each token's position in `tokens` is known exactly.
    # Concatenating the per-line splits gives the same sequence as text.split().
    line_words = [line.split() for line in text.split('\n')]
    tokens = [word for words in line_words for word in words]
    labels = ['O'] * len(tokens)

    # Position in `tokens` of the first word of every line.
    line_starts = []
    position = 0
    for words in line_words:
        line_starts.append(position)
        position += len(words)

    def _tag_line(line_no, tag):
        """Label the whole line ``line_no`` as one B-/I- span of ``tag``."""
        start = line_starts[line_no]
        stop = start + len(line_words[line_no])
        labels[start] = "B-" + tag
        for pos in range(start + 1, stop):
            labels[pos] = "I-" + tag

    # NOTE: the "Product" tag keeps its original casing, which differs from
    # the upper-case PRICE/LOC tags.
    product_line = next((i for i, words in enumerate(line_words) if words), None)

    for i, words in enumerate(line_words):
        if not words:
            continue
        start = line_starts[i]

        # Rule 1: product line
        if i == product_line:
            _tag_line(i, "Product")

        # Rule 2: price keyword plus up to two following amount/currency tokens
        for j, word in enumerate(words):
            if "ዋጋ" in word or "price" in word.lower():
                labels[start + j] = "B-PRICE"
                for k in (1, 2):
                    if j + k < len(words) and (words[j + k].isdigit() or "ብር" in words[j + k]):
                        labels[start + j + k] = "I-PRICE"

        # Rule 3: the address follows the "አድራሻ" line, possibly after blank lines
        if any("አድራሻ" in word for word in words):
            address_line = next(
                (n for n in range(i + 1, len(line_words)) if line_words[n]),
                None,
            )
            if address_line is not None:
                _tag_line(address_line, "LOC")

    return list(zip(tokens, labels))

# Step 3: Run the cleaning + labeling pipeline on one example post
sample_message = """💥💥...................................💥💥

📌Saachi Electric Kettle

👍Borosilicate Glass Body
👍Overheat protection
👍Automatic switch off
👍2200w

ዋጋ፦  💲🏷 2700  ብር ✅

♦️ውስን ፍሬ ነው ያለው🔥🔥🔥

🏢 አድራሻ👉

📍♦️#መገናኛ_መሰረት_ደፋር_ሞል_ሁለተኛ_ፎቅ ቢሮ ቁ. S05/S06"""

# Emoji and punctuation are removed before the rule-based labeler runs
cleaned = clean_text(sample_message)
token_label_pairs = label_tokens(cleaned)

# Step 4: Write CoNLL output: one "token<TAB>label" row per token,
# then an empty line that terminates the message
with open("conll_labeled_data.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{token}\t{label}\n" for token, label in token_label_pairs)
    f.write("\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [4]:
# Mount Google Drive at /content/drive so files under MyDrive can be read from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
# Read the scraped Telegram messages from the mounted Drive and preview the first rows
telegram_csv_path = '/content/drive/MyDrive/telegram_data.csv'
df = pd.read_csv(telegram_csv_path)
df.head()

Unnamed: 0,Channel Title,Channel Username,Message,Date,Media Path
0,Zemen Express®,ZemenExpress,,2025-06-21 16:35:51+00:00,photos\ZemenExpress_6994.jpg
1,Zemen Express®,ZemenExpress,,2025-06-21 16:35:51+00:00,photos\ZemenExpress_6993.jpg
2,Zemen Express®,ZemenExpress,,2025-06-21 16:35:51+00:00,photos\ZemenExpress_6992.jpg
3,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n📌Sa...,2025-06-21 16:35:51+00:00,photos\ZemenExpress_6991.jpg
4,Zemen Express®,ZemenExpress,,2025-06-21 08:07:31+00:00,photos\ZemenExpress_6990.jpg


In [7]:
# Preview only the rows that have message text. The result is displayed but not
# assigned, so `df` itself still contains the rows with a missing Message.
df.dropna(subset=['Message'])


Unnamed: 0,Channel Title,Channel Username,Message,Date,Media Path
3,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n📌Sa...,2025-06-21 16:35:51+00:00,photos\ZemenExpress_6991.jpg
7,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n3pc...,2025-06-21 08:07:31+00:00,photos\ZemenExpress_6987.jpg
8,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n3pc...,2025-06-21 08:07:11+00:00,
9,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n📌1 ...,2025-06-21 05:42:46+00:00,photos\ZemenExpress_6985.jpg
11,Zemen Express®,ZemenExpress,💥💥...................................💥💥\n\n📌1 ...,2025-06-21 05:42:19+00:00,photos\ZemenExpress_6983.jpg
...,...,...,...,...,...
288,ልዩ እቃ,Leyueqa,🌀🌀🌀የሊጥ ማዞሪያ ለእንጀራ ለ ጨጨብሳ የሚሆን\n💵ዋጋ 3200 💵\n\n...,2025-06-12 08:48:44+00:00,
293,ልዩ እቃ,Leyueqa,🧳🧳🧳Samosa Maker 6 in 1\n\nማኑዋል በቀላሉ ሳንደክም ጉልበት...,2025-06-12 06:35:52+00:00,photos\Leyueqa_7555.jpg
296,ልዩ እቃ,Leyueqa,🔥🔥SOKANY Kettles Electric - Fast Boiling Kettl...,2025-06-11 12:27:14+00:00,photos\Leyueqa_7552.jpg
297,ልዩ እቃ,Leyueqa,🔝🔝🔝ነጭ ሽንኩርት መላጪያ እና የሽንኩርት የስጋ መፍጪያ Kitchen E...,2025-06-11 08:22:59+00:00,


In [11]:
import re
import pandas as pd

# Cleaning Functions
# Compiled once; the character class is the union of the code-point ranges
# that are stripped from messages.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs (e.g. U+1F9F3 luggage)
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U00002700-\U000027BF"  # dingbats
    # Wide catch-all range kept from the original. It also spans non-emoji
    # blocks (for example CJK ideographs at U+4E00), but not Ethiopic or ASCII.
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``str(text)`` with every run of emoji/pictograph characters deleted.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. Matched characters are removed outright (not replaced by a
    space), so the surrounding whitespace is left exactly as it was.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string.
    """
    return _EMOJI_PATTERN.sub('', str(text))

def remove_punc_and_special_chars(text):
    """Delete every character listed in the punctuation class below.

    The argument is first converted with ``str()``. The class holds ASCII
    punctuation, typographic quotes/guillemets and Ethiopic punctuation marks.
    Matches are removed rather than replaced by a space, so the text on either
    side is joined. Underscores, digits and whitespace are left alone.
    """
    punctuation_class = re.compile(
        r'[\!\@\#\$\%\^\«\»\&\*\(\)\…\[\]\{\}\;\“\”\›\’\‘\"\'\:\,\.\‹\/\<\>\?\\\\|\`\´\~\-\=\+\፡\።\፤\;\፦\፥\፧\፨\፠\፣]'
    )
    as_string = str(text)
    return punctuation_class.sub('', as_string)

def clean_text(text):
    """Strip emoji, then punctuation, then leading/trailing whitespace from ``text``."""
    without_emoji = remove_emoji(text)
    return remove_punc_and_special_chars(without_emoji).strip()
def label_tokens(text):
    """Assign rule-based BIO labels to the whitespace-separated tokens of ``text``.

    Rules are applied line by line, in this order (a later rule overwrites a
    label written by an earlier one):

    1. Every token on the first non-empty line is a product name
       (``B-Product`` for the first token, ``I-Product`` for the rest).
    2. A token containing "ዋጋ" or "price" (case-insensitive) is ``B-PRICE``,
       and the next two tokens on the same line, whatever they are, become
       ``I-PRICE``.
    3. When a line contains "አድራሻ", the tokens of the next non-empty line are
       labelled ``B-LOC`` / ``I-LOC``.

    Labels are written by token position, so a word that occurs several times
    is labelled at the occurrence the rule actually matched.

    Args:
        text: Message text whose lines are separated by newline characters.

    Returns:
        A list of ``(token, label)`` tuples in reading order; empty for text
        without tokens.
    """
    # Tokenise per line so each token's position in `tokens` is known exactly.
    # Concatenating the per-line splits gives the same sequence as text.split().
    line_words = [line.split() for line in text.split('\n')]
    tokens = [word for words in line_words for word in words]
    labels = ['O'] * len(tokens)

    # Position in `tokens` of the first word of every line.
    line_starts = []
    position = 0
    for words in line_words:
        line_starts.append(position)
        position += len(words)

    def _tag_line(line_no, tag):
        """Label the whole line ``line_no`` as one B-/I- span of ``tag``."""
        start = line_starts[line_no]
        stop = start + len(line_words[line_no])
        labels[start] = "B-" + tag
        for pos in range(start + 1, stop):
            labels[pos] = "I-" + tag

    # NOTE: the "Product" tag keeps its original casing, which differs from
    # the upper-case PRICE/LOC tags.
    product_line = next((i for i, words in enumerate(line_words) if words), None)

    for i, words in enumerate(line_words):
        if not words:
            continue
        start = line_starts[i]

        # Rule 1: product line
        if i == product_line:
            _tag_line(i, "Product")

        # Rule 2: price keyword plus the (up to) two tokens that follow it.
        # NOTE(review): the followers are not checked for digits or "ብร"-style
        # currency words here; confirm that labelling them unconditionally is intended.
        for j, word in enumerate(words):
            if "ዋጋ" in word or "price" in word.lower():
                labels[start + j] = "B-PRICE"
                for k in (1, 2):
                    if j + k < len(words):
                        labels[start + j + k] = "I-PRICE"

        # Rule 3: the address follows the "አድራሻ" line, possibly after blank lines
        if any("አድራሻ" in word for word in words):
            address_line = next(
                (n for n in range(i + 1, len(line_words)) if line_words[n]),
                None,
            )
            if address_line is not None:
                _tag_line(address_line, "LOC")

    return list(zip(tokens, labels))

# Reload the scraped messages and keep only the rows that have message text
df = pd.read_csv('/content/drive/MyDrive/telegram_data.csv').dropna(subset=['Message'])

# Label the first 50 messages and write them in CoNLL format:
# one "token<TAB>label" row per token, one empty line after each message
with open("conll_labeled_data.txt", "w", encoding="utf-8") as f:
    for message in df['Message'].head(50):
        if pd.isnull(message):
            continue
        cleaned = clean_text(message)
        token_label_pairs = label_tokens(cleaned)
        f.writelines(f"{token}\t{label}\n" for token, label in token_label_pairs)
        f.write("\n")
