In [3]:
import pandas as pd
from transformers import AutoTokenizer
from amseg.amharicSegmenter import AmharicSegmenter
import os


## Read labeled data from a text file

In [4]:

# Read labeled data from a text file
with open('C:/Users/Administrator/Documents/10Academy/week 5/Technical Content/Data/labeled_telegram_product_price_location.txt', 'r') as file:
    lines = file.readlines()

## Process lines

In [5]:
# Process lines as needed
data = [line.strip().split('\t') for line in lines]  # Adjust the split based on your delimiter
df = pd.DataFrame(data)

# Process lines: separate tokens and labels
data = [line.strip().split() for line in lines if line.strip()]  # Split based on spaces
tokens = [item[0] for item in data]  # Extract tokens
labels = [item[1] for item in data]  # Extract labels

## Initialize the tokenizer

In [6]:

model_name = "xlm-roberta-base"  # Change to any appropriate model from Hugging Face if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize and align labels for both Amharic and English texts
def tokenize_and_align_labels(tokenizer, tokens, labels):
    aligned_tokens = []
    aligned_labels = []

    for word, label in zip(tokens, labels):
        tokenized_word = tokenizer.tokenize(word)  # Tokenize the word
        aligned_tokens.extend(tokenized_word)  # Add tokens to the list

        # Assign the label to the first subtoken and 'O' to subsequent subtokens
        aligned_labels.extend([label] + ['O'] * (len(tokenized_word) - 1))

    return aligned_tokens, aligned_labels

# Tokenize and align labels
aligned_tokens, aligned_labels = tokenize_and_align_labels(tokenizer, tokens, labels)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]



## Display the first 20 results

In [7]:

print("Aligned Tokens and Labels:")
for token, label in zip(aligned_tokens[:20], aligned_labels[:20]):
    print(f"{token:20} {label}")

Aligned Tokens and Labels:
▁3                   B-PRODUCT
pc                   O
s                    O
▁si                  I-PRODUCT
li                   O
con                  O
▁brush               I-PRODUCT
▁spa                 I-PRODUCT
tul                  O
as                   O
▁እስከ                 O
▁2                   O
60°                  O
c                    O
▁ሙ                   O
ቀት                   O
▁መቆ                  O
ቆም                   O
▁የሚችል                O
▁ዋጋ                  I-PRICE


## Initialize the Amharic segmenter

In [13]:

segmenter = AmharicSegmenter(sent_punct, word_punct)

# Function to align tokens with their respective labels for Amharic
def align_tokens_with_labels(segmenter, tokens, labels):
    aligned_tokens = []
    aligned_labels = []

    for word, label in zip(tokens, labels):
        if any(char in word for char in 'አ-ፈ'):  # Check if the word contains Amharic characters
            tokenized_word = segmenter.amharic_tokenizer(word)  # Tokenize the word using Amharic segmenter
        else:
            tokenized_word = [word]  # Keep the word as it is if not Amharic

        aligned_tokens.extend(tokenized_word)  # Add tokens to the list
        aligned_labels.extend([label] + ['O'] * (len(tokenized_word) - 1))

    return aligned_tokens, aligned_labels

# Align tokens and labels for Amharic
new_tokens, new_labels = align_tokens_with_labels(segmenter, tokens, labels)

# Output the first 20 aligned tokens and labels
print("Amharic Tokens and Labels:")
for token, label in zip(new_tokens[:20], new_labels[:20]):
    print(f"{token}: {label}")


Amharic Tokens and Labels:
3pcs: B-PRODUCT
silicon: I-PRODUCT
brush: I-PRODUCT
spatulas: I-PRODUCT
እስከ: O
260°c: O
ሙቀት: O
መቆቆም: O
የሚችል: O
ዋጋ-550ብር: I-PRICE
አድራሻ: O
ቁ.1: O
ስሪ: O
ኤም: O
ሲቲ: O
ሞል: O
ሁለተኛ: O
ፎቅ: O
ቢሮ: O
ቁ.: O


## Saving the final tokens and labels to a CSV file

In [14]:

output_df = pd.DataFrame({'Token': new_tokens, 'Label': new_labels})
output_df.to_csv('C:/Users/Administrator/Documents/kifiya/Week_5/final_tokens_labels.csv', index=False)


## Display results from index 50 to 80 with formatting

In [15]:

print("Display Results from Index 50 to 80:")
for token, label in zip(new_tokens[50:80], new_labels[50:80]):
    print(f"{token:<20} {label}")


Display Results from Index 50 to 80:
Slicer               I-PRODUCT
ጊዜ                   O
ቆጣቢ                  O
ስላይስ                 O
ማድረጊያ                O
ለእጅ                  O
ሴፍቲ                  O
ተመራጭ                 O
ለድንች                 O
ለካሮትና                O
ሌሎች                  O
አታክልቶች               O
ተመራጭ                 O
ጥራት                  O
ያለው                  O
ዕቃ                   O
ዋጋ፦                  I-PRICE
1,200                O
ብር                   I-PRICE
አድራሻ                 O
ቁ.1                  O
ስሪ                   O
ኤም                   O
ሲቲ                   O
ሞል                   O
ሁለተኛ                 O
ፎቅ                   O
ቢሮ                   O
ቁ.                   O
SL-05A(ከ             O


## Load the dataset for additional processing

In [16]:

from datasets import load_dataset
import sentencepiece as spm


In [17]:
# Load the dataset
dataset = load_dataset("israel/Amharic-News-Text-classification-Dataset")

# Extract the text from the dataset
texts = dataset['train']['article']

# Filter out None values and prepare the data for SentencePiece
filtered_texts = [text for text in texts if text is not None]  # Remove None values
text_data = "\n".join(filtered_texts)

# Save to a temporary file
with open('temp_text.txt', 'w', encoding='utf-8') as f:
    f.write(text_data)

README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


train.csv:   0%|          | 0.00/150M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/37.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/41186 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10297 [00:00<?, ? examples/s]

## Train SentencePiece model

In [18]:
spm.SentencePieceTrainer.train('--input=temp_text.txt --model_prefix=AmharicSPM --vocab_size=100000')

### Tokenize 

In [19]:
# Initialize SentencePiece tokenizer
tokenizer_spm = spm.SentencePieceProcessor(model_file='AmharicSPM.model')

# Tokenize the input text with SentencePiece
text = "ቁ.2 ለቡ መዳህኒዓለም and here is some English text."
tokens_spm = tokenizer_spm.encode(text, out_type=str)  # Use encode method for tokenization

# Print the tokenized words
print("SentencePiece Tokenization Results:")
print(tokens_spm)

SentencePiece Tokenization Results:
['▁ቁ', '.', '2', '▁ለቡ', '▁መዳ', 'ህ', 'ኒ', 'ዓ', 'ለም', '▁and', '▁here', '▁is', '▁some', '▁Engl', 'ish', '▁', 'te', 'x', 't', '.']


### Process lines: separate tokens and labels again for SentencePiece

In [20]:

data = [line.strip().split() for line in lines if line.strip()]  # Split based on spaces
tokens = [item[0] for item in data]  # Extract tokens
labels = [item[1] for item in data]  # Extract labels

### Tokenization and alignment with SentencePiece

In [21]:

def tokenize_and_align_labels_spm(tokenizer_spm, tokens, labels):
    aligned_tokens = []
    aligned_labels = []
    token_ids = []

    for word, label in zip(tokens, labels):
        tokenized_ids = tokenizer_spm.encode(word, out_type=int)  # Get token IDs
        tokenized_words = tokenizer_spm.decode(tokenized_ids).split()  # Decode back to words

        aligned_tokens.extend(tokenized_words)  # Add decoded words to the list
        token_ids.extend(tokenized_ids)  # Add token IDs to the list

        aligned_labels.extend([label] + ['O'] * (len(tokenized_words) - 1))

    return aligned_tokens, token_ids, aligned_labels

# Tokenize and align labels using SentencePiece
custom_tokens, custom_ids, custom_labels = tokenize_and_align_labels_spm(tokenizer_spm, tokens, labels)

# Display results from index 50 to 80 with tokenized words, their IDs, and labels
print("Custom Tokenization Results (Index 50 to 80):")
for token, token_id, label in zip(custom_tokens[50:80], custom_ids[50:80], custom_labels[50:80]):
    print(f"{token:<25} {token_id:<10} {label}")


Custom Tokenization Results (Index 50 to 80):
ዛም                        10519      O
⁇                         0          O
ሞል                        618        O
2ኛ                        29060      O
ፎቅ                        12940      O
ቢሮ                        531        O
ቁጥር                       2468       O
⁇                         0          O
214                       6561       O
ለቡ                        1110       I-LOC
ቅርንጫፍ0973611819           0          O
0909522840                49632      O
0923350054                1109       O
በTelegram                 4364       O
ለማዘዝ                      8          O
ይጠቀሙ                      0          O
⁇                         2778       O
shager                    5          O
⁇                         0          O
onlinestore               54941      O
ለተጨማሪ                     190        O
ማብራሪያ                     77         O
የቴሌግራም                    6723       O
ገፃችን                      226        O
https         