In [113]:
import pandas as pd
from transformers import AutoTokenizer
from amseg.amharicSegmenter import AmharicSegmenter
import sentencepiece as spm
import emoji
import re


In [114]:
# Load the telegram dataset
file_path = 'C:/Users/Administrator/Documents/kifiya/Week_5/telegram_data.csv'
df = pd.read_csv(file_path)

In [115]:
# Function to remove emojis (with type checking)
def remove_emoji(text):
    if isinstance(text, str):
        return emoji.replace_emoji(text, replace='')
    return text  # Return the original value if it's not a string

# Function to remove symbols and non-alphabetic characters (with type checking)
def remove_symbols(text):
    if isinstance(text, str):
        return re.sub(r'[^A-Za-z0-9ሀ-ፐ\s]+', '', text)
    return text  # Return the original value if it's not a string

# Apply the functions to the 'Message' column
df['Message'] = df['Message'].apply(remove_emoji).apply(remove_symbols)
df['Message'] = df['Message'].fillna('')

In [116]:
# Extract the messages for further processing
tokens = df['Message'].tolist()

# Initialize the tokenizer
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to tokenize the text (with type checking)
def tokenize_text(tokenizer, tokens):
    aligned_tokens = []
    for word in tokens:
        if isinstance(word, str):  # Ensure that the token is a string
            tokenized_word = tokenizer.tokenize(word)
            aligned_tokens.extend(tokenized_word)
    return aligned_tokens

# Tokenize the text
aligned_tokens = tokenize_text(tokenizer, tokens)

# Display the first 20 tokenized results
print("Aligned Tokens:")
for token in aligned_tokens[:20]:
    display(f"{token}")



Aligned Tokens:


'▁C'

'LAS'

'ICO'

'▁Price'

'▁2200'

'▁No'

'▁gift'

'▁box'

'▁Free'

'▁Delivery'

'▁In'

'box'

'▁Hi'

'we'

'52'

'66'

'▁ስልክ'

'▁25'

'1945'

'355'

In [117]:

# Initialize the Amharic segmenter
sent_punct = []
word_punct = []
segmenter = AmharicSegmenter(sent_punct, word_punct)

# Function to segment tokens for Amharic
# Function to segment Amharic text (with type checking)
def segment_amharic_text(segmenter, tokens):
    segmented_tokens = []
    for word in tokens:
        if isinstance(word, str):  # Ensure the token is a string
            if any(char in word for char in 'አ-ፈ'):
                tokenized_word = segmenter.amharic_tokenizer(word)
            else:
                tokenized_word = word  # Leave non-Amharic words as is
            segmented_tokens.append(tokenized_word)
    return segmented_tokens

# Segment Amharic tokens
segmented_tokens = segment_amharic_text(segmenter, tokens)

# Display the first 20 segmented results
display("Segmented Amharic Tokens:", segmented_tokens[:20])

'Segmented Amharic Tokens:'

['',
 '',
 ['\nCLASICO\n\nPrice',
  '2200',
  'No',
  'gift',
  'box\nFree',
  'Delivery\nInbox',
  'Hiwe5266\nስልክ',
  '251945355266\n\n',
  'ፋሽን',
  'ተራ',
  'Fashion',
  'Tera',
  '\n',
  '\nአድራሻ',
  'አዲስ',
  'አበባ',
  'ጦር',
  'ሀይሎች',
  'ድሪም',
  'ታወር',
  '2ተኛ',
  'ፎቅ\nቢሮ',
  'ቁጥር',
  '205\n'],
 '',
 ['\nPuma',
  '\nMade',
  'in',
  'Vietnam',
  '\n\nSize',
  '404143\nPrice',
  '3600\nFree',
  'Delivery\nInbox',
  'Hiwe5266\nስልክ',
  '251945355266\n\n',
  'ፋሽን',
  'ተራ',
  'Fashion',
  'Tera',
  '\n',
  '\nአድራሻ',
  'አዲስ',
  'አበባ',
  'ጦር',
  'ሀይሎች',
  'ድሪም',
  'ታወር',
  '2ተኛ',
  'ፎቅ\nቢሮ',
  'ቁጥር',
  '205\n'],
 '',
 '',
 '',
 ['New',
  'year',
  'Discount',
  '\n\n',
  '\n\nInbox',
  'Hiwe5266\nስልክ',
  '251945355266\n\n',
  'ፋሽን',
  'ተራ',
  'Fashion',
  'Tera',
  '\n',
  '\nአድራሻ',
  'አዲስ',
  'አበባ',
  'ጦር',
  'ሀይሎች',
  'ድሪም',
  'ታወር',
  '2ተኛ',
  'ፎቅ\nቢሮ',
  'ቁጥር',
  '205\n\n'],
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 '',
 ['\nSkechers',
  '\nMade',
  'in',
  'Vietnam',
  '\n\nSize',
  '

In [118]:
# Save the final tokens to a CSV file
output_df = pd.DataFrame({'Token': segmented_tokens})
output_df.to_csv('C:/Users/Administrator/Documents/kifiya/Week_5/final_telegram_tokens.csv', index=False)

In [119]:

# Display tokens from index 50 to 80
print("Display Results from Index 50 to 80:")
for token in segmented_tokens[50:80]:
    display(f"{token}")


Display Results from Index 50 to 80:


"['\\nNike', 'Air', 'Formula', '23\\nMade', 'in', 'Vietnam', '\\n\\nSize', '4243\\nPrice', '3400\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

''

"['\\nNike', 'Dunk', 'vs', 'Af1\\nMade', 'in', 'Vietnam', '\\n\\nSize', '4043\\nPrice', '4500\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

"['\\nRolex', 'Men', 'Watch', '\\n\\nPrice', '2200', 'No', 'gift', 'box\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

"['\\nCYBER', 'Men', 'Watch', '\\n\\nPrice', '2500', 'No', 'gift', 'box\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

''

"['\\nFashion', 'Shose', '\\n\\nSize', '4041\\nPrice', '3200\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

"['\\nSpeed', 'Couple', 'watch', '\\n\\nPrices', '3500\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

"['\\nFashion', 'Shose', '\\n\\nSize', '4243\\nPrice', '2900\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

''

"['\\nRolex', 'Couple', 'watch', '\\n', '\\nPrices', '3900\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

''

''

''

"['\\nBalenciaga\\nMade', 'in', 'Vietnam', '\\n\\nSize', '3637383940\\nPrice', '2900\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

"['\\nNike', 'Dunk', 'SB\\nMade', 'in', 'Vietnam', '\\n\\nSize', '4243\\nPrice', '3500\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

"['\\nAdidas', '\\nMade', 'in', 'Vietnam', '\\n\\nSize', '414243\\nPrice', '3400\\nFree', 'Delivery\\nInbox', 'Hiwe5266\\nስልክ', '251945355266\\n\\n', 'ፋሽን', 'ተራ', 'Fashion', 'Tera', '\\n', '\\nአድራሻ', 'አዲስ', 'አበባ', 'ጦር', 'ሀይሎች', 'ድሪም', 'ታወር', '2ተኛ', 'ፎቅ\\nቢሮ', 'ቁጥር', '205\\n']"

''

''

''

''

''

''

In [120]:
# Convert all tokens to strings before writing to the file
tokens = [str(token) for token in tokens]

# Write the tokens to a temporary text file for SentencePiece training
with open('temp_text.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(tokens))


# Train SentencePiece model with a vocab size <= 952
spm.SentencePieceTrainer.train('--input=temp_text.txt --model_prefix=AmharicSPM --vocab_size=950')

In [121]:
# Initialize SentencePiece tokenizer
tokenizer_spm = spm.SentencePieceProcessor(model_file='AmharicSPM.model')

# Tokenize the input text with SentencePiece
sentencepiece_tokens = tokenizer_spm.encode("ቁ.2 ለቡ መዳህኒዓለም and here is some English text.", out_type=str)

# Print the tokenized words
print("SentencePiece Tokenization Results:")
print(sentencepiece_tokens)

SentencePiece Tokenization Results:
['▁', 'ቁ', '.', '2', '▁', 'ለ', 'ቡ', '▁መ', 'ዳህኒዓ', 'ለ', 'ም', '▁', 'and', '▁', 'he', 're', '▁', 'is', '▁s', 'om', 'e', '▁E', 'ng', 'li', 'sh', '▁', 't', 'ex', 't', '.']


In [122]:


# Tokenize and align using SentencePiece
def tokenize_with_sentencepiece(tokenizer_spm, tokens):
    aligned_tokens = []
    token_ids = []
    for word in tokens:
        tokenized_ids = tokenizer_spm.encode(word, out_type=int)
        tokenized_words = tokenizer_spm.decode(tokenized_ids).split()
        aligned_tokens.extend(tokenized_words)
        token_ids.extend(tokenized_ids)
    return aligned_tokens, token_ids

# Tokenize with SentencePiece
custom_tokens, custom_ids = tokenize_with_sentencepiece(tokenizer_spm, tokens)

# Display results from index 50 to 80
print("Custom Tokenization Results (Index 50 to 80):")
for token, token_id in zip(custom_tokens[50:80], custom_ids[50:80]):
    display(f"{token:<25} {token_id:<10}")


Custom Tokenization Results (Index 50 to 80):


'ሀይሎች                      15        '

'ድሪም                       34        '

'ታወር                       27        '

'2ተኛ                       3         '

'ፎቅ                        35        '

'ቢሮ                        32        '

'ቁጥር                       7         '

'205                       17        '

'New                       22        '

'year                      39        '

'Discount                  44        '

'Inbox                     37        '

'Hiwe5266                  3         '

'ስልክ                       48        '

'251945355266              41        '

'ፋሽን                       42        '

'ተራ                        7         '

'Fashion                   17        '

'Tera                      69        '

'አድራሻ                      89        '

'አዲስ                       18        '

'አበባ                       19        '

'ጦር                        10        '

'ሀይሎች                      43        '

'ድሪም                       33        '

'ታወር                       9         '

'2ተኛ                       96        '

'ፎቅ                        270       '

'ቢሮ                        6         '

'ቁጥር                       503       '