<a href="https://colab.research.google.com/github/tadiwamark/Shona-Language-Model/blob/main/language_modelling_R204445V.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
'''
--------------------------------------------------------------------------
DISCLAIMER:
This model is specifically trained on Jehovah's Witness reading material.
It does not generalize or represent the entire Shona language. Use with
this context in mind.Also note that due to limited computational resources
I ended up using a smaller dataset and a model with less layers as the RAM
usage kept going as high as 52 Gig
--------------------------------------------------------------------------
'''

"\n--------------------------------------------------------------------------\nDISCLAIMER:\nThis model is specifically trained on Jehovah's Witness reading material.\nIt does not generalize or represent the entire Shona language. Use with\nthis context in mind.\n--------------------------------------------------------------------------\n"

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### 1. Dataset

In [3]:
!pip install PyPDF2



In [4]:
import PyPDF2

def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = "".join([page.extract_text() for page in reader.pages])
    return text


shona_text = extract_text_from_pdf("/content/drive/My Drive/Colab Notebooks/shona_article.pdf")


In [5]:
!pip install stopwordsiso



In [6]:
import stopwordsiso as stopwords

# Get Shona stopwords
shona_stopwords = stopwords.stopwords("sn")

def remove_stopwords(text):
    # Split the text into words
    words = text.split()

    # Remove stopwords
    filtered_words = [word for word in words if word not in shona_stopwords]

    # Join words back into a string
    return ' '.join(filtered_words)

In [7]:
shona_text=remove_stopwords(shona_text)

### 2. Vocabulary

In [8]:
shona_text

'34567NOVEMBER 2023 NYAYA DZEKUDZIDZA JANUARY 8–FEBRUARY 4, 2024MUPROFITA ISAYA akataura kuti Jehovha ‘ndiMwari wecho- kwadi.’ Shoko rakashandurwa kuti “chokwadi” rinoreva kuti“ameni.” (Isa. 65:16, mashoko emuzasi) “Ameni” zvinorevakuti “ngazviitike,” kana kuti “chokwadi.” Shoko rekuti “ameni”parinoshandiswa muBhaibheri richitaura nezvaJehovha kanakuti Jesu, rinoita kuti tive nechokwadi chekuti zvavachataurazvichaitika. Saka zvairehwa naIsaya paaitaura nevaIsraeri nde-zvekuti: Zvese zvinotaurwa naJehovha zvinotoitika. Jehovhaakaratidza izvozvo nekuzadzisa zvese zvaakavimbisa. 2Tinogonawo here kuva nechokwadi chekuti zvatakavimbi- swa naJehovha nezveremangwana zvichazadzika? Anenge ma-kore 800 kubva panguva yakararama Isaya, muapostora Pauroakatsanangura kuti nei zvinovimbiswa naMwari zvichitoitika.Pauro akati: ‘Hazvibviri kuti Mwari areve nhema.’ (VaH. 6:18)Sezvo chitubu chisingakwanisi kubudisa mvura yakanaka pa-mwe chete nemvura ine munyu, Jehovha, Tsime rechokwadi, haakwanisi kutaur

In [9]:

import re
import string

def deep_clean(text):
    # Remove UTF-8 encoding characters
    text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\xff]', ' ', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove repetitive patterns of 'CRCR'
    text = re.sub(r'(CRCRCRCRCR)+', ' ', text)

    # Remove repetitive patterns of 'CRCR'
    text = re.sub(r'(CRCR)+', ' ', text)

    # Remove repetitive patterns of 'CRCR'
    text = re.sub(r'(CR)+', ' ', text)

    # Remove repetitive patterns of 'CRCR'
    text = re.sub(r'(CRCRCR)+', ' ', text)

    # Remove repetitive patterns of 'CRCR'
    text = re.sub(r'(CRCRCRCR)+', ' ', text)

    # Remove punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove extra whitespaces
    text = ' '.join(text.split())

    return text

shona_text = deep_clean(shona_text)


In [10]:
shona_text

'NOVEMBER NYAYA DZEKUDZIDZA JANUARY –FEBRUARY MUPROFITA ISAYA akataura kuti Jehovha ‘ndiMwari wecho kwadi’ Shoko rakashandurwa kuti “chokwadi” rinoreva kuti“ameni” Isa mashoko emuzasi “Ameni” zvinorevakuti “ngazviitike” kana kuti “chokwadi” Shoko rekuti “ameni”parinoshandiswa muBhaibheri richitaura nezvaJehovha kanakuti Jesu rinoita kuti tive nechokwadi chekuti zvavachataurazvichaitika Saka zvairehwa naIsaya paaitaura nevaIsraeri ndezvekuti Zvese zvinotaurwa naJehovha zvinotoitika Jehovhaakaratidza izvozvo nekuzadzisa zvese zvaakavimbisa Tinogonawo here kuva nechokwadi chekuti zvatakavimbi swa naJehovha nezveremangwana zvichazadzika Anenge makore kubva panguva yakararama Isaya muapostora Pauroakatsanangura kuti nei zvinovimbiswa naMwari zvichitoitikaPauro akati ‘Hazvibviri kuti Mwari areve nhema’ VaH Sezvo chitubu chisingakwanisi kubudisa mvura yakanaka pamwe chete nemvura ine munyu Jehovha Tsime rechokwadi haakwanisi kutaura nhema Saka tinogona kuvimba zvizere ne Mashoko emuprofita Is

In [11]:
from keras.preprocessing.text import Tokenizer

# Tokenize and preprocess
tokenizer = Tokenizer()
tokenizer.fit_on_texts([shona_text])
total_words = len(tokenizer.word_index) + 1


### 3. Word Embeddings

In [12]:
from gensim.models import Word2Vec

# Train word2vec model with gensim
sentences = [sentence.split() for sentence in shona_text.split('.')]
model_gensim = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
model_gensim.save("word2vec.model")


### 4. RNN Models

##### With its own embedding layer:

In [17]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

model1 = Sequential()
model1.add(Embedding(total_words, 100, input_length=5))
model1.add(Bidirectional(LSTM(150, return_sequences=True)))
model1.add(Dropout(0.2))
model1.add(LSTM(100))
model1.add(Dense(total_words, activation='softmax'))

model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


##### With pre-trained embeddings:

In [18]:
# Load the word2vec model
import numpy as np
model_gensim = Word2Vec.load("word2vec.model")
embedding_matrix = np.zeros((total_words, 100))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = model_gensim.wv[word]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except KeyError:
        # Word not present in gensim model, a zero embedding will be used for this word
        pass


model2 = Sequential()
model2.add(Embedding(total_words, 100, weights=[embedding_matrix], input_length=5, trainable=False))
model2.add(Bidirectional(LSTM(150, return_sequences=True)))
model2.add(Dropout(0.2))
model2.add(LSTM(100))
model2.add(Dense(total_words, activation='softmax'))

model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


### 5. Training & Testing

In [19]:
import keras
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

input_sequences = []
for line in shona_text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        input_sequences.append(n_gram_sequence)

input_sequences = np.array(pad_sequences(input_sequences, maxlen=6, padding='pre'))

# Set batch size and calculate the number of batches
batch_size = 1000
num_batches = len(input_sequences) // batch_size + (len(input_sequences) % batch_size != 0)

# Loop over batches and train the model on each batch
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    batch_sequences = input_sequences[start_idx:end_idx]

    X_batch, y_batch = batch_sequences[:, :-1], batch_sequences[:, -1]
    y_batch = keras.utils.to_categorical(y_batch, num_classes=total_words)

    X_train_batch, X_val_batch, y_train_batch, y_val_batch = train_test_split(X_batch, y_batch, test_size=0.2, random_state=42)

    print(f"Training on batch {i+1}/{num_batches}")
    # Training Model 1
    history1 = model1.fit(X_train_batch, y_train_batch, validation_data=(X_val_batch, y_val_batch), epochs=100, verbose=1)

    # Training Model 2
    history2 = model2.fit(X_train_batch, y_train_batch, validation_data=(X_val_batch, y_val_batch), epochs=100, verbose=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
E

### Model Evaluation

In [23]:
val_loss_model1 = history1.history['val_loss'][-1]
val_loss_model2 = history2.history['val_loss'][-1]

print(f"Validation Loss for Model 1: {val_loss_model1}")
print(f"Validation Loss for Model 2: {val_loss_model2}")


Validation Loss for Model 1: 8.890104293823242
Validation Loss for Model 2: 11.863685607910156


In [25]:
if val_loss_model1 < val_loss_model2:
    best_model = model1
    best_model_name = "best_model1.h5"
else:
    best_model = model2
    best_model_name = "best_model2.h5"

best_model.save(best_model_name)
print(f"Saved the best model as {best_model_name}")


Saved the best model as best_model1.h5


In [27]:
from google.colab import files
files.download('best_model1.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
params_model1 = model1.count_params()
params_model2 = model2.count_params()

print(f"Model 1 has {params_model1} parameters.")
print(f"Model 2 has {params_model2} parameters.")


Model 1 has 3169070 parameters.
Model 2 has 3169070 parameters.


In [None]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

# Load the previously saved model
model = load_model('best_model1.h5')

def predict_next_words(model, tokenizer, text, num_words=1):
    """
    Predict the next set of words using the trained model.

    Args:
    - model (keras.Model): The trained model.
    - tokenizer (Tokenizer): The tokenizer object used for preprocessing.
    - text (str): The input text.
    - num_words (int): The number of words to predict.

    Returns:
    - str: The predicted words.
    """
    for _ in range(num_words):
        # Tokenize and pad the text
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = pad_sequences([sequence], maxlen=5, padding='pre')

        # Predict the next word
        predicted_probs = model.predict(sequence, verbose=0)
        predicted = np.argmax(predicted_probs, axis=-1)

        # Convert the predicted word index to a word
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break

        # Append the predicted word to the text
        text += " " + output_word

    return ' '.join(text.split(' ')[-num_words:])


# Prompt the user for input
user_input = input("Please type five words in Shona: ")

# Predict the next words
predicted_words = predict_next_words(model, tokenizer, user_input, num_words=3)
print(f"The next words might be: {predicted_words}")


Please type five words in Shona: Makadini zvenyu vanofara here vamwe
The next words might be: kataura kuti urambe
