In [2]:
# import nltk stopwords
from nltk.corpus import stopwords
# Tokenize text into words and remove English stopwords
from nltk.tokenize import word_tokenize

## Remove English words

### Method 1 - Using NLTK

In [10]:

# Sample sentence: Swahili with an English phrase embedded in the middle.
text = "Huyu jamaa anaongea this is the work of the guy in the pub Kiswahili vizuri sana, lakini pia anajua Kiingereza."

In [4]:
import nltk

# Download necessary NLTK resources if you haven't already.
# 'stopwords' backs stopwords.words('english'); 'punkt' is presumably the
# tokenizer data word_tokenize relies on — TODO confirm for your NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

def extract_swahili_words_nltk(text):
    """Lower-case and tokenize ``text``, then drop NLTK's English stopwords.

    Only tokens on the English stopword list are removed; every other token
    (including non-stopword English words and punctuation tokens) is kept.

    Args:
        text: The sentence to filter.

    Returns:
        The surviving lower-cased tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    # Build the stopword collection once. The original re-evaluated
    # stopwords.words('english') for every token and searched it as a list.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in english_stopwords)

# Apply the stopword filter to the notebook-level `text` and show the result.
filtered_text = extract_swahili_words_nltk(text)
print(filtered_text)  


huyu jamaa anaongea work guy pub kiswahili vizuri sana , lakini pia anajua kiingereza .


[nltk_data] Downloading package stopwords to /home/hilla/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/hilla/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Method 2 - Using langdetect

In [5]:
import langdetect  # pip install langdetect

def extract_swahili_words_langdetect(text):
    """Keep the whitespace-separated tokens that langdetect labels as Swahili.

    Each token is classified on its own, so punctuation stays attached to the
    token it was written with.

    Args:
        text: The sentence to filter.

    Returns:
        The tokens detected as 'sw', in their original order, joined by
        single spaces.
    """
    swahili_words = []
    for word in text.split():
        try:
            if langdetect.detect(word) == 'sw':
                swahili_words.append(word)
        except langdetect.LangDetectException:
            # Raised when a token has no usable features (e.g. digits or bare
            # punctuation). Such a token cannot be confirmed as Swahili, so it
            # is dropped instead of aborting the whole call.
            continue
    return " ".join(swahili_words)

# Rebinds the notebook-level `text` to an all-Swahili sentence before filtering.
text = "Huyu jamaa anaongea Kiswahili vizuri sana, lakini pia anajua Kiingereza."
filtered_text = extract_swahili_words_langdetect(text)
print(filtered_text)


Kiswahili lakini Kiingereza.


### Method 3 - Using langid

In [7]:
# pip install langid
import langid

def extract_swahili_words_langid(text):
    """Return the tokens of ``text`` whose langid label is 'sw'.

    The text is split on whitespace and every token is classified
    individually; matching tokens are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        # classify() returns a sequence whose first item is the language code.
        label = langid.classify(token)[0]
        if label == 'sw':
            kept.append(token)
    return " ".join(kept)

# Rebinds the notebook-level `text`, then runs the langid-based filter on it.
text = "Huyu jamaa anaongea Kiswahili vizuri sana, lakini pia anajua Kiingereza."
filtered_text = extract_swahili_words_langid(text)
print(filtered_text)




### Method 4 - Using fasttext

In [9]:
# pip install fasttext
import fasttext

def extract_swahili_words_fasttext(text, model_path='lid.176.bin'):
    """Keep the whitespace-separated tokens that fastText labels as Swahili.

    Args:
        text: The sentence to filter.
        model_path: Path to the fastText language-identification model file.
            Defaults to 'lid.176.bin' in the working directory.

    Returns:
        The tokens whose top predicted label is '__label__sw', joined by
        single spaces. Empty or whitespace-only input returns "" without
        loading the model.

    Raises:
        ValueError: Propagated from fasttext.load_model when the model file
            cannot be opened.
    """
    words = text.split()
    if not words:
        # Nothing to classify, so skip the expensive model load.
        return ""
    # Load the model once per call; the original reloaded it from disk for
    # every single word inside the comprehension.
    model = fasttext.load_model(model_path)
    swahili_words = [word for word in words if model.predict(word)[0][0] == '__label__sw']
    return " ".join(swahili_words)

# Rebinds the notebook-level `text`, then runs the fastText-based filter on it.
text = "Huyu jamaa anaongea Kiswahili vizuri sana, lakini pia anajua Kiingereza."
filtered_text = extract_swahili_words_fasttext(text)
print(filtered_text)




ValueError: lid.176.bin cannot be opened for loading!

In [11]:
# Step 1: Install the langdetect library
# You can do this in your terminal using the command: pip install langdetect

# Step 2: Import the detect function
from langdetect import detect

# Step 3: Create a function to detect if a string is in English
def is_english(s):
    """Return True when langdetect's top guess for ``s`` is English.

    Args:
        s: The string to classify.

    Returns:
        True if ``detect(s)`` returns 'en'; False if it returns another
        language or if detection fails with LangDetectException.
    """
    # Imported locally so this function does not depend on another cell
    # having imported the exception class.
    from langdetect import LangDetectException

    try:
        return detect(s) == 'en'
    except LangDetectException:
        # Catch only the detector's own failure. The original bare `except:`
        # also swallowed KeyboardInterrupt and genuine programming errors.
        return False

# Step 4: Use a list comprehension to filter your data
# NOTE(review): `text` is not defined in this cell; it is whatever value the
# most recently executed cell bound to it, so the result depends on run order.
english_data = [s for s in text.split() if is_english(s)]

In [12]:
# Display the tokens that is_english() accepted.
english_data

['this', 'the', 'of', 'the', 'the']

In [14]:
import langdetect

def filter_swahili_text(text_list, confidence_threshold=0.8):
    """Split texts into confidently-Swahili ones and everything else.

    Args:
        text_list: Iterable of strings to classify.
        confidence_threshold: Probability that the top 'sw' guess must exceed
            for a text to count as Swahili.

    Returns:
        A ``(swahili_texts, english_rich_texts)`` tuple of lists. A text goes
        into the first list when its top-ranked language is 'sw' with a
        probability above the threshold, and into the second list otherwise.
        Texts on which detection raises LangDetectException appear in neither.
    """
    swahili_texts = []
    english_rich_texts = []
    for candidate in text_list:
        try:
            # Run detection once. The original called detect() and then
            # detect_langs() separately, doubling the work and letting the
            # language and the probability come from two different runs.
            ranked = langdetect.detect_langs(candidate)
        except langdetect.LangDetectException:
            continue  # Skip texts that can't be detected
        # An empty ranking means no language was identified; treat that as
        # "not Swahili" instead of indexing into an empty list.
        top = ranked[0] if ranked else None
        if top is not None and top.lang == 'sw' and top.prob > confidence_threshold:
            swahili_texts.append(candidate)
        else:
            english_rich_texts.append(candidate)
    return swahili_texts, english_rich_texts

# Example usage: one Swahili text, one English text, and one mixed text.
text_list = ["Habari za leo?", "This is an English sentence.", "Ninaongea na naenda work to stay in Kiswahili na Kiingereza."]
swahili_texts, english_rich_texts = filter_swahili_text(text_list)
print(swahili_texts)  
print(english_rich_texts)  


['Habari za leo?', 'Ninaongea na naenda work to stay in Kiswahili na Kiingereza.']
['This is an English sentence.']


# Get all the English words from the web2 dictionary and remove them from the Swahili text

In [4]:
from english_words import get_english_words_set

def remove_english_words(swahili_text):
    """Drop every token of ``swahili_text`` found in the 'web2' English word set.

    Tokens are whitespace-separated. Each one is compared case-insensitively
    with surrounding punctuation ignored. Surviving tokens are returned
    exactly as written, punctuation included.

    Args:
        swahili_text: Text that may contain English words.

    Returns:
        The tokens not found in the English word set, joined by single spaces.
    """
    import string  # stdlib; imported locally so no other cell is required

    web2lowerset = get_english_words_set(['web2'], lower=True)

    kept_words = []
    for word in swahili_text.split():
        # Look up the bare word so tokens like "why," or "this." are still
        # recognised. The original compared the token with its punctuation
        # attached, which could never match a dictionary entry.
        bare_word = word.strip(string.punctuation).lower()
        if bare_word not in web2lowerset:
            kept_words.append(word)
    return " ".join(kept_words)

# Example usage: a Swahili sentence with a few English words mixed in.
swahili_text_with_english = "Huyu jamaa anaongea this Kiswahili and him vizuri sana, why lakini pia anajua Kiingereza."
filtered_swahili_text = remove_english_words(swahili_text_with_english)
print(filtered_swahili_text)


Huyu jamaa anaongea vizuri sana, lakini anajua Kiingereza.
