In [1]:
import re

## Fetch the data from MongoDB database

In [2]:
from src.mongo_db import get_collection


In [3]:
filter = {"category": "text"}
swahili_collection = get_collection('web_scraper', 'swahili_text')

In [4]:
def get_data():
    swa_docs = swahili_collection.find({"category": "text"})
    return list(swa_docs)

In [5]:
docs_list = get_data()

In [6]:
content_nested_list = [doc.get("content") for doc in docs_list]

In [38]:
content_list = [item for sublist in docs_list for item in sublist.get("content", [])]

In [11]:
content_list

['<p>\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0 \xa0\xa0\xa0\xa0\r\n\xa0<img border="0" height="82" src="coconut.gif" width="70"/> <img border="0" height="50" src="moto.gif" width="200"/><img border="0" height="82" src="coconut.gif" width="70"/> </p>',
 '<p style="margin-top: 0; margin-bottom: 0">\n<a href="index.htm">HOME</a></p>',
 '<p style="margin-top: 0; margin-bottom: 0"><a href="methali.htm">\n<font size="2">PROVERBS</font></a></p>',
 '<p style="margin-top: 0; margin-bottom: 0"><a href="dishes.htm">\n<font size="2">CUISINE</font></a></p>',
 '<p style="margin-top: 0; margin-bottom: 0"><a href="tarab.htm">\n<font size="2">TARAAB</font></a></p>',
 '<p style="margin-top: 0; margin-bottom: 0"><a href="mashairi.htm">\n<font size="2">MASHAIRI</font></a></p>',
 '<p style="margin-top: 0; margin-bottom: 0"><a href="dua.htm"><font size="2">\r\nDUA</font></a></p>',
 '<p style="margin-top: 0; margin-bottom: 0">

In [14]:
text = content_list[-1]

In [15]:
text

'<p><font face="Courier New" size="2"><br/>\r\n          Mtoto mwenye kuchezea kivuli chake cha taa<br/>\r\n             atakojoa kitandani.<br/>\r\n               A child who plays with his shadow cast by a<br/>\r\n               lamp will wet his bed.<br/>\r\n          Kuuza chumvi usiku ni nuksi kwa mwenye duka.<br/>\r\n               Selling salt at night brings ill luck to the<br/>\r\n               shopkeeper. (This belief is so deep-rooted) <br/>\n</font><!--msnavigation--></p>'

## Preprocess the data to ensure high quality for NLP

### Define the functions to clean the data

In [16]:
def remove_html_tags(string):
    clean = re.compile('<.*?>')
    cleantext = re.sub(clean, '', string)
    return cleantext

def remove_special_characters(string):
    return ''.join(letter for letter in string if letter.isalnum() or letter == ' ')

def to_lower_case(string):
    return string.lower()

def remove_numbers(string):
    return re.sub(r'\d+', '', string)

def remove_punctuation(string):
    return re.sub(r'[^\w\s]', '', string)

def remove_white_space(string):
    return string.strip()

def preprocess(string):
    string = remove_html_tags(string)
    string = remove_special_characters(string)
    string = to_lower_case(string)
    string = remove_numbers(string)
    string = remove_punctuation(string)
    string = remove_white_space(string)
    return string

In [39]:
preprocess(text)

'mtoto mwenye kuchezea kivuli chake cha taa             atakojoa kitandani               a child who plays with his shadow cast by a               lamp will wet his bed          kuuza chumvi usiku ni nuksi kwa mwenye duka               selling salt at night brings ill luck to the               shopkeeper this belief is so deeprooted'

In [18]:
cleaned_content_list = [preprocess(text) for text in content_list]

In [31]:
cleaned_content_list

['',
 'home',
 'proverbs',
 'cuisine',
 'taraab',
 'mashairi',
 'dua',
 'riddles',
 'pictures',
 'taraab lyrics',
 'superstitions',
 'visa na mikasa',
 'music',
 'articles',
 'qasweeda',
 'superstitions          kula gizani ni kula na shetani              to eat in the dark is to eat with the devil              this belief discourages eating in the dark for              obvious reasons          kushona nguo mwilini kunaleta ufukara tabia            hii aidha inajongeza kifo maana inakuwa kama            mtu anajishonea sanda yake mwenyewe              to mend a garment which is on ones body              brings poverty it also suggests preparing              ones own shroud which betokens death          kufagia usiku kunakimbiza baraka              to sweep at night is to chase away blessings              this belief contains some truth in that one              may sweep up and throw away something of              value unwittinglykutembea hali ya kuwa mtu amevaa kiatu            kimoja

## Save the cleaned data back to the database Mongodb

In [35]:
new_values = {"$set": {"cleaned_content": cleaned_content_list}}

In [36]:
swahili_collection.update_one(filter, new_values)

UpdateResult({'n': 1, 'nModified': 0, 'ok': 1.0, 'updatedExisting': True}, acknowledged=True)