In [83]:
import pandas as pd
import numpy as np
import re
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

In [84]:
# Load dataset
df = pd.read_csv('diseases.csv')  # columns: 'description', 'disease'

# Verify data
print(f"Original dataset size: {len(df)}")
print("Sample data:")
print(df.sample(3, random_state=42))

Original dataset size: 374
Sample data:
                  description  disease
329      Joto linaniua kabisa  Malaria
33       Mkojo unanuka vibaya      UTI
15   Kutapika na kichefuchefu  Malaria


In [85]:
import random
import re
from functools import reduce

def swahili_spelling_variation(word):
    """Generate common spelling variations for a Swahili word"""
    variations = [word]
    
    # Common Swahili character-level transformations
    transformations = [
        # Vowel substitutions
        lambda s: s.replace('a', random.choice(['aa', 'e', '']), 1) if 'a' in s else s,
        lambda s: s.replace('e', random.choice(['i', 'a', 'ie']), 1) if 'e' in s else s,
        lambda s: s.replace('i', random.choice(['e', 'ee', 'y']), 1) if 'i' in s else s,
        lambda s: s.replace('u', random.choice(['oo', 'w', 'uu']), 1) if 'u' in s else s,
        lambda s: s.replace('o', random.choice(['a', 'ou', 'oo']), 1) if 'o' in s else s,
        
        # Consonant substitutions
        lambda s: s.replace('k', random.choice(['c', 'ch', '']), 1) if 'k' in s else s,
        lambda s: s.replace('ch', random.choice(['sh', 'c', 'ky']), 1) if 'ch' in s else s,
        lambda s: s.replace('sh', random.choice(['ch', 's', 'sy']), 1) if 'sh' in s else s,
        lambda s: s.replace('m', random.choice(['n', 'mb', 'mm']), 1) if 'm' in s else s,
        lambda s: s.replace('n', random.choice(['m', 'ny', 'nn']), 1) if 'n' in s else s,
        lambda s: s.replace('b', random.choice(['p', 'v', 'bh']), 1) if 'b' in s else s,
        lambda s: s.replace('p', random.choice(['b', 'ph', '']), 1) if 'p' in s else s,
        lambda s: s.replace('v', random.choice(['b', 'f', 'vh']), 1) if 'v' in s else s,
        lambda s: s.replace('f', random.choice(['v', 'p', 'ph']), 1) if 'f' in s else s,
        lambda s: s.replace('d', random.choice(['t', 'dh', '']), 1) if 'd' in s else s,
        lambda s: s.replace('t', random.choice(['d', 'th', '']), 1) if 't' in s else s,
        
        # Common Swahili-specific transformations
        lambda s: s.replace('ny', random.choice(['n', 'ni', 'gn']), 1) if 'ny' in s else s,
        lambda s: s.replace('ng', random.choice(['n', 'g', 'nk']), 1) if 'ng' in s else s,
        lambda s: s.replace('gh', random.choice(['g', 'h', '']), 1) if 'gh' in s else s,
        lambda s: s.replace('th', random.choice(['t', 's', '']), 1) if 'th' in s else s,
        lambda s: s.replace('dh', random.choice(['d', 'z', '']), 1) if 'dh' in s else s,
        
        # Common Swahili prefix transformations
        lambda s: re.sub(r'^(ni)(\w+)', lambda m: random.choice(['n', 'mi', 'm'])+m.group(2), s) 
                  if s.startswith('ni') else s,
        lambda s: re.sub(r'^(na)(\w+)', lambda m: random.choice(['n', 'a', 'ma'])+m.group(2), s) 
                  if s.startswith('na') else s,
        lambda s: re.sub(r'^(ku)(\w+)', lambda m: random.choice(['kw', 'ko', 'k'])+m.group(2), s) 
                  if s.startswith('ku') else s,
        lambda s: re.sub(r'^(ki)(\w+)', lambda m: random.choice(['ch', 'ky', 'c'])+m.group(2), s) 
                  if s.startswith('ki') else s,
        lambda s: re.sub(r'^(vi)(\w+)', lambda m: random.choice(['vy', 'v', 'fi'])+m.group(2), s) 
                  if s.startswith('vi') else s,
        
        # Random mutations
        lambda s: s[:random.randint(1, len(s))] + random.choice(['a', 'e', 'i', 'o', 'u']) + s[random.randint(1, len(s)):] 
                  if len(s) > 3 else s,
        lambda s: s[:random.randint(1, len(s)-1)] + s[random.randint(1, len(s)-1)+1:] 
                  if len(s) > 3 else s,
        lambda s: s[:random.randint(1, len(s)-1)] + random.choice(['h', 'm', 'n']) + s[random.randint(1, len(s)-1):] 
                  if len(s) > 3 else s,
    ]
    
    # Apply transformations randomly
    for _ in range(random.randint(1, 3)):  # Apply 1-3 transformations
        if random.random() < 0.7:  # 70% chance to apply a transformation
            transform = random.choice(transformations)
            new_variant = transform(word)
            if new_variant != word:
                variations.append(new_variant)
    
    return list(set(variations))  # Remove duplicates

def augment_text(text, num_variations=5):
    """Generate augmented versions of a text"""
    words = text.split()
    augmented_texts = [text]  # Start with original text
    
    for _ in range(num_variations):
        new_words = []
        for word in words:
            # Get variations for each word
            word_variations = swahili_spelling_variation(word)
            # Choose one variation randomly
            new_words.append(random.choice(word_variations))
        
        # Combine words to form new text
        new_text = ' '.join(new_words)
        if new_text != text:
            augmented_texts.append(new_text)
    
    return list(set(augmented_texts))  # Remove duplicates


In [86]:
def augment_dataset(df, variations_per_sample=500):
    """Create augmented dataset with automatic spelling variations"""
    augmented_data = []
    
    for _, row in df.iterrows():
        original_text = row['description']
        variations = augment_text(original_text, variations_per_sample)
        
        for text in variations:
            augmented_data.append({
                'description': text,
                'disease': row['disease'],
                'is_original': (text == original_text)
            })
    
    return pd.DataFrame(augmented_data)

# Create augmented dataset
augmented_df = augment_dataset(df, variations_per_sample=10)

print(f"Original dataset size: {len(df)}")
print(f"Augmented dataset size: {len(augmented_df)}")
print(f"Number of variations per sample: {len(augmented_df)/len(df):.1f}x")

Original dataset size: 374
Augmented dataset size: 2060
Number of variations per sample: 5.5x


In [87]:
import re

def syllable_tokenizer(text):
    vowels = "aeiou"
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # remove punctuation
    words = text.split()
    syllables = []

    for word in words:
        i = 0
        w_len = len(word)
        word_syllables = []
        while i < w_len:
            # If it's a vowel, it's a syllable
            if word[i] in vowels:
                if i + 1 < w_len and word[i + 1] in vowels:
                    word_syllables.append(word[i])
                    i += 1
                else:
                    word_syllables.append(word[i])
                    i += 1
                continue

            # Consonant + Vowel
            if i + 1 < w_len and word[i+1] in vowels:
                # Check for a nasal+consonant like 'mw', 'ny'
                if word[i:i+2] in ['mw', 'ny', 'ng', 'nd', 'mb', 'nj', 'sy', 'sh']:
                    if i + 2 < w_len and word[i+2] in vowels:
                        word_syllables.append(word[i:i+3])
                        i += 3
                        continue
                word_syllables.append(word[i:i+2])
                i += 2
                continue

            # Consonant + Vowel + Consonant (like mwa)
            if i + 2 < w_len and word[i+1] in vowels and word[i+2] not in vowels:
                word_syllables.append(word[i:i+3])
                i += 3
                continue

            # Default to one letter (backup case)
            word_syllables.append(word[i])
            i += 1

        syllables.extend(word_syllables)

    return syllables


In [88]:
test_text = "naumwa kichwa na ninahisi kichwani"
print("\nImproved Syllable tokenization:")
print(test_text)
print("→", syllable_tokenizer(test_text))
vectorizer = TfidfVectorizer(
    tokenizer=syllable_tokenizer,
    lowercase=True
)


Improved Syllable tokenization:
naumwa kichwa na ninahisi kichwani
→ ['na', 'u', 'm', 'wa', 'ki', 'c', 'h', 'wa', 'na', 'ni', 'na', 'hi', 'si', 'ki', 'c', 'h', 'wa', 'ni']


In [89]:
# Split augmented data
X = augmented_df['description']
y = augmented_df['disease']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

print(f"\nTraining samples: {len(X_train)}")
print(f"Test samples: {len(X_test)}")


Training samples: 1648
Test samples: 412


In [90]:
# Create pipeline with syllable-level features
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', CalibratedClassifierCV(
        LinearSVC(
            class_weight='balanced',
            max_iter=1000,
            random_state=42
        ),
        n_jobs=-1
    ))
])

# Train model
print("\nTraining model...")
pipeline.fit(X_train, y_train)

# Evaluate
y_pred = pipeline.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save model
joblib.dump(pipeline, 'swahili_disease_predictor_syllable.pkl')


Training model...





Classification Report:
              precision    recall  f1-score   support

     Malaria       0.81      0.85      0.83       136
     Typhoid       0.87      0.82      0.84       127
         UTI       0.88      0.87      0.88       149

    accuracy                           0.85       412
   macro avg       0.85      0.85      0.85       412
weighted avg       0.85      0.85      0.85       412



['swahili_disease_predictor_syllable.pkl']

In [None]:
disease_model = joblib.load('swahili_disease_predictor_syllable.pkl')

In [91]:
def predict_with_confidence(text, threshold=0.6):
    """Predict disease with syllable pattern recognition"""
    try:
        # Get probabilities
        proba = disease_model.predict_proba([text])[0]
        classes = disease_model.classes_
        
        # Get top prediction
        max_idx = np.argmax(proba)
        top_class = classes[max_idx]
        top_prob = proba[max_idx]
        
        # Check confidence threshold
        if top_prob >= threshold:
            return {
                'disease': top_class,
                'confidence': float(top_prob),
                'message': f"Ugonjwa unaotambuliwa: {top_class} (uhakiki: {top_prob:.1%})",
                'syllables': syllable_tokenizer(text)
            }
        else:
            return {
                'disease': None,
                'confidence': float(top_prob),
                'message': "Haujatambuliwa ugonjwa wowote kutokana na maelezo yako.",
                'syllables': syllable_tokenizer(text)
            }
    except Exception as e:
        return {
            'disease': None,
            'confidence': 0.0,
            'message': f"Hitilafu katika uchambuzi: {str(e)}",
            'syllables': []
        }

# Test prediction
test_phrases = [
    "headache",
    "ninaumwa kichwani",
    "nahisi kizunguzung",
    "sina njaa leo",
    "kukojoa kila baada ya dakika tano"
]

print("\nPrediction Examples:")
for phrase in test_phrases:
    result = predict_with_confidence(phrase)
    print(f"\nInput: '{phrase}'")
    print(f"Syllables: {result['syllables']}")
    print(f"Result: {result['message']}")
    print(f"Confidence: {result['confidence']:.2f}")


Prediction Examples:

Input: 'headache'
Syllables: ['he', 'a', 'da', 'c', 'he']
Result: Ugonjwa unaotambuliwa: Malaria (uhakiki: 79.8%)
Confidence: 0.80

Input: 'ninaumwa kichwani'
Syllables: ['ni', 'na', 'u', 'm', 'wa', 'ki', 'c', 'h', 'wa', 'ni']
Result: Ugonjwa unaotambuliwa: UTI (uhakiki: 64.6%)
Confidence: 0.65

Input: 'nahisi kizunguzung'
Syllables: ['na', 'hi', 'si', 'ki', 'zu', 'n', 'gu', 'zu', 'n', 'g']
Result: Ugonjwa unaotambuliwa: Malaria (uhakiki: 97.7%)
Confidence: 0.98

Input: 'sina njaa leo'
Syllables: ['si', 'na', 'n', 'ja', 'a', 'le', 'o']
Result: Ugonjwa unaotambuliwa: Typhoid (uhakiki: 80.4%)
Confidence: 0.80

Input: 'kukojoa kila baada ya dakika tano'
Syllables: ['ku', 'ko', 'jo', 'a', 'ki', 'la', 'ba', 'a', 'da', 'ya', 'da', 'ki', 'ka', 'ta', 'no']
Result: Ugonjwa unaotambuliwa: UTI (uhakiki: 91.8%)
Confidence: 0.92
