In [1]:
import os, sys

# When the kernel starts inside a folder whose path ends in "notebooks",
# step up one level so the relative paths used below are resolved from the
# parent folder.
if os.getcwd().endswith("notebooks"):
    os.chdir("..")

print("Working with directory:", os.getcwd())

# Make the src/ folder importable. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
src_dir = os.path.join(os.getcwd(), "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)
print("Python path updated")


Working with directory: /Users/mac/ML_PLAYGROUND/grant-ml
Python path updated


In [2]:
#load preprocessing pipeline

from preprocessing.full_preprocessor import build_preprocessor


# Columns handed to build_preprocessor as the numeric inputs.
numeric_cols = [
    "past_grants_total",
    "annual_budget",
    "total_revenue",
    "annual_budget_last_year",
    "cash_reserves",
    "monthly_operating_expenses",
    "years_active",
]

# Columns handed to build_preprocessor as the free-text inputs.
text_cols = [
    "org_background",
    "project_description",
    "track_record",
    "mission_statement",
]

# Build the preprocessing object and leave it as the cell's last expression
# so the notebook renders its representation.
preprocessor = build_preprocessor(numeric_cols, text_cols)
preprocessor



0,1,2
,transformers,"[('numeric', ...), ('ratios', ...), ...]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,numeric_columns,"['past_grants_total', 'annual_budget', ...]"

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,text_columns,"['org_background', 'project_description', ...]"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [3]:
import pandas as pd

# Read the raw grants table (path is relative to the current working
# directory) and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/raw/grants_raw.csv")
df.head(n=5)

Unnamed: 0,past_grants_total,annual_budget,total_revenue,annual_budget_last_year,cash_reserves,monthly_operating_expenses,years_active,org_background,project_description,track_record,mission_statement,suitability_class
0,"$500,000","KES 3,000,000",4500000,"KES 2,500,000","KES 90,000","KES 60,000",4,Community-based organisation supporting fish f...,Strengthen post-harvest handling and reduce lo...,Implemented youth and BMU training.,Empower coastal communities through aquaculture.,High
1,250000,1500000,"KES 1,800,000",1200000,45000,35000,3,Women-led fisheries business support.,Training women on improved fish processing.,Delivered women empowerment projects.,Enhance women’s participation in fisheries.,Medium
2,100000,"KES 900,000",1200000,800000,30000,20000,2,Youth livelihood support group.,Starting small tilapia demonstration ponds.,Delivered youth programmes.,Support youth livelihoods.,Low


In [4]:
# Fit the preprocessor on the numeric + text columns and transform them in
# a single step; the trailing expression shows the resulting matrix shape.
Xt = preprocessor.fit_transform(df.loc[:, [*numeric_cols, *text_cols]])
Xt.shape



(3, 110)

In [7]:
import pandas as pd

# Hand-written four-row sample. Each tuple is one organisation and its
# values follow the order of the `columns` list further down.
df = pd.DataFrame(
    data=[
        (
            100000, 500000, 600000, 450000, 100000, 40000, 5,
            "We work with women in fisheries communities",
            "This proposal focuses on sustainable fisheries training",
            "5 years training communities",
            "Women empowerment",
            "high",
        ),
        (
            200000, 800000, 900000, 780000, 150000, 70000, 10,
            "Climate-smart agriculture for youth farmers",
            "Proposal aims to improve agriculture productivity",
            "Strong record in agriculture",
            "Climate resilience",
            "medium",
        ),
        (
            50000, 200000, 300000, 150000, 50000, 30000, 3,
            "Empowering vulnerable groups across Kenya",
            "Community-led empowerment project",
            "Worked in 12 counties",
            "Community empowerment",
            "low",
        ),
        (
            400000, 1000000, 1200000, 950000, 200000, 90000, 12,
            "Large-scale resilience and blue economy program",
            "Market systems and blue economy growth",
            "Blue economy projects for 10 years",
            "Blue economy development",
            "high",
        ),
    ],
    columns=[
        "past_grants_total",
        "annual_budget",
        "total_revenue",
        "annual_budget_last_year",
        "cash_reserves",
        "monthly_operating_expenses",
        "years_active",
        "org_background",
        "project_description",
        "track_record",
        "mission_statement",
        "suitability_class",  # TARGET variable
    ],
)

# Write the sample without the index column, then preview it.
df.to_csv(path_or_buf="data/train.csv", index=False)
df.head(n=5)




Unnamed: 0,past_grants_total,annual_budget,total_revenue,annual_budget_last_year,cash_reserves,monthly_operating_expenses,years_active,org_background,project_description,track_record,mission_statement,suitability_class
0,100000,500000,600000,450000,100000,40000,5,We work with women in fisheries communities,This proposal focuses on sustainable fisheries...,5 years training communities,Women empowerment,high
1,200000,800000,900000,780000,150000,70000,10,Climate-smart agriculture for youth farmers,Proposal aims to improve agriculture productivity,Strong record in agriculture,Climate resilience,medium
2,50000,200000,300000,150000,50000,30000,3,Empowering vulnerable groups across Kenya,Community-led empowerment project,Worked in 12 counties,Community empowerment,low
3,400000,1000000,1200000,950000,200000,90000,12,Large-scale resilience and blue economy program,Market systems and blue economy growth,Blue economy projects for 10 years,Blue economy development,high


In [8]:
import pandas as pd
import numpy as np
import random

# --------------------------------------------
# Helper functions
# --------------------------------------------

def random_text(topics, extra=""):
    """Combine two randomly drawn topics and an optional suffix into one string.

    Two entries are drawn from ``topics`` without replacement and formatted
    as "<first> <second> <extra>". Leading and trailing whitespace is
    stripped, so an empty ``extra`` leaves no trailing space.
    """
    first, second = random.sample(topics, 2)
    sentence = "{} {} {}".format(first, second, extra)
    return sentence.strip()


# --------------------------------------------
# THEMES/TOPICS FOR TEXT FIELDS
# --------------------------------------------

# Phrases used to compose text fields for the "high" suitability class.
topics_high = [
    "women empowerment",
    "fisheries sustainability",
    "blue economy",
    "community training",
    "market systems",
    "resilience",
    "strong governance",
    "scalable model",
    "evidence based",
]

# Phrases used to compose text fields for the "medium" suitability class.
topics_medium = [
    "youth training",
    "agriculture support",
    "local markets",
    "capacity building",
    "pilot project",
    "moderate experience",
    "county partnerships",
    "community outreach",
]

# Phrases used to compose text fields for the "low" suitability class.
topics_low = [
    "limited experience",
    "small community group",
    "unstructured approach",
    "basic proposal",
    "unclear outcomes",
    "no prior grant",
    "weak systems",
    "minimal accountability",
]


# --------------------------------------------
# Generate Numeric Features
# --------------------------------------------

def generate_numeric_profile(level):
    """Return randomly drawn numeric features for one suitability level.

    Parameters
    ----------
    level : str
        One of "high", "medium" or "low".

    Returns
    -------
    dict
        Maps each numeric feature name to an integer drawn with
        ``random.randint`` (both bounds inclusive) from the range configured
        for ``level``.

    Raises
    ------
    ValueError
        If ``level`` is not one of the known levels. Previously such a call
        fell through every branch and returned None, which only surfaced
        later as a TypeError when the result was unpacked into a row.
    """
    # Inclusive (low, high) bounds per feature. Features are listed in the
    # order the values are drawn, so a seeded run consumes random numbers in
    # the same sequence as before.
    ranges_by_level = {
        "high": {
            "past_grants_total": (300000, 3000000),
            "annual_budget": (500000, 5000000),
            "total_revenue": (600000, 6000000),
            "annual_budget_last_year": (500000, 5000000),
            "cash_reserves": (100000, 1000000),
            "monthly_operating_expenses": (50000, 500000),
            "years_active": (6, 20),
        },
        "medium": {
            "past_grants_total": (50000, 500000),
            "annual_budget": (200000, 1000000),
            "total_revenue": (250000, 1200000),
            "annual_budget_last_year": (200000, 1000000),
            "cash_reserves": (20000, 200000),
            "monthly_operating_expenses": (20000, 150000),
            "years_active": (3, 10),
        },
        "low": {
            "past_grants_total": (0, 100000),
            "annual_budget": (50000, 200000),
            "total_revenue": (60000, 250000),
            "annual_budget_last_year": (40000, 200000),
            "cash_reserves": (5000, 40000),
            "monthly_operating_expenses": (10000, 50000),
            "years_active": (1, 5),
        },
    }

    try:
        ranges = ranges_by_level[level]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as a list.
        raise ValueError(
            f"Unknown suitability level {level!r}; "
            f"expected one of {sorted(ranges_by_level)}"
        ) from None

    return {
        feature: random.randint(low, high)
        for feature, (low, high) in ranges.items()
    }


# --------------------------------------------
# Generate Text Fields
# --------------------------------------------

def generate_text_profile(level):
    """Build the four free-text fields for one suitability level.

    "high" and "medium" use their own topic lists; every other value falls
    back to the low-suitability topics. Each field is produced by one
    ``random_text`` call, in the order listed below.
    """
    topics = (
        topics_high if level == "high"
        else topics_medium if level == "medium"
        else topics_low
    )

    # (field name, suffix appended to the generated phrase)
    field_suffixes = (
        ("org_background", ""),
        ("project_description", "project"),
        ("track_record", "experience"),
        ("mission_statement", ""),
    )
    return {
        field: random_text(topics, extra=suffix)
        for field, suffix in field_suffixes
    }


# --------------------------------------------
# Generate Final DataFrame
# --------------------------------------------

# Fix the seed so a fresh kernel run regenerates exactly the same dataset.
# Both the stdlib `random` draws and the pandas shuffle below are seeded.
SEED = 42
SAMPLES_PER_CLASS = 30  # 3 classes x 30 rows -> balanced 90-row dataset
random.seed(SEED)

data = []

levels = ["high", "medium", "low"]

for level in levels:
    for _ in range(SAMPLES_PER_CLASS):
        # Numeric features are drawn before text features for every row;
        # keep this order so the seeded sequence stays stable.
        numeric = generate_numeric_profile(level)
        text = generate_text_profile(level)
        row = {**numeric, **text, "suitability_class": level}
        data.append(row)

df = pd.DataFrame(data)

# Shuffle dataset (seeded) so the classes are interleaved reproducibly
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Save -- note this overwrites any existing data/train.csv
df.to_csv("data/train.csv", index=False)

df.head(), df["suitability_class"].value_counts()


(   past_grants_total  annual_budget  total_revenue  annual_budget_last_year  \
 0             268006         243904         922807                   763538   
 1              52822         310032         427295                   624929   
 2              39839         180979         152405                   104941   
 3             194475         789979         741731                   659107   
 4              58236         143429         232839                   106543   
 
    cash_reserves  monthly_operating_expenses  years_active  \
 0          60099                      132932             7   
 1         115134                       42136            10   
 2          11651                       13734             5   
 3          76957                      133212             8   
 4          32002                       13174             3   
 
                              org_background  \
 0         youth training community outreach   
 1           local markets capacity buildi