## Exploring raw data

### Scraped tenders from 2merkato.com

In [11]:
import pandas as pd
import csv
import sys

# Rows read per chunk when streaming the large CSV files below.
chunksize = 50_000

# pandas' engine="python" parses through the stdlib csv module, whose default
# per-field size cap is too small for very long text fields, so raise it as
# far as the platform allows. sys.maxsize does not fit in a C long on every
# platform (e.g. 64-bit Windows) and csv.field_size_limit then raises
# OverflowError, so halve the request until one is accepted.
field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 2

9223372036854775807

In [4]:
raw_csv = "../data/raw/tenders_2merkato.csv"

# Stream the raw scrape instead of loading it whole. The python engine is
# used together with on_bad_lines="skip", so malformed rows are dropped
# silently rather than aborting the read.
read_options = {
    "chunksize": chunksize,
    "on_bad_lines": "skip",
    "engine": "python",
}
chunk_iter = pd.read_csv(raw_csv, **read_options)

# Report the shape of every chunk, numbering chunks from 1.
chunk_no = 0
for chunk in chunk_iter:
    chunk_no += 1
    print(f"--- Chunk {chunk_no} ---")
    print("Shape:", chunk.shape)

--- Chunk 1 ---
Shape: (49999, 9)
--- Chunk 2 ---
Shape: (50000, 9)
--- Chunk 3 ---
Shape: (50000, 9)
--- Chunk 4 ---
Shape: (50000, 9)
--- Chunk 5 ---
Shape: (50000, 9)
--- Chunk 6 ---
Shape: (50000, 9)
--- Chunk 7 ---
Shape: (41488, 9)


## Exploring cleaned data

### Cleaned tenders data from 2merkato.com

In [6]:
cleaned_csv = "../data/processed/tenders_english_2merkato.csv"

# Report label -> source column for the raw-vs-cleaned length comparison.
length_columns = {
    "title_len_raw": "Title",
    "title_len_clean": "Title_clean",
    "desc_len_raw": "Description",
    "desc_len_clean": "Description_clean",
}

# Running totals accumulated across chunks.
missing_values = None
length_summaries = []  # per-chunk mean lengths, kept for inspection
length_sums = pd.Series(0.0, index=list(length_columns))
total_rows = 0

chunk_iter = pd.read_csv(
    cleaned_csv,
    chunksize=chunksize,
    engine="python",
    on_bad_lines="skip"
)

for i, chunk in enumerate(chunk_iter, start=1):
    print(f"\n--- Processing Chunk {i} ---")
    print("Shape:", chunk.shape)
    print("Columns:", chunk.columns.tolist())

    # Peek at first rows (only for first chunk)
    if i == 1:
        print("\nSample rows:")
        print(chunk[["Title","Title_clean", "Description_clean"]].head(5))

    # Accumulate missing-value counts per column
    chunk_missing = chunk.isna().sum()
    if missing_values is None:
        missing_values = chunk_missing
    else:
        missing_values = missing_values + chunk_missing

    # Character lengths before vs after cleaning. Missing text is treated as
    # an empty string (length 0); a bare astype(str) would turn NaN into the
    # literal "nan" and count it as three characters. The lengths live in a
    # separate frame so the chunk itself is not mutated.
    chunk_lengths = pd.DataFrame({
        label: chunk[column].fillna("").astype(str).str.len()
        for label, column in length_columns.items()
    })
    length_summaries.append(chunk_lengths.mean())
    length_sums = length_sums + chunk_lengths.sum()
    total_rows += len(chunk)

    # Spot-check random cleaned samples (only first chunk for preview)
    if i == 1:
        print("\nRandom sample of cleaned text:")
        # Cap the sample size so a chunk with fewer than 5 rows does not raise.
        n_preview = min(5, len(chunk))
        print(chunk[["Title_clean", "Description_clean"]].sample(n_preview, random_state=42))

# Overall mean = total characters / total rows. Averaging the per-chunk means
# would over-weight the final, smaller chunk. With zero rows this yields NaN.
avg_lengths = length_sums / total_rows

print("\n==== Final Results ====")
print("Missing values:")
print(missing_values)

print("\nAverage lengths (raw vs cleaned):")
print(avg_lengths)


--- Processing Chunk 1 ---
Shape: (50000, 12)
Columns: ['Title', 'URL', 'Closing Date', 'Published On', 'Region', 'Bidding Status', 'Description', 'TOR Download Link', 'Scrape Timestamp', 'Language', 'Title_clean', 'Description_clean']

Sample rows:
                                               Title  \
0  Ethiopian Public Health Institute Invites Elig...   
1  St. Peter Specialized Hospital Invites Eligibl...   
2  The Federal Democratic Republic of Ethiopia, M...   
3  Woldia University Invites Eligible Bidders for...   
4  SOS Children’s Villages in Ethiopia want to pr...   

                                         Title_clean  \
0  ethiopian public health institute invites elig...   
1  st. peter specialized hospital invites eligibl...   
2  the federal democratic republic of ethiopia, m...   
3  woldia university invites eligible bidders for...   
4  sos children s villages in ethiopia want to pr...   

                                   Description_clean  
0  Invitation to Bid

In [14]:
input_file = "../data/processed/tenders_clean_2merkato.csv"
chunk_size = 50_000  # rows per read; raise or lower to fit available memory

# Tally rows per language over the whole file, loading only the one column
# that is needed. value_counts() leaves out missing values by default, so
# rows without a language are not counted.
lang_counts = {}
language_reader = pd.read_csv(input_file, usecols=["Language"], chunksize=chunk_size)

for chunk in language_reader:
    per_chunk = chunk["Language"].value_counts().to_dict()
    for lang in per_chunk:
        lang_counts.setdefault(lang, 0)
        lang_counts[lang] += per_chunk[lang]

print("Language distribution:")
for lang in lang_counts:
    print(f"{lang}: {lang_counts[lang]}")

Language distribution:
amharic: 208927
english: 132560


In [16]:
# Only load the needed columns
use_cols = ["Title_clean", "Description_clean"]

# Read in chunks
chunksize = 1000  # morethan enough to look at the first five rows
chunks = pd.read_csv("../data/processed/tenders_english_2merkato.csv", usecols=use_cols, chunksize=chunksize)

# Process first chunk only (demo)
for chunk in chunks:
    print(len(chunk))
    print(chunk.head(5))
    break

1000
                                                                                                                                     Title_clean  \
0                            ethiopian public health institute invites eligible bidders for the procurement of laboratory equipment and supplies   
1                                      st. peter specialized hospital invites eligible bidders for the procurement of pharmaceutical store shelf   
2  the federal democratic republic of ethiopia, ministry of finance invite to eligible bidders for the procurement of cooking demonstration kits   
3                                                woldia university invites eligible bidders for the procurement of ict and electronics equipment   
4                                          sos children s villages in ethiopia want to procure sewing and general maintenance and workshop tools   

                                                                                                          

In [18]:
# Preview the first chunk of the Amharic tenders, reusing `use_cols` and
# `chunksize` from the cell above. The reader is opened as a context manager
# so the underlying file handle is closed even though iteration stops early.
with pd.read_csv(
    "../data/processed/tenders_amharic.csv",
    usecols=use_cols,
    chunksize=chunksize,
) as chunks:
    for chunk in chunks:
        print(len(chunk))
        print(chunk.head(5))
        break

1000
                                                                                                     Title_clean  \
0   የሞጣ ከተማ አስተዳደር ከተማ ልማት ቤቶችና ኮንስትራክሽን አገልግሎት ጽ/ቤት ለመኖሪያና ንግድ አገልግሎቶች የተዘጋጁ ቦታዎችን በግልጽ ጨረታ ለተጫራቾች ማስተላለፍ ይፈልጋል   
1  የእብናት ወረዳ ገንዘብና ኢኮኖሚ ልማት ጽ/ቤት ለእብናት ወረዳ ትምህርት ጽ/ቤት አገልግሎት የሚውሉ በህብረት 1ኛ ደረጃ G+1 ባለ 10 የመማሪያ ክፍል ለማስገንባት ይፈልጋል   
2                                                      በአብክመ የወሳኝ ኩነቶች ምዝገባ አገልግሎት ለአገልግሎት የሚውሉ ታብሌት ለመግዛት ይፈልጋል   
3                                                             ጣና ማይክሮ ፋይናንስ ተቋም አ.ማ የመኖሪያ ቤት በሐራጅ ጨረታ ለመሸጥ ይፈልጋል   
4                                               ብርሃን ባንክ አ.ማ የተለያዩ ያገለገሉ ተሽከርካሪዎች ባሉበት ሁኔታ በጨረታ አወዳድሮ ለመሸጥ ይፈልጋል   

                                                                                                                                                                                                                                                                                                          

## Tenders categorization