In [None]:
import pandas as pd
import csv
import sys

# Raise the csv module's per-field size cap as high as the platform allows.
# csv.field_size_limit() stores the value in a C long, so sys.maxsize raises
# OverflowError where long is 32-bit (e.g. 64-bit Windows). Halving walks
# 2**63 - 1 down to 2**31 - 1 exactly, so the largest accepted value is used.
field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 2

file_in = "../data/raw/tenders.csv"

# Try reading with Python's CSV engine (more tolerant than default)
df = pd.read_csv(file_in, engine="python")
# Check basic info
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
print("Sample:")
print(df.head())

Shape: (23, 9)
Columns: ['Title', 'URL', 'Closing Date', 'Published On', 'Region', 'Bidding Status', 'Description', 'TOR Download Link', 'Scrape Timestamp']
Sample:
                                               Title  \
0  The National Veterinary Institute (NVI) here n...   
1  The Development Bank of Ethiopia (DBE) like to...   
2  The Hailemariam & Roman Foundation has receive...   
3  Ipas Ethiopia Invites Eligible Bidders for the...   
4  Ethiopian Agricultural Transformation Institut...   

                                                 URL  \
0  https://tender.2merkato.com/tenders/67fcf6ab28...   
1  https://tender.2merkato.com/tenders/67fce9ccd8...   
2  https://tender.2merkato.com/tenders/67fcdb00d2...   
3  https://tender.2merkato.com/tenders/67fccf5c62...   
4  https://tender.2merkato.com/tenders/67fcc28eea...   

                                        Closing Date  Published On  \
0                             June 11, 2025 11:00 AM  Apr 13, 2025   
1                    

In [None]:
# Load cleaned CSV
df = pd.read_csv("../data/processed/tenders_clean.csv")

# Inspect structure
print("Columns:", df.columns.tolist())
print("Shape:", df.shape)

# Peek at first rows
print("\nSample rows:")
print(df[["Title", "Title_clean", "Description", "Description_clean"]].head(10))

# Check for missing values
print("\nMissing values:")
print(df.isna().sum())

# Check average lengths (before vs after cleaning).
# astype(str) turns a missing value into the literal string "nan" (length 3),
# which would be counted as real text. Masking those cells back to NaN lets
# .mean() skip them, so the averages describe only rows that have text.
length_columns = {
    "title_len_raw": "Title",
    "title_len_clean": "Title_clean",
    "desc_len_raw": "Description",
    "desc_len_clean": "Description_clean",
}
for length_col, text_col in length_columns.items():
    text = df[text_col]
    df[length_col] = text.astype(str).str.len().where(text.notna())

print("\nAverage lengths (raw vs cleaned):")
print(df[list(length_columns)].mean())

# Spot-check random cleaned samples.
# sample(n) raises when n exceeds the row count, so cap it for small files.
print("\nRandom sample of cleaned text:")
print(df[["Title_clean", "Description_clean"]].sample(min(5, len(df)), random_state=42))

Columns: ['Title', 'URL', 'Closing Date', 'Published On', 'Region', 'Bidding Status', 'Description', 'TOR Download Link', 'Scrape Timestamp', 'Title_clean', 'Description_clean']
Shape: (23, 11)

Sample rows:
                                               Title  \
0  The National Veterinary Institute (NVI) here n...   
1  The Development Bank of Ethiopia (DBE) like to...   
2  The Hailemariam & Roman Foundation has receive...   
3  Ipas Ethiopia Invites Eligible Bidders for the...   
4  Ethiopian Agricultural Transformation Institut...   
5  Ministry of Water and Energy invite to eligibl...   
6  The Ministry of Agriculture (MoA) now invites ...   
7  The National Veterinary Institute (NVI) here n...   
8  The Hailemariam & Roman Foundation has receive...   
9  The National Veterinary Institute (NVI) here n...   

                                         Title_clean  \
0  the national veterinary institute nvi here now...   
1  the development bank of ethiopia dbe like to r...   
2  the 

In [None]:
# Print every text cell in full (None disables column-width truncation)
pd.options.display.max_colwidth = None
# Optional: cap the number of rows pandas prints to keep the output short
pd.options.display.max_rows = 10

# First five rows of the two cleaned text columns
cleaned_cols = ["Title_clean", "Description_clean"]
print(df[cleaned_cols].head())


                                                                                                                                                                                                                                   Title_clean  \
0  the national veterinary institute nvi here now invites sealed bids from eligible bidders all over the world for the procurement of consultancy service for feasibility study for the expansion of veterinary vaccine manufacturing facility   
1                          the development bank of ethiopia dbe like to receive expressions of interest eois from legally formed consulting firms to conduct consultancy service for the development of wholesale islamic financing mechanisms   
2                                                        the hailemariam roman foundation has received a financing from agence francaise de developement afd , intends to procure consultancy service to undertake agrarian analysis study aas   
3                               

In [None]:
input_file = "../data/processed/tenders_clean.csv"
chunk_size = 50_000  # rows read per chunk; raise or lower as needed

# Running total of rows seen per language, accumulated across all chunks
lang_counts = {}

# Stream only the "Language" column so the whole file is never held in memory
reader = pd.read_csv(input_file, usecols=["Language"], chunksize=chunk_size)
for chunk in reader:
    per_chunk = chunk["Language"].value_counts().to_dict()
    for language, n_rows in per_chunk.items():
        lang_counts.setdefault(language, 0)
        lang_counts[language] += n_rows

print("Language distribution:")
for lang, count in lang_counts.items():
    print(f"{lang}: {count}")

In [None]:
# Only load the needed columns
use_cols = ["Title_clean", "Description_clean"]

# Rows per chunk; more than enough to look at the first five rows
chunksize = 1000

# Read the processed file from ../data/processed/, the same location the
# earlier cells load tenders_clean.csv from.
# The with-block closes the reader's file handle even though iteration stops
# after the first chunk.
with pd.read_csv(
    "../data/processed/tenders_clean.csv", usecols=use_cols, chunksize=chunksize
) as chunks:
    # Process first chunk only (demo)
    for chunk in chunks:
        print(len(chunk))
        print(chunk.head(5))
        break

In [None]:
# Preview the first chunk of the English subset, reusing use_cols and
# chunksize defined in an earlier cell.
# NOTE(review): this path is relative to the working directory, while earlier
# cells read from ../data/processed/ - confirm where tenders_english.csv lives.
# The with-block closes the reader's file handle even though iteration stops
# after the first chunk.
with pd.read_csv("tenders_english.csv", usecols=use_cols, chunksize=chunksize) as chunks:
    # Process first chunk only (demo)
    for chunk in chunks:
        print(len(chunk))
        print(chunk.head(5))
        break

In [None]:
# Preview the first chunk of the Amharic subset, reusing use_cols and
# chunksize defined in an earlier cell.
# NOTE(review): this path is relative to the working directory, while earlier
# cells read from ../data/processed/ - confirm where tenders_amharic.csv lives.
# The with-block closes the reader's file handle even though iteration stops
# after the first chunk.
with pd.read_csv("tenders_amharic.csv", usecols=use_cols, chunksize=chunksize) as chunks:
    # Process first chunk only (demo)
    for chunk in chunks:
        print(len(chunk))
        print(chunk.head(5))
        break