In [None]:
# Install required packages
!pip install requests beautifulsoup4 pandas python-dotenv openpyxl lxml jupyter ipykernel

#%% vscode.cell [id=#VSC-0846c592] [language=python]
import sys
print(sys.executable)
#%% vscode.cell [id=#VSC-0754bb81] [language=python]
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(["%s==%s" % (i.key, i.version) for i in installed_packages])
print(installed_packages_list)
#%% vscode.cell [id=#VSC-f34c4cd0] [language=python]
# Africa AI Intelligence Platform - Advanced Multi-API Version
# ==============================================================
# Production-grade data collection using multiple API sources and advanced scraping

# REQUIRED API KEYS (Get free tiers from):
# - Crunchbase: https://data.crunchbase.com/docs
# - GitHub: https://github.com/settings/tokens
# - NewsAPI: https://newsapi.org
# - SerpAPI: https://serpapi.com (Google Search API)
# - PredictHQ: https://www.predicthq.com

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime, timedelta
import time
import logging
from typing import List, Dict, Optional
import os
from urllib.parse import quote_plus
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - %(message)s'
)
logger = logging.getLogger("AFRICA_AI")

# ============================================================================
# CONFIGURATION - ADD YOUR API KEYS TO .env FILE
# ============================================================================

class APIConfig:
    """Store all API credentials from environment variables"""

    # Get API keys from .env file
    CRUNCHBASE_API_KEY = os.getenv('CRUNCHBASE_API_KEY', 'YOUR_KEY_HERE')
    GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', 'YOUR_KEY_HERE')
    NEWSAPI_KEY = os.getenv('NEWSAPI_API_KEY', 'YOUR_KEY_HERE')
    SERPAPI_KEY = os.getenv('SERPAPI_API_KEY', 'YOUR_KEY_HERE')
    RAPIDAPI_KEY = os.getenv('RAPIDAPI_API_KEY', 'YOUR_KEY_HERE')

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }

#%% vscode.cell [id=#VSC-e7fd785c] [language=python]
# ============================================================================
# PIPELINE 1: CRUNCHBASE API - AI STARTUPS & COMPANIES
# ============================================================================

class CrunchbaseAIPipeline:
    """
    Crunchbase API for startup data
    Free tier: 200 calls/day
    Signup: https://data.crunchbase.com/docs
    """

    BASE_URL = "https://api.crunchbase.com/api/v4"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_ai_companies(self, country: str, limit: int = 50) -> List[Dict]:
        """Search for AI companies in African countries"""

        endpoint = f"{self.BASE_URL}/searches/organizations"

        # Search query for AI companies
        payload = {
            "field_ids": [
                "identifier", "name", "short_description", "location_identifiers",
                "categories", "num_employees_enum", "founded_on", "website_url"
            ],
            "query": [
                {
                    "type": "predicate",
                    "field_id": "location_identifiers",
                    "operator_id": "includes",
                    "values": [country.lower().replace(' ', '-')]
                },
                {
                    "type": "predicate",
                    "field_id": "categories",
                    "operator_id": "includes",
                    "values": ["artificial-intelligence", "machine-learning", "data-analytics"]
                }
            ],
            "limit": limit
        }

        headers = {
            'X-cb-user-key': self.api_key,
            'Content-Type': 'application/json'
        }

        try:
            response = self.session.post(
                endpoint,
                json=payload,
                headers=headers,
                timeout=30
            )

            if response.status_code == 200:
                data = response.json()
                companies = []

                for entity in data.get('entities', []):
                    props = entity.get('properties', {})
                    companies.append({
                        'company_name': props.get('name'),
                        'description': props.get('short_description'),
                        'country': country,
                        'website': props.get('website_url'),
                        'founded_year': props.get('founded_on', {}).get('value'),
                        'categories': ', '.join([c.get('value') for c in props.get('categories', [])]),
                        'employees': props.get('num_employees_enum'),
                        'source': 'Crunchbase API',
                        'scraped_at': datetime.now().isoformat()
                    })

                logger.info(f"Crunchbase: Found {len(companies)} companies in {country}")
                return companies
            else:
                logger.warning(f"Crunchbase API error: {response.status_code}")
                return []

        except Exception as e:
            logger.error(f"Crunchbase error: {e}")
            return []

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_companies = []

        for country in countries:
            companies = self.search_ai_companies(country)
            all_companies.extend(companies)
            time.sleep(1)  # Rate limiting

        return pd.DataFrame(all_companies)



#%% vscode.cell [id=#VSC-12c71735] [language=python]
# ============================================================================
# PIPELINE 2: GITHUB API - AI PROJECTS & TOOLS
# ============================================================================

class GitHubAIPipeline:
    """
    GitHub API for AI repositories and projects
    Free tier: 5000 requests/hour with token
    Get token: https://github.com/settings/tokens
    """

    BASE_URL = "https://api.github.com"

    def __init__(self, token: str):
        self.token = token
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        })

    def search_african_ai_repos(self, country: str, sector: str = None) -> List[Dict]:
        """Search GitHub for African AI repositories"""

        # Build search query
        search_terms = [
            f"{country} AI",
            f"{country} machine learning",
            f"africa {sector}" if sector else "africa AI"
        ]

        repos = []

        for term in search_terms:
            endpoint = f"{self.BASE_URL}/search/repositories"
            params = {
                'q': f'{term} language:python stars:>5',
                'sort': 'stars',
                'per_page': 30
            }

            try:
                response = self.session.get(endpoint, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for repo in data.get('items', []):
                        # Check if truly African/AI related
                        description = (repo.get('description') or '').lower()
                        readme_url = repo.get('url') + '/readme'

                        repos.append({
                            'project_name': repo.get('full_name'),
                            'description': repo.get('description'),
                            'stars': repo.get('stargazers_count'),
                            'language': repo.get('language'),
                            'url': repo.get('html_url'),
                            'country': country,
                            'sector': sector or 'General AI',
                            'topics': ', '.join(repo.get('topics', [])),
                            'last_updated': repo.get('updated_at'),
                            'source': 'GitHub API',
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(2)  # Rate limiting

            except Exception as e:
                logger.error(f"GitHub API error: {e}")

        logger.info(f"GitHub: Found {len(repos)} repositories for {country}")
        return repos

    def run(self, countries: List[str], sectors: List[str]) -> pd.DataFrame:
        """Run for multiple countries and sectors"""
        all_repos = []

        for country in countries:
            for sector in sectors:
                repos = self.search_african_ai_repos(country, sector)
                all_repos.extend(repos)

        df = pd.DataFrame(all_repos)
        return df.drop_duplicates(subset=['project_name'])



#%% vscode.cell [id=#VSC-8a86fc61] [language=python]
# ============================================================================
# PIPELINE 3: NEWS API - AI NEWS & ADOPTION
# ============================================================================

class NewsAIPipeline:
    """
    NewsAPI for recent AI news and developments
    Free tier: 100 requests/day
    Signup: https://newsapi.org
    """

    BASE_URL = "https://newsapi.org/v2"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_ai_news(self, country: str, days_back: int = 30) -> List[Dict]:
        """Search for AI-related news from African countries"""

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        endpoint = f"{self.BASE_URL}/everything"

        queries = [
            f"{country} artificial intelligence",
            f"{country} AI startup",
            f"{country} machine learning technology"
        ]

        articles = []

        for query in queries:
            params = {
                'q': query,
                'from': from_date,
                'language': 'en',
                'sortBy': 'relevancy',
                'apiKey': self.api_key,
                'pageSize': 20
            }

            try:
                response = self.session.get(endpoint, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for article in data.get('articles', []):
                        articles.append({
                            'title': article.get('title'),
                            'description': article.get('description'),
                            'country': country,
                            'source': article.get('source', {}).get('name'),
                            'url': article.get('url'),
                            'published_at': article.get('publishedAt'),
                            'content_preview': article.get('content', '')[:300],
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(1)

            except Exception as e:
                logger.error(f"NewsAPI error: {e}")

        logger.info(f"NewsAPI: Found {len(articles)} articles for {country}")
        return articles

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_articles = []

        for country in countries:
            articles = self.search_ai_news(country)
            all_articles.extend(articles)

        df = pd.DataFrame(all_articles)
        return df.drop_duplicates(subset=['title', 'url'])



#%% vscode.cell [id=#VSC-96d7b6b1] [language=python]
# ============================================================================
# PIPELINE 4: SERP API - GOOGLE SEARCH FOR GOVERNMENT & POLICY
# ============================================================================

class SerpAPIGovPipeline:
    """
    SerpAPI for Google searches (government AI policies)
    Free tier: 100 searches/month
    Signup: https://serpapi.com
    """

    BASE_URL = "https://serpapi.com/search"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_government_ai(self, country: str) -> List[Dict]:
        """Search for government AI initiatives and policies"""

        queries = [
            f"{country} government artificial intelligence strategy",
            f"{country} national AI policy",
            f"{country} public sector AI implementation",
            f"{country} AI governance framework"
        ]

        results = []

        for query in queries:
            params = {
                'q': query,
                'api_key': self.api_key,
                'num': 10,
                'hl': 'en'
            }

            try:
                response = self.session.get(self.BASE_URL, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for item in data.get('organic_results', []):
                        results.append({
                            'title': item.get('title'),
                            'snippet': item.get('snippet'),
                            'url': item.get('link'),
                            'country': country,
                            'query_type': 'Government AI Policy',
                            'position': item.get('position'),
                            'source': 'Google Search (SerpAPI)',
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(2)

            except Exception as e:
                logger.error(f"SerpAPI error: {e}")

        logger.info(f"SerpAPI: Found {len(results)} policy documents for {country}")
        return results

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_results = []

        for country in countries:
            results = self.search_government_ai(country)
            all_results.extend(results)

        return pd.DataFrame(all_results)


# ============================================================================
# PIPELINE 5: ADVANCED WEB SCRAPING - AFRICAN TECH SITES
# ============================================================================

class AfricanTechScraper:
    """
    Targeted scraping of African tech news sites
    No API required - direct scraping with proper parsing
    """

    SOURCES = {
        'Disrupt Africa': 'https://disrupt-africa.com/category/fintech/',
        'TechCabal': 'https://techcabal.com/category/startups/',
        'African Tech Roundup': 'https://africantechroundup.com',
        'Ventureburn': 'https://ventureburn.com/category/tech/'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(APIConfig.HEADERS)

    def scrape_disrupt_africa(self) -> List[Dict]:
        """Scrape Disrupt Africa for AI startups"""

        companies = []
        url = 'https://disrupt-africa.com'

        try:
            response = self.session.get(url, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find article blocks
            articles = soup.find_all('article', class_='post')

            for article in articles[:20]:
                title_tag = article.find('h2', class_='entry-title')
                if title_tag:
                    title = title_tag.get_text(strip=True)
                    link = title_tag.find('a')['href'] if title_tag.find('a') else None

                    excerpt = article.find('div', class_='entry-excerpt')
                    description = excerpt.get_text(strip=True) if excerpt else ''

                    # Check if AI-related
                    if any(kw in (title + description).lower() for kw in ['ai', 'artificial', 'machine learning', 'ml']):
                        companies.append({
                            'company_name': title,
                            'description': description[:500],
                            'url': link,
                            'source': 'Disrupt Africa',
                            'scraped_at': datetime.now().isoformat()
                        })

            logger.info(f"Disrupt Africa: Scraped {len(companies)} AI companies")

        except Exception as e:
            logger.error(f"Disrupt Africa scraping error: {e}")

        return companies

    def scrape_techcabal(self) -> List[Dict]:
        """Scrape TechCabal for AI news"""

        articles = []
        url = 'https://techcabal.com'

        try:
            response = self.session.get(url, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find recent articles
            post_links = soup.find_all('h2', class_='post-title')

            for post in post_links[:15]:
                link_tag = post.find('a')
                if link_tag:
                    title = link_tag.get_text(strip=True)
                    link = link_tag['href']

                    if any(kw in title.lower() for kw in ['ai', 'artificial', 'tech', 'startup']):
                        articles.append({
                            'title': title,
                            'url': link,
                            'source': 'TechCabal',
                            'scraped_at': datetime.now().isoformat()
                        })

            logger.info(f"TechCabal: Scraped {len(articles)} articles")

        except Exception as e:
            logger.error(f"TechCabal scraping error: {e}")

        return articles

    def run(self) -> Dict[str, pd.DataFrame]:
        """Run all scrapers"""

        companies = self.scrape_disrupt_africa()
        articles = self.scrape_techcabal()

        return {
            'companies': pd.DataFrame(companies),
            'articles': pd.DataFrame(articles)
        }



#%% vscode.cell [id=#VSC-fdec63d4] [language=python]
# ============================================================================
# MASTER ORCHESTRATOR
# ============================================================================

class AfricaAIMasterPipeline:
    """Orchestrate all data collection pipelines"""

    def __init__(self, config: APIConfig):
        self.config = config
        self.results = {}

    def run_all_pipelines(self, countries: List[str], sectors: List[str]):
        """Execute all data collection pipelines"""

        logger.info("="*80)
        logger.info("STARTING AFRICA AI INTELLIGENCE PLATFORM")
        logger.info("="*80)

        # # Pipeline 1: Crunchbase Companies
        # if self.config.CRUNCHBASE_API_KEY != 'YOUR_KEY_HERE':
        #     logger.info("\n[1/5] Running Crunchbase Pipeline...")
        #     crunchbase = CrunchbaseAIPipeline(self.config.CRUNCHBASE_API_KEY)
        #     self.results['crunchbase_companies'] = crunchbase.run(countries)
        #     if 'crunchbase_companies' in self.results and not self.results['crunchbase_companies'].empty:
        #         logger.info(f"‚úÖ Crunchbase Pipeline finished: Collected {len(self.results['crunchbase_companies'])} companies.")
        #     else:
        #         logger.warning("üö´ Crunchbase Pipeline finished: No companies collected or an issue occurred.")
        # else:
        #     logger.warning("[1/5] Skipping Crunchbase - No API key")

        # Pipeline 2: GitHub Projects
        if self.config.GITHUB_TOKEN != 'YOUR_KEY_HERE':
            logger.info("\n[2/5] Running GitHub Pipeline...")
            github = GitHubAIPipeline(self.config.GITHUB_TOKEN)
            self.results['github_projects'] = github.run(countries, sectors)
            if 'github_projects' in self.results and not self.results['github_projects'].empty:
                logger.info(f"‚úÖ GitHub Pipeline finished: Collected {len(self.results['github_projects'])} projects.")
            else:
                logger.warning("üö´ GitHub Pipeline finished: No projects collected or an issue occurred.")
        else:
            logger.warning("[2/5] Skipping GitHub - No API token")

        # Pipeline 3: News Articles
        if self.config.NEWSAPI_KEY != 'YOUR_KEY_HERE':
            logger.info("\n[3/5] Running NewsAPI Pipeline...")
            news = NewsAIPipeline(self.config.NEWSAPI_KEY)
            self.results['news_articles'] = news.run(countries)
            if 'news_articles' in self.results and not self.results['news_articles'].empty:
                logger.info(f"‚úÖ NewsAPI Pipeline finished: Collected {len(self.results['news_articles'])} articles.")
            else:
                logger.warning("üö´ NewsAPI Pipeline finished: No articles collected or an issue occurred.")
        else:
            logger.warning("[3/5] Skipping NewsAPI - No API key")

        # Pipeline 4: Government Policies
        if self.config.SERPAPI_KEY != 'YOUR_KEY_HERE':
            logger.info("\n[4/5] Running SerpAPI Government Pipeline...")
            serp = SerpAPIGovPipeline(self.config.SERPAPI_KEY)
            self.results['government_policies'] = serp.run(countries)
            if 'government_policies' in self.results and not self.results['government_policies'].empty:
                logger.info(f"‚úÖ SerpAPI Pipeline finished: Collected {len(self.results['government_policies'])} policy documents.")
            else:
                logger.warning("üö´ SerpAPI Pipeline finished: No policy documents collected or an issue occurred.")
        else:
            logger.warning("[4/5] Skipping SerpAPI - No API key")

        # Pipeline 5: African Tech Sites (No API needed)
        logger.info("\n[5/5] Running African Tech Scraper...")
        tech_scraper = AfricanTechScraper()
        scraper_results = tech_scraper.run()
        self.results.update(scraper_results)
        if 'companies' in self.results and not self.results['companies'].empty:
            logger.info(f"‚úÖ African Tech Scraper (Companies) finished: Collected {len(self.results['companies'])} companies.")
        else:
            logger.warning("üö´ African Tech Scraper (Companies) finished: No companies collected or an issue occurred.")
        if 'articles' in self.results and not self.results['articles'].empty:
            logger.info(f"‚úÖ African Tech Scraper (Articles) finished: Collected {len(self.results['articles'])} articles.")
        else:
            logger.warning("üö´ African Tech Scraper (Articles) finished: No articles collected or an issue occurred.")

        logger.info("\n" + "="*80)
        logger.info("ALL PIPELINES COMPLETED")
        logger.info("="*80)

        return self.results

    def generate_summary(self):
        """Generate summary statistics"""
        summary = {
            'timestamp': datetime.now().isoformat(),
            'total_datasets': len(self.results),
            'datasets': {}
        }

        for name, df in self.results.items():
            if isinstance(df, pd.DataFrame):
                summary['datasets'][name] = {
                    'records': len(df),
                    'columns': list(df.columns)
                }

        return summary

    def export_results(self, output_dir: str = './'):
        """Export all results to files"""

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Check if there's any non-empty DataFrame to export
        has_data = any(isinstance(df, pd.DataFrame) and not df.empty for df in self.results.values())

        if not has_data:
            logger.warning("No data collected to export. Skipping file export.")
            return

        # Export individual CSVs
        for name, df in self.results.items():
            if isinstance(df, pd.DataFrame) and not df.empty:
                filename = f"{output_dir}africa_ai_{name}_{timestamp}.csv"
                df.to_csv(filename, index=False)
                logger.info(f"‚úÖ Exported: {filename} ({len(df)} records)")

        # Export combined Excel
        excel_file = f"{output_dir}africa_ai_complete_{timestamp}.xlsx"
        try:
            with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
                for name, df in self.results.items():
                    if isinstance(df, pd.DataFrame) and not df.empty:
                        df.to_excel(writer, sheet_name=name[:31], index=False)
            logger.info(f"‚úÖ Combined Excel: {excel_file}")
        except Exception as e:
            logger.error(f"Error exporting combined Excel file: {e}")

        # Export summary
        summary = self.generate_summary()
        summary_file = f"{output_dir}pipeline_summary_{timestamp}.json"
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)

        logger.info(f"‚úÖ Summary: {summary_file}")

#%% vscode.cell [id=#VSC-4e7bb9a1] [language=python]
import pandas as pd

# ============================================================================
# EXECUTION BLOCK
# ============================================================================

if __name__ == "__main__":

    # Configuration
    COUNTRIES = [
        'Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi',
        'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros',
        'Congo, Dem. Rep.', 'Congo, Rep.', 'Cote dIvoire', 'Djibouti', 'Egypt',
        'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia',
        'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Libya',
        'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco',
        'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Sao Tome and Principe',
        'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan',
        'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe'
    ]

    SECTORS = [
        'Agriculture', 'Healthcare', 'Fintech', 'Education',
        'Logistics', 'Energy', 'Governance', 'Telecommunications', 'Mining',
        'Tourism', 'Manufacturing', 'Retail', 'Media', 'Real Estate',
        'Transportation', 'Financial Services', 'Public Sector', 'Utilities',
        'Environment', 'Smart Cities', 'Security', 'Creative Industry','Insurance'
    ]

    # Initialize pipeline
    config = APIConfig()
    pipeline = AfricaAIMasterPipeline(config)

    # Run all pipelines
    results = pipeline.run_all_pipelines(COUNTRIES, SECTORS)

    # Display results
    print("\n" + "="*80)
    print("RESULTS PREVIEW")
    print("="*80 + "\n")

    for name, df in results.items():
        if isinstance(df, pd.DataFrame):
            print(f"\nüìä {name.upper()}: {len(df)} records")
            print("-" * 60)
            if not df.empty:
                print(df.head(3))

    # Export everything
    pipeline.export_results()

    print("\n‚úÖ Pipeline completed successfully!")
    print("\nüìù NEXT STEPS:")
    print("1. Add your API keys to APIConfig class")
    print("2. Run the pipelines")
    print("3. Check exported CSV and Excel files")
    print("4. Analyze the data for insights\n")

In [2]:
import sys
print(sys.executable)

/usr/local/bin/python3.13


In [3]:
import pkg_resources
installed_packages = pkg_resources.working_set
installed_packages_list = sorted(["%s==%s" % (i.key, i.version) for i in installed_packages])
print(installed_packages_list)

['asttokens==3.0.1', 'autocommand==2.2.2', 'backports.tarfile==1.2.0', 'comm==0.2.3', 'debugpy==1.8.19', 'decorator==5.2.1', 'executing==2.2.1', 'importlib-metadata==8.0.0', 'inflect==7.3.1', 'ipykernel==7.1.0', 'ipython-pygments-lexers==1.1.1', 'ipython==9.9.0', 'jaraco.collections==5.1.0', 'jaraco.context==5.3.0', 'jaraco.functools==4.0.1', 'jaraco.text==3.12.1', 'jedi==0.19.2', 'jupyter-client==8.8.0', 'jupyter-core==5.9.1', 'matplotlib-inline==0.2.1', 'more-itertools==10.3.0', 'nest-asyncio==1.6.0', 'packaging==25.0', 'parso==0.8.5', 'pexpect==4.9.0', 'pip==25.3', 'platformdirs==4.5.1', 'prompt-toolkit==3.0.52', 'psutil==7.2.1', 'ptyprocess==0.7.0', 'pure-eval==0.2.3', 'pygments==2.19.2', 'python-dateutil==2.9.0.post0', 'pyzmq==27.1.0', 'setuptools==80.9.0', 'six==1.17.0', 'stack-data==0.6.3', 'tomli==2.0.1', 'tornado==6.5.4', 'traitlets==5.14.3', 'typeguard==4.3.0', 'typing-extensions==4.12.2', 'wcwidth==0.2.14', 'wheel==0.45.1', 'zipp==3.19.2']


  import pkg_resources


In [1]:
# Africa AI Intelligence Platform - Advanced Multi-API Version
# ==============================================================
# Production-grade data collection using multiple API sources and advanced scraping

# REQUIRED API KEYS (Get free tiers from):
# - Crunchbase: https://data.crunchbase.com/docs
# - GitHub: https://github.com/settings/tokens
# - NewsAPI: https://newsapi.org
# - SerpAPI: https://serpapi.com (Google Search API)
# - PredictHQ: https://www.predicthq.com

import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from datetime import datetime, timedelta
import time
import logging
from typing import List, Dict, Optional
import os
from urllib.parse import quote_plus
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - [%(levelname)s] - %(message)s'
)
logger = logging.getLogger("AFRICA_AI")

# ============================================================================
# CONFIGURATION - ADD YOUR API KEYS TO .env FILE
# ============================================================================

class APIConfig:
    """Store all API credentials from environment variables"""

    # Get API keys from .env file
    CRUNCHBASE_API_KEY = os.getenv('CRUNCHBASE_API_KEY', 'YOUR_KEY_HERE')
    GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', 'YOUR_KEY_HERE')
    NEWSAPI_KEY = os.getenv('NEWSAPI_API_KEY', 'YOUR_KEY_HERE')
    SERPAPI_KEY = os.getenv('SERPAPI_API_KEY', 'YOUR_KEY_HERE')
    RAPIDAPI_KEY = os.getenv('RAPIDAPI_API_KEY', 'YOUR_KEY_HERE')

    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }


ModuleNotFoundError: No module named 'requests'

In [None]:
# ============================================================================
# PIPELINE 1: CRUNCHBASE API - AI STARTUPS & COMPANIES
# ============================================================================

class CrunchbaseAIPipeline:
    """
    Crunchbase API for startup data
    Free tier: 200 calls/day
    Signup: https://data.crunchbase.com/docs
    """

    BASE_URL = "https://api.crunchbase.com/api/v4"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_ai_companies(self, country: str, limit: int = 50) -> List[Dict]:
        """Search for AI companies in African countries"""

        endpoint = f"{self.BASE_URL}/searches/organizations"

        # Search query for AI companies
        payload = {
            "field_ids": [
                "identifier", "name", "short_description", "location_identifiers",
                "categories", "num_employees_enum", "founded_on", "website_url"
            ],
            "query": [
                {
                    "type": "predicate",
                    "field_id": "location_identifiers",
                    "operator_id": "includes",
                    "values": [country.lower().replace(' ', '-')]
                },
                {
                    "type": "predicate",
                    "field_id": "categories",
                    "operator_id": "includes",
                    "values": ["artificial-intelligence", "machine-learning", "data-analytics"]
                }
            ],
            "limit": limit
        }

        headers = {
            'X-cb-user-key': self.api_key,
            'Content-Type': 'application/json'
        }

        try:
            response = self.session.post(
                endpoint,
                json=payload,
                headers=headers,
                timeout=30
            )

            if response.status_code == 200:
                data = response.json()
                companies = []

                for entity in data.get('entities', []):
                    props = entity.get('properties', {})
                    companies.append({
                        'company_name': props.get('name'),
                        'description': props.get('short_description'),
                        'country': country,
                        'website': props.get('website_url'),
                        'founded_year': props.get('founded_on', {}).get('value'),
                        'categories': ', '.join([c.get('value') for c in props.get('categories', [])]),
                        'employees': props.get('num_employees_enum'),
                        'source': 'Crunchbase API',
                        'scraped_at': datetime.now().isoformat()
                    })

                logger.info(f"Crunchbase: Found {len(companies)} companies in {country}")
                return companies
            else:
                logger.warning(f"Crunchbase API error: {response.status_code}")
                return []

        except Exception as e:
            logger.error(f"Crunchbase error: {e}")
            return []

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_companies = []

        for country in countries:
            companies = self.search_ai_companies(country)
            all_companies.extend(companies)
            time.sleep(1)  # Rate limiting

        return pd.DataFrame(all_companies)




In [None]:
# ============================================================================
# PIPELINE 2: GITHUB API - AI PROJECTS & TOOLS
# ============================================================================

class GitHubAIPipeline:
    """
    GitHub API for AI repositories and projects
    Free tier: 5000 requests/hour with token
    Get token: https://github.com/settings/tokens
    """

    BASE_URL = "https://api.github.com"

    def __init__(self, token: str):
        self.token = token
        self.session = requests.Session()
        self.session.headers.update({
            'Authorization': f'token {token}',
            'Accept': 'application/vnd.github.v3+json'
        })

    def search_african_ai_repos(self, country: str, sector: str = None) -> List[Dict]:
        """Search GitHub for African AI repositories"""

        # Build search query
        search_terms = [
            f"{country} AI",
            f"{country} machine learning",
            f"africa {sector}" if sector else "africa AI"
        ]

        repos = []

        for term in search_terms:
            endpoint = f"{self.BASE_URL}/search/repositories"
            params = {
                'q': f'{term} language:python stars:>5',
                'sort': 'stars',
                'per_page': 30
            }

            try:
                response = self.session.get(endpoint, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for repo in data.get('items', []):
                        # Check if truly African/AI related
                        description = (repo.get('description') or '').lower()
                        readme_url = repo.get('url') + '/readme'

                        repos.append({
                            'project_name': repo.get('full_name'),
                            'description': repo.get('description'),
                            'stars': repo.get('stargazers_count'),
                            'language': repo.get('language'),
                            'url': repo.get('html_url'),
                            'country': country,
                            'sector': sector or 'General AI',
                            'topics': ', '.join(repo.get('topics', [])),
                            'last_updated': repo.get('updated_at'),
                            'source': 'GitHub API',
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(2)  # Rate limiting

            except Exception as e:
                logger.error(f"GitHub API error: {e}")

        logger.info(f"GitHub: Found {len(repos)} repositories for {country}")
        return repos

    def run(self, countries: List[str], sectors: List[str]) -> pd.DataFrame:
        """Run for multiple countries and sectors"""
        all_repos = []

        for country in countries:
            for sector in sectors:
                repos = self.search_african_ai_repos(country, sector)
                all_repos.extend(repos)

        df = pd.DataFrame(all_repos)
        return df.drop_duplicates(subset=['project_name'])




In [None]:
# ============================================================================
# PIPELINE 3: NEWS API - AI NEWS & ADOPTION
# ============================================================================

class NewsAIPipeline:
    """
    NewsAPI for recent AI news and developments
    Free tier: 100 requests/day
    Signup: https://newsapi.org
    """

    BASE_URL = "https://newsapi.org/v2"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_ai_news(self, country: str, days_back: int = 30) -> List[Dict]:
        """Search for AI-related news from African countries"""

        from_date = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%d')

        endpoint = f"{self.BASE_URL}/everything"

        queries = [
            f"{country} artificial intelligence",
            f"{country} AI startup",
            f"{country} machine learning technology"
        ]

        articles = []

        for query in queries:
            params = {
                'q': query,
                'from': from_date,
                'language': 'en',
                'sortBy': 'relevancy',
                'apiKey': self.api_key,
                'pageSize': 20
            }

            try:
                response = self.session.get(endpoint, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for article in data.get('articles', []):
                        articles.append({
                            'title': article.get('title'),
                            'description': article.get('description'),
                            'country': country,
                            'source': article.get('source', {}).get('name'),
                            'url': article.get('url'),
                            'published_at': article.get('publishedAt'),
                            'content_preview': article.get('content', '')[:300],
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(1)

            except Exception as e:
                logger.error(f"NewsAPI error: {e}")

        logger.info(f"NewsAPI: Found {len(articles)} articles for {country}")
        return articles

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_articles = []

        for country in countries:
            articles = self.search_ai_news(country)
            all_articles.extend(articles)

        df = pd.DataFrame(all_articles)
        return df.drop_duplicates(subset=['title', 'url'])




In [None]:
# ============================================================================
# PIPELINE 4: SERP API - GOOGLE SEARCH FOR GOVERNMENT & POLICY
# ============================================================================

class SerpAPIGovPipeline:
    """
    SerpAPI for Google searches (government AI policies)
    Free tier: 100 searches/month
    Signup: https://serpapi.com
    """

    BASE_URL = "https://serpapi.com/search"

    def __init__(self, api_key: str):
        self.api_key = api_key
        self.session = requests.Session()

    def search_government_ai(self, country: str) -> List[Dict]:
        """Search for government AI initiatives and policies"""

        queries = [
            f"{country} government artificial intelligence strategy",
            f"{country} national AI policy",
            f"{country} public sector AI implementation",
            f"{country} AI governance framework"
        ]

        results = []

        for query in queries:
            params = {
                'q': query,
                'api_key': self.api_key,
                'num': 10,
                'hl': 'en'
            }

            try:
                response = self.session.get(self.BASE_URL, params=params, timeout=20)

                if response.status_code == 200:
                    data = response.json()

                    for item in data.get('organic_results', []):
                        results.append({
                            'title': item.get('title'),
                            'snippet': item.get('snippet'),
                            'url': item.get('link'),
                            'country': country,
                            'query_type': 'Government AI Policy',
                            'position': item.get('position'),
                            'source': 'Google Search (SerpAPI)',
                            'scraped_at': datetime.now().isoformat()
                        })

                time.sleep(2)

            except Exception as e:
                logger.error(f"SerpAPI error: {e}")

        logger.info(f"SerpAPI: Found {len(results)} policy documents for {country}")
        return results

    def run(self, countries: List[str]) -> pd.DataFrame:
        """Run for multiple countries"""
        all_results = []

        for country in countries:
            results = self.search_government_ai(country)
            all_results.extend(results)

        return pd.DataFrame(all_results)


# ============================================================================
# PIPELINE 5: ADVANCED WEB SCRAPING - AFRICAN TECH SITES
# ============================================================================

class AfricanTechScraper:
    """
    Targeted scraping of African tech news sites
    No API required - direct scraping with proper parsing
    """

    SOURCES = {
        'Disrupt Africa': 'https://disrupt-africa.com/category/fintech/',
        'TechCabal': 'https://techcabal.com/category/startups/',
        'African Tech Roundup': 'https://africantechroundup.com',
        'Ventureburn': 'https://ventureburn.com/category/tech/'
    }

    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update(APIConfig.HEADERS)

    def scrape_disrupt_africa(self) -> List[Dict]:
        """Scrape Disrupt Africa for AI startups"""

        companies = []
        url = 'https://disrupt-africa.com'

        try:
            response = self.session.get(url, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find article blocks
            articles = soup.find_all('article', class_='post')

            for article in articles[:20]:
                title_tag = article.find('h2', class_='entry-title')
                if title_tag:
                    title = title_tag.get_text(strip=True)
                    link = title_tag.find('a')['href'] if title_tag.find('a') else None

                    excerpt = article.find('div', class_='entry-excerpt')
                    description = excerpt.get_text(strip=True) if excerpt else ''

                    # Check if AI-related
                    if any(kw in (title + description).lower() for kw in ['ai', 'artificial', 'machine learning', 'ml']):
                        companies.append({
                            'company_name': title,
                            'description': description[:500],
                            'url': link,
                            'source': 'Disrupt Africa',
                            'scraped_at': datetime.now().isoformat()
                        })

            logger.info(f"Disrupt Africa: Scraped {len(companies)} AI companies")

        except Exception as e:
            logger.error(f"Disrupt Africa scraping error: {e}")

        return companies

    def scrape_techcabal(self) -> List[Dict]:
        """Scrape TechCabal for AI news"""

        articles = []
        url = 'https://techcabal.com'

        try:
            response = self.session.get(url, timeout=15)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Find recent articles
            post_links = soup.find_all('h2', class_='post-title')

            for post in post_links[:15]:
                link_tag = post.find('a')
                if link_tag:
                    title = link_tag.get_text(strip=True)
                    link = link_tag['href']

                    if any(kw in title.lower() for kw in ['ai', 'artificial', 'tech', 'startup']):
                        articles.append({
                            'title': title,
                            'url': link,
                            'source': 'TechCabal',
                            'scraped_at': datetime.now().isoformat()
                        })

            logger.info(f"TechCabal: Scraped {len(articles)} articles")

        except Exception as e:
            logger.error(f"TechCabal scraping error: {e}")

        return articles

    def run(self) -> Dict[str, pd.DataFrame]:
        """Run all scrapers"""

        companies = self.scrape_disrupt_africa()
        articles = self.scrape_techcabal()

        return {
            'companies': pd.DataFrame(companies),
            'articles': pd.DataFrame(articles)
        }




In [None]:
# ============================================================================
# MASTER ORCHESTRATOR
# ============================================================================

class AfricaAIMasterPipeline:
    """Orchestrate all data collection pipelines"""

    def __init__(self, config: APIConfig):
        self.config = config
        self.results = {}

    def run_all_pipelines(self, countries: List[str], sectors: List[str]):
        """Execute all data collection pipelines"""

        logger.info("="*80)
        logger.info("STARTING AFRICA AI INTELLIGENCE PLATFORM")
        logger.info("="*80)

        # # Pipeline 1: Crunchbase Companies
        # if self.config.CRUNCHBASE_API_KEY != 'YOUR_KEY_HERE':
        #     logger.info("\n[1/5] Running Crunchbase Pipeline...")
        #     crunchbase = CrunchbaseAIPipeline(self.config.CRUNCHBASE_API_KEY)
        #     self.results['crunchbase_companies'] = crunchbase.run(countries)
        #     if 'crunchbase_companies' in self.results and not self.results['crunchbase_companies'].empty:
        #         logger.info(f"‚úÖ Crunchbase Pipeline finished: Collected {len(self.results['crunchbase_companies'])} companies.")
        #     else:
        #         logger.warning("üö´ Crunchbase Pipeline finished: No companies collected or an issue occurred.")
        # else:
        #     logger.warning("[1/5] Skipping Crunchbase - No API key")

        # Pipeline 2: GitHub Projects
        if self.config.GITHUB_TOKEN != 'YOUR_KEY_HERE':
            logger.info("\n[2/5] Running GitHub Pipeline...")
            github = GitHubAIPipeline(self.config.GITHUB_TOKEN)
            self.results['github_projects'] = github.run(countries, sectors)
            if 'github_projects' in self.results and not self.results['github_projects'].empty:
                logger.info(f"‚úÖ GitHub Pipeline finished: Collected {len(self.results['github_projects'])} projects.")
            else:
                logger.warning("üö´ GitHub Pipeline finished: No projects collected or an issue occurred.")
        else:
            logger.warning("[2/5] Skipping GitHub - No API token")

        # Pipeline 3: News Articles
        if self.config.NEWSAPI_KEY != 'YOUR_KEY_HERE':
            logger.info("\n[3/5] Running NewsAPI Pipeline...")
            news = NewsAIPipeline(self.config.NEWSAPI_KEY)
            self.results['news_articles'] = news.run(countries)
            if 'news_articles' in self.results and not self.results['news_articles'].empty:
                logger.info(f"‚úÖ NewsAPI Pipeline finished: Collected {len(self.results['news_articles'])} articles.")
            else:
                logger.warning("üö´ NewsAPI Pipeline finished: No articles collected or an issue occurred.")
        else:
            logger.warning("[3/5] Skipping NewsAPI - No API key")

        # Pipeline 4: Government Policies
        if self.config.SERPAPI_KEY != 'YOUR_KEY_HERE':
            logger.info("\n[4/5] Running SerpAPI Government Pipeline...")
            serp = SerpAPIGovPipeline(self.config.SERPAPI_KEY)
            self.results['government_policies'] = serp.run(countries)
            if 'government_policies' in self.results and not self.results['government_policies'].empty:
                logger.info(f"‚úÖ SerpAPI Pipeline finished: Collected {len(self.results['government_policies'])} policy documents.")
            else:
                logger.warning("üö´ SerpAPI Pipeline finished: No policy documents collected or an issue occurred.")
        else:
            logger.warning("[4/5] Skipping SerpAPI - No API key")

        # Pipeline 5: African Tech Sites (No API needed)
        logger.info("\n[5/5] Running African Tech Scraper...")
        tech_scraper = AfricanTechScraper()
        scraper_results = tech_scraper.run()
        self.results.update(scraper_results)
        if 'companies' in self.results and not self.results['companies'].empty:
            logger.info(f"‚úÖ African Tech Scraper (Companies) finished: Collected {len(self.results['companies'])} companies.")
        else:
            logger.warning("üö´ African Tech Scraper (Companies) finished: No companies collected or an issue occurred.")
        if 'articles' in self.results and not self.results['articles'].empty:
            logger.info(f"‚úÖ African Tech Scraper (Articles) finished: Collected {len(self.results['articles'])} articles.")
        else:
            logger.warning("üö´ African Tech Scraper (Articles) finished: No articles collected or an issue occurred.")

        logger.info("\n" + "="*80)
        logger.info("ALL PIPELINES COMPLETED")
        logger.info("="*80)

        return self.results

    def generate_summary(self):
        """Generate summary statistics"""
        summary = {
            'timestamp': datetime.now().isoformat(),
            'total_datasets': len(self.results),
            'datasets': {}
        }

        for name, df in self.results.items():
            if isinstance(df, pd.DataFrame):
                summary['datasets'][name] = {
                    'records': len(df),
                    'columns': list(df.columns)
                }

        return summary

    def export_results(self, output_dir: str = './'):
        """Export all results to files"""

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

        # Check if there's any non-empty DataFrame to export
        has_data = any(isinstance(df, pd.DataFrame) and not df.empty for df in self.results.values())

        if not has_data:
            logger.warning("No data collected to export. Skipping file export.")
            return

        # Export individual CSVs
        for name, df in self.results.items():
            if isinstance(df, pd.DataFrame) and not df.empty:
                filename = f"{output_dir}africa_ai_{name}_{timestamp}.csv"
                df.to_csv(filename, index=False)
                logger.info(f"‚úÖ Exported: {filename} ({len(df)} records)")

        # Export combined Excel
        excel_file = f"{output_dir}africa_ai_complete_{timestamp}.xlsx"
        try:
            with pd.ExcelWriter(excel_file, engine='openpyxl') as writer:
                for name, df in self.results.items():
                    if isinstance(df, pd.DataFrame) and not df.empty:
                        df.to_excel(writer, sheet_name=name[:31], index=False)
            logger.info(f"‚úÖ Combined Excel: {excel_file}")
        except Exception as e:
            logger.error(f"Error exporting combined Excel file: {e}")

        # Export summary
        summary = self.generate_summary()
        summary_file = f"{output_dir}pipeline_summary_{timestamp}.json"
        with open(summary_file, 'w') as f:
            json.dump(summary, f, indent=2)

        logger.info(f"‚úÖ Summary: {summary_file}")


In [None]:
import pandas as pd

# ============================================================================
# EXECUTION BLOCK
# ============================================================================

if __name__ == "__main__":

    # Configuration
    COUNTRIES = [
        'Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi',
        'Cabo Verde', 'Cameroon', 'Central African Republic', 'Chad', 'Comoros',
        'Congo, Dem. Rep.', 'Congo, Rep.', 'Cote dIvoire', 'Djibouti', 'Egypt',
        'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia',
        'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Lesotho', 'Liberia', 'Libya',
        'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mauritius', 'Morocco',
        'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda', 'Sao Tome and Principe',
        'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan',
        'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe'
    ]

    SECTORS = [
        'Agriculture', 'Healthcare', 'Fintech', 'Education',
        'Logistics', 'Energy', 'Governance', 'Telecommunications', 'Mining',
        'Tourism', 'Manufacturing', 'Retail', 'Media', 'Real Estate',
        'Transportation', 'Financial Services', 'Public Sector', 'Utilities',
        'Environment', 'Smart Cities', 'Security', 'Creative Industry','Insurance'
    ]

    # Initialize pipeline
    config = APIConfig()
    pipeline = AfricaAIMasterPipeline(config)

    # Run all pipelines
    results = pipeline.run_all_pipelines(COUNTRIES, SECTORS)

    # Display results
    print("\n" + "="*80)
    print("RESULTS PREVIEW")
    print("="*80 + "\n")

    for name, df in results.items():
        if isinstance(df, pd.DataFrame):
            print(f"\nüìä {name.upper()}: {len(df)} records")
            print("-" * 60)
            if not df.empty:
                print(df.head(3))

    # Export everything
    pipeline.export_results()

    print("\n‚úÖ Pipeline completed successfully!")
    print("\nüìù NEXT STEPS:")
    print("1. Add your API keys to APIConfig class")
    print("2. Run the pipelines")
    print("3. Check exported CSV and Excel files")
    print("4. Analyze the data for insights\n")