<a href="https://colab.research.google.com/github/uroosa114/Canoo_assigment-/blob/main/canoo_assigment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Pinned to the release this notebook was last run against so a fresh runtime
# gets the same search API; %pip targets the running kernel's environment.
%pip install -q duckduckgo_search==4.4.3

Collecting duckduckgo_search
  Downloading duckduckgo_search-4.4.3-py3-none-any.whl (20 kB)
Collecting curl-cffi>=0.6.0b9 (from duckduckgo_search)
  Downloading curl_cffi-0.6.0b9-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lxml>=5.1.0 (from duckduckgo_search)
  Downloading lxml-5.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lxml, curl-cffi, duckduckgo_search
  Attempting uninstall: lxml
    Found existing installation: lxml 4.9.4
    Uninstalling lxml-4.9.4:
      Successfully uninstalled lxml-4.9.4
Successfully installed curl-cffi-0.6.0b9 duckduckgo_search-4.4.3 lxml-5.1.0


In [None]:
import asyncio
import concurrent.futures
import csv

import aiohttp
import pandas as pd
from bs4 import BeautifulSoup
from duckduckgo_search import DDGS

# Search queries
queries = [
    "Identify the industry in which Canoo operates, along with its size, growth rate, trends, and key players",
    "Analyze Canoo's main competitors, including their market share, products or services offered, pricing strategies, and marketing efforts",
    "Identify key trends in the market, including changes in consumer behavior, technological advancements, and shifts in the competitive landscape",
    "Gather information on Canoo's financial performance, including its revenue, profit margins, return on investment, and expense structure."
]

# Collect one {'title', 'url'} record per search hit across all queries.
# The records are gathered in a plain list and turned into a DataFrame once,
# instead of growing a DataFrame with pd.concat on every iteration.
search_records = []

# One DDGS client serves every query; the hits are consumed inside the
# `with` block so the client is still open while they are read.
with DDGS() as ddgs:
    for query in queries:
        search_records.extend(
            {'title': r['title'], 'url': r['href']}
            for r in ddgs.text(query, max_results=10)
        )

# Explicit columns keep the 'title,url' header in the CSV even when the
# searches returned nothing, so the reader downstream still finds its keys.
query_results_df = pd.DataFrame(search_records, columns=['title', 'url'])

# Export the query results DataFrame to a CSV file
query_results_df.to_csv('query_results.csv', index=False)

# Module-level accumulators for the scraped pages: three independent, initially
# empty lists that save_content appends to in lockstep (one entry per page).
urls, texts, titles = [], [], []

# Helper that records one scraped page in the module-level result lists
async def save_content(title, url, text, max_words=2000):
    """Append one page's title, URL and truncated text to the result lists.

    ``text`` is split on whitespace and re-joined with single spaces, so runs
    of whitespace and newlines collapse; only the first ``max_words`` words
    are kept.

    Args:
        title: Title stored in the module-level ``titles`` list.
        url: URL stored in the module-level ``urls`` list.
        text: Raw page text to normalise and truncate.
        max_words: Maximum number of words kept. Defaults to 2000; ``None``
            keeps every word.
    """
    words = text.split()
    truncated_text = ' '.join(words[:max_words])
    # The three lists are appended together so index i always refers to the
    # same page in each of them.
    titles.append(title)
    urls.append(url)
    texts.append(truncated_text)

async def scrape_content(title, url, timeout=30):
    """Download ``url`` and record the text of its ``<p>`` and ``<span>`` tags.

    The stripped text of every matching element is joined with single spaces
    and passed to ``save_content`` together with ``title`` and ``url``. An
    empty string is recorded instead when the response status is not 200 or
    when the request fails (connection error, timeout, undecodable body), so
    that one bad page cannot abort the other scraping tasks.

    Args:
        title: Title associated with the page; forwarded to ``save_content``.
        url: Address of the page to download.
        timeout: Total number of seconds allowed for the whole request.
            Defaults to 30.
    """
    text = ''
    try:
        client_timeout = aiohttp.ClientTimeout(total=timeout)
        async with aiohttp.ClientSession(timeout=client_timeout) as session:
            async with session.get(url) as resp:
                if resp.status == 200:
                    body = await resp.text()
                    soup = BeautifulSoup(body, 'html.parser')
                    content = soup.find_all(['p', 'span'])
                    text = ' '.join(node.get_text().strip() for node in content)
    except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as exc:
        # Report the failure but still record the URL with empty text, the
        # same outcome the non-200 branch produces.
        print(f'Skipping {url}: {exc!r}')
        text = ''
    await save_content(title, url, text)

# Main asynchronous function
async def main():
    """Scrape every URL listed in ``query_results.csv`` and export the texts.

    One ``scrape_content`` task is started per CSV row (columns ``title`` and
    ``url``) and all tasks run concurrently. Once they have finished, the
    module-level ``titles``, ``urls`` and ``texts`` lists are written to
    ``scraped_data.csv`` with the columns ``Title``, ``URL`` and ``Text``.
    A task that raises is reported and skipped; it does not prevent the
    export of the pages that were scraped successfully.
    """
    # newline='' is what the csv module expects from its input file, and the
    # explicit utf-8 matches the default encoding pandas' to_csv writes with.
    with open('query_results.csv', newline='', encoding='utf-8') as file:
        rows = list(csv.DictReader(file))

    tasks = [
        asyncio.create_task(scrape_content(row['title'], row['url']))
        for row in rows
    ]

    # return_exceptions=True collects failures as results instead of letting
    # the first one propagate and skip the export below.
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for row, outcome in zip(rows, outcomes):
        if isinstance(outcome, BaseException):
            print(f"Failed to scrape {row['url']}: {outcome!r}")

    # Create DataFrame from scraped data and export to CSV
    result_df = pd.DataFrame({'Title': titles, 'URL': urls, 'Text': texts})
    result_df.to_csv('scraped_data.csv', index=False)

# Run the main asynchronous function
try:
    asyncio.get_running_loop()
except RuntimeError:
    # No event loop is running in this thread (plain script execution), so
    # asyncio.run() can create and own the loop.
    asyncio.run(main())
else:
    # A loop is already running in this thread, as it is inside a
    # Jupyter/Colab kernel, and asyncio.run() would raise RuntimeError.
    # Run main() on a fresh loop in a worker thread and block on the result,
    # so this statement returns only after main() has finished and any
    # exception it raised is re-raised here.
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as _pool:
        _pool.submit(lambda: asyncio.run(main())).result()
