In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first "wikitable" on Wikipedia's list of the largest US companies
# by revenue and save it, unmodified, as a CSV file.
url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"

# Bound the wait and surface HTTP errors so an error page is never parsed as
# if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", {"class": "wikitable"})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")

table_rows = table.find_all("tr")
if not table_rows:
    raise ValueError(f"The 'wikitable' at {url} contains no rows")

# Column names come from the header row only; <th> cells used as row labels
# further down the table must not be counted as extra columns.
headers = [header.text.strip() for header in table_rows[0].find_all("th")]

rows = [
    [cell.text.strip() for cell in row.find_all(["td", "th"])]
    for row in table_rows[1:]  # Skip the header row
]

df = pd.DataFrame(rows, columns=headers)
df.to_csv("largest_companies_by_revenue.csv", index=False)

In [21]:
# Display the scraped company table
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


In [22]:
import random
# Attach a placeholder phone number to every row. A dedicated, seeded generator
# makes the column identical on every run of this cell, so the exported data
# is reproducible.
# NOTE(review): the format is three 3-digit groups (9 digits), not the usual
# 10-digit 3-3-4 shape — confirm this is intended before changing it.
phone_rng = random.Random(42)
df['phone'] = [
    f"{phone_rng.randint(100, 999)}-{phone_rng.randint(100, 999)}-{phone_rng.randint(100, 999)}"
    for _ in range(len(df))
]
# Display the updated DataFrame
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,phone
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas",828-923-366
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington",106-344-678
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California",232-947-683
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota",966-464-602
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska",359-251-847
...,...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York",310-252-459
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota",785-870-927
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York",724-971-443
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan",507-240-520


In [23]:
# Build a placeholder address "admin@<company>.com" from each company name.
# The domain part is lower-cased, accents are folded to ASCII, and every
# character other than a-z, 0-9 and '-' is removed, so names containing
# '&', apostrophes, periods or spaces still yield a syntactically valid host.
email_domain = (
    df['Name']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', 'ignore')
    .str.decode('ascii')
    .str.replace(r'[^a-z0-9-]', '', regex=True)
)
df['email'] = 'admin@' + email_domain + '.com'


In [24]:
# Relabel the company name and revenue columns; all other columns keep
# their scraped labels.
sponsor_column_names = dict([
    ('Name', 'name'),
    ('Revenue (USD millions)', 'amount_donated'),
])
df = df.rename(columns=sponsor_column_names)

In [25]:
# Remove thousands separators and convert the amounts to integers.
# Casting to str first keeps this cell re-runnable: once the column is already
# integer-typed the .str accessor would otherwise raise AttributeError.
df['amount_donated'] = (
    df['amount_donated']
    .astype(str)
    .str.replace(',', '', regex=False)  # literal comma, not a regex pattern
    .astype(int)
)


In [26]:
# Display the frame after the column rename and integer conversion
df

Unnamed: 0,Rank,name,Industry,amount_donated,Revenue growth,Employees,Headquarters,phone,email
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas",828-923-366,admin@walmart.com
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington",106-344-678,admin@amazon.com
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California",232-947-683,admin@apple.com
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota",966-464-602,admin@unitedhealthgroup.com
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska",359-251-847,admin@berkshirehathaway.com
...,...,...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York",310-252-459,admin@tiaa.com
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota",785-870-927,admin@chs.com
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York",724-971-443,admin@bristol-myerssquibb.com
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan",507-240-520,admin@dowchemicalcompany.com


In [27]:
# Remove the listed descriptive columns; every other column is kept.
unused_columns = ['Industry', 'Rank', 'Employees', 'Headquarters', 'Revenue growth']
df = df.drop(labels=unused_columns, axis='columns')

In [28]:
# Display the columns that remain after the drop
df

Unnamed: 0,name,amount_donated,phone,email
0,Walmart,648125,828-923-366,admin@walmart.com
1,Amazon,574785,106-344-678,admin@amazon.com
2,Apple,383482,232-947-683,admin@apple.com
3,UnitedHealth Group,371622,966-464-602,admin@unitedhealthgroup.com
4,Berkshire Hathaway,364482,359-251-847,admin@berkshirehathaway.com
...,...,...,...,...
95,TIAA,45735,310-252-459,admin@tiaa.com
96,CHS,45590,785-870-927,admin@chs.com
97,Bristol-Myers Squibb,45006,724-971-443,admin@bristol-myerssquibb.com
98,Dow Chemical Company,44622,507-240-520,admin@dowchemicalcompany.com


In [29]:
# Write the frame to sponsors.csv, leaving out the row index.
df.to_csv(path_or_buf="sponsors.csv", index=False)

In [30]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Scrape the first "wikitable" on Wikipedia's list of Nobel laureates and
# reshape it into one record per (year, prize category) for 2000 onwards,
# keeping only the first laureate listed in each cell.
url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates'

# Bound the wait and surface HTTP errors so an error page is never parsed as
# if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})
if table is None:
    raise ValueError(f"No table with class 'wikitable' found at {url}")


def _canonical_prize_column(label):
    """Return the short column name for a scraped header label.

    Everything from the first '[' onwards (footnote markers such as
    '[13][a]') is discarded, so the result does not depend on the page's
    current footnote numbering. Labels mentioning economics or medicine are
    mapped to 'Economics' and 'Medicine'; any other label is returned as-is
    after the footnote markers are removed.
    """
    label = label.split('[')[0].strip()
    if 'Econom' in label:
        return 'Economics'
    # The medicine header is scraped as 'Physiologyor Medicine' (its parts
    # are joined without a space), so match on the stable word only.
    if 'Medicine' in label:
        return 'Medicine'
    return label


headers = ['Year']
for th in table.find_all('th')[1:7]:
    headers.append(_canonical_prize_column(th.get_text(strip=True)))

rows = []
for row in table.find_all('tr')[1:]:
    cols = row.find_all('td')[:6]
    year_cell = row.find('th')
    # Rows without data cells (e.g. repeated header rows) or without a year
    # cell cannot be turned into a record.
    if not cols or year_cell is None:
        continue
    year = year_cell.get_text(strip=True)
    rows.append([year] + [col.get_text(strip=True) for col in cols])

df = pd.DataFrame(rows, columns=headers)

df['Year'] = df['Year'].astype(int)
# No columns are assigned on the filtered frame afterwards, which avoids
# pandas' SettingWithCopyWarning.
df = df[df['Year'] >= 2000]

departments = ['Physics', 'Chemistry', 'Literature', 'Peace', 'Economics', 'Medicine']

# One record per year and department, in year-major order. Laureates within a
# cell are separated by ';' — only the first one is kept.
records = [
    {
        'Name': row[department].split(';')[0].strip(),
        'Year': row['Year'],
        'Department': department,
    }
    for _, row in df.iterrows()
    for department in departments
]

df = pd.DataFrame(records)


In [31]:
# Display the reshaped laureate records
df

Unnamed: 0,Name,Year,Department
0,Jack Kilby,2000,Physics
1,Alan J. Heeger,2000,Chemistry
2,Gao Xingjian,2000,Literature
3,Kim Dae-jung,2000,Peace
4,James Heckman,2000,Economics
...,...,...,...
145,David Baker,2024,Chemistry
146,Han Kang,2024,Literature
147,Nihon Hidankyo,2024,Peace
148,Daron Acemoglu,2024,Economics


In [32]:
import random

# Derive placeholder contact details and name parts for every record.

# Build "info@<name>.com". The domain part is lower-cased, accents are folded
# to ASCII, and every character other than a-z, 0-9 and '-' is removed, so
# names with apostrophes, periods or accented letters still yield a
# syntactically valid host.
email_domain = (
    df['Name']
    .str.lower()
    .str.normalize('NFKD')
    .str.encode('ascii', 'ignore')
    .str.decode('ascii')
    .str.replace(r'[^a-z0-9-]', '', regex=True)
)
df['email'] = 'info@' + email_domain + '.com'

# A dedicated, seeded generator makes the phone column identical on every run
# of this cell, so the exported data is reproducible.
# NOTE(review): the format is three 3-digit groups (9 digits), not the usual
# 10-digit 3-3-4 shape — confirm this is intended before changing it.
phone_rng = random.Random(42)
df['phone'] = [
    f"{phone_rng.randint(100, 999)}-{phone_rng.randint(100, 999)}-{phone_rng.randint(100, 999)}"
    for _ in range(len(df))
]

# First and last whitespace-separated token of the name. The vectorised
# accessor yields a missing value for an empty name instead of raising
# IndexError.
name_parts = df['Name'].str.split()
df['first_name'] = name_parts.str[0]
df['last_name'] = name_parts.str[-1]


In [33]:
# Lower-case the three capitalised column labels; the other columns are
# left unchanged.
researcher_column_names = dict([
    ('Name', 'name'),
    ('Year', 'year'),
    ('Department', 'department'),
])
df = df.rename(columns=researcher_column_names)

In [34]:
# Write the frame to researcher.csv, leaving out the row index.
df.to_csv(path_or_buf='researcher.csv', index=False)