In [66]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"

response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

table = soup.find("table", {"class": "wikitable"})

headers = [header.text.strip() for header in table.find_all("th")]

rows = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cells = [cell.text.strip() for cell in row.find_all(["td", "th"])]
    rows.append(cells)

df = pd.DataFrame(rows, columns=headers)
df.to_csv("largest_companies_by_revenue.csv", index=False)

In [67]:
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas"
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington"
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California"
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota"
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska"
...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York"
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota"
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York"
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan"


In [68]:
from faker import Faker

# Initialize the Faker instance
fake = Faker()
# Generate a list of random phone numbers
df['phone'] = [fake.numerify(text='#########') for _ in range(len(df))]  # 9 digits]
# Display the updated DataFrame
df

Unnamed: 0,Rank,Name,Industry,Revenue (USD millions),Revenue growth,Employees,Headquarters,phone
0,1,Walmart,Retail,648125,6.0%,2100000,"Bentonville, Arkansas",742227453
1,2,Amazon,Retail and cloud computing,574785,11.9%,1525000,"Seattle, Washington",090897093
2,3,Apple,Electronics industry,383482,-2.8%,161000,"Cupertino, California",082940234
3,4,UnitedHealth Group,Healthcare,371622,14.6%,440000,"Minnetonka, Minnesota",423402643
4,5,Berkshire Hathaway,Conglomerate,364482,20.7%,396500,"Omaha, Nebraska",238507942
...,...,...,...,...,...,...,...,...
95,96,TIAA,Financials,45735,11.8%,16023,"New York City, New York",037829257
96,97,CHS,Agriculture cooperative,45590,-4.6%,10609,"Inver Grove Heights, Minnesota",196706692
97,98,Bristol-Myers Squibb,Pharmaceutical industry,45006,-2.5%,34100,"New York City, New York",996301454
98,99,Dow Chemical Company,Chemical industry,44622,-21.6%,35900,"Midland, Michigan",682620941


In [69]:
df['email'] = ['admin@' + name.replace(' ', '').lower() + '.com' for name in df['Name']]


In [70]:
df = df.rename(columns={
    'Name': 'name',
    'Revenue (USD millions)' : 'amount_donated'
})

In [71]:
df['amount_donated'] = df['amount_donated'].str.replace(',', '').astype(int)


In [74]:
df

Unnamed: 0,name,amount_donated,phone,email
0,Walmart,648125,742227453,admin@walmart.com
1,Amazon,574785,090897093,admin@amazon.com
2,Apple,383482,082940234,admin@apple.com
3,UnitedHealth Group,371622,423402643,admin@unitedhealthgroup.com
4,Berkshire Hathaway,364482,238507942,admin@berkshirehathaway.com
...,...,...,...,...
95,TIAA,45735,037829257,admin@tiaa.com
96,CHS,45590,196706692,admin@chs.com
97,Bristol-Myers Squibb,45006,996301454,admin@bristol-myerssquibb.com
98,Dow Chemical Company,44622,682620941,admin@dowchemicalcompany.com


In [73]:
df = df.drop(columns=['Industry', 'Rank','Employees', 'Headquarters', 'Revenue growth'])

In [75]:
df.to_csv("sponsors.csv", index=False)

In [129]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = 'https://en.wikipedia.org/wiki/List_of_Nobel_laureates'  
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find('table', {'class': 'wikitable'})

headers = ['Year']
for th in table.find_all('th')[1:7]:
    headers.append(th.get_text(strip=True))

rows = []
for row in table.find_all('tr')[1:]:
    cols = row.find_all('td')[:6] 
    if cols:
        year = row.find('th').get_text(strip=True)  
        cols_text = [year] + [col.get_text(strip=True) for col in cols]  
        rows.append(cols_text)

df = pd.DataFrame(rows, columns=headers)

df['Year'] = df['Year'].astype(int)
df = df[df['Year'] >= 2000]
df['Economics'] = df['Prize in Economic Sciences[13][a]']
df['Medicine'] = df["Physiologyor Medicine"]
df = df.drop(columns=['Prize in Economic Sciences[13][a]',"Physiologyor Medicine"])

records = []

for index, row in df.iterrows():
    for department in ['Physics', 'Chemistry', 'Literature', 'Peace', 'Economics', 'Medicine']:
        first_name = row[department].split(';')[0].strip()
        records.append({'Name': first_name, 'Year': row['Year'], 'Department': department})

df = pd.DataFrame(records)
df.to_csv('researcher.csv', index=False)

In [137]:
df

Unnamed: 0,Name,Year,Department,email,phone,first_name,last_name
0,Jack Kilby,2000,Physics,info@jackkilby.com,(783) 029-315,Jack,Kilby
1,Alan J. Heeger,2000,Chemistry,info@alanj.heeger.com,(819) 305-012,Alan,Heeger
2,Gao Xingjian,2000,Literature,info@gaoxingjian.com,(270) 505-909,Gao,Xingjian
3,Kim Dae-jung,2000,Peace,info@kimdae-jung.com,(102) 843-331,Kim,Dae-jung
4,James Heckman,2000,Economics,info@jamesheckman.com,(825) 791-540,James,Heckman
...,...,...,...,...,...,...,...
145,David Baker,2024,Chemistry,info@davidbaker.com,(287) 927-361,David,Baker
146,Han Kang,2024,Literature,info@hankang.com,(840) 577-652,Han,Kang
147,Nihon Hidankyo,2024,Peace,info@nihonhidankyo.com,(416) 499-089,Nihon,Hidankyo
148,Daron Acemoglu,2024,Economics,info@daronacemoglu.com,(430) 640-662,Daron,Acemoglu


In [136]:
df['email'] = ['info@' + name.replace(' ', '').lower() + '.com' for name in df['Name']]
df['phone'] = [fake.numerify(text='#########') for _ in range(len(df))]  # 9 digits]
df['phone'] = df['phone'].apply(lambda x: f"({x[:3]}) {x[3:6]}-{x[6:]}")
df['first_name'] = df['Name'].apply(lambda x: x.split()[0])
df['last_name'] = df['Name'].apply(lambda x: x.split()[-1])
