## Web Scraping with Beautiful Soup

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

df = pd.read_excel (r'Gsearch_all.xlsx')
urls = df['links'].tolist()
df.head()

Unnamed: 0.1,Unnamed: 0,Id,NAME,WEBSITE,Keywords,keys,titles,links
0,0,1,FORTUM OYJ,www.fortum.com,covid virus korona pandemic pandemia koronavirus,www.fortum.com covid virus korona pandemic pan...,Coronavirus Updates | Fordham,https://www.fordham.edu/coronavirus
1,1,2,NOKIA OYJ,www.nokia.com,covid virus korona pandemic pandemia koronavirus,www.nokia.com covid virus korona pandemic pand...,Network traffic insights in the time of COVID-...,https://www.nokia.com/blog/network-traffic-ins...
2,2,3,AALLON GROUP OYJ,www.aallongroup.fi,covid virus korona pandemic pandemia koronavirus,www.aallongroup.fi covid virus korona pandemic...,Tämä vaikuttaa todella potentiaaliselta yrityk...,https://www.shareville.fi/osakkeet/203-web-gro...
3,3,4,NESTE OYJ,www.neste.com,covid virus korona pandemic pandemia koronavirus,www.neste.com covid virus korona pandemic pand...,"COVID Live Update: 188,089,101 Cases and 4,055...",https://www.worldometers.info/coronavirus/
4,4,5,KESKO OYJ,www.kesko.fi,covid virus korona pandemic pandemia koronavirus,www.kesko.fi covid virus korona pandemic pande...,COVID-19 pandemic in Finland - Wikipedia,https://en.wikipedia.org/wiki/COVID-19_pandemi...


In [2]:
itemlist = []


i = 1
for url in urls:
    print(f'{i} scraping {url}')
    i += 1
    
    try:
        source = requests.get(url, timeout=10.0).text
        soup = BeautifulSoup(source, 'lxml')
        
        page_text_list = [item.text for item in soup.find_all('p')]

        page_text = " ".join(page_text_list)

        itemlist.append(page_text)
        print('url scraped')
        
    except requests.Timeout:
        print('time error')
        itemlist.append(None)
    except requests.exceptions.SSLError:
        print('SSL error')
        itemlist.append(None)
    except requests.TooManyRedirects:
        print('Too many requests')
        itemlist.append(None)
    except Exception as e:
        print (f'An error was thrown: {e}')
        itemlist.append(None)
    


df.loc[: , 'text'] = itemlist


1 scraping https://www.fordham.edu/coronavirus
url scraped
2 scraping https://www.nokia.com/blog/network-traffic-insights-in-the-time-of-covid-19-june-4-update/
url scraped
3 scraping https://www.shareville.fi/osakkeet/203-web-group-ab/kommentit/tama-vaikuttaa-todella-potentiaaliselta-yritykselta-ja-arvos-90967292
url scraped
4 scraping https://www.worldometers.info/coronavirus/
url scraped
5 scraping https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Finland
url scraped
6 scraping https://www.nokia.com/blog/network-traffic-insights-in-the-time-of-covid-19-june-4-update/
url scraped
7 scraping https://www.kone.com/en/news-and-insights/stories/a-message-from-KONE-on-coronavirus.aspx
url scraped
8 scraping https://www.americasquarterly.org/article/the-pandemic-and-organized-crime-in-latin-america-ten-unknowns/
url scraped
9 scraping https://www.fitchratings.com/research/corporate-finance/coronavirus-impact-on-emea-packaging-driven-by-end-markets-22-05-2020
url scraped
10 scraping https:/

url scraped
65 scraping https://www.latimes.com/topic/covid-19-pandemic
url scraped
66 scraping https://daten-quadrat.de/index.php?lng=en&mod=3
url scraped
67 scraping https://www.toyota.com/toyota-covid-19-response/
url scraped
68 scraping https://link.springer.com/article/10.1007/s00414-021-02613-z
url scraped
69 scraping https://www.fitchratings.com/research/corporate-finance/coronavirus-impact-on-emea-packaging-driven-by-end-markets-22-05-2020
url scraped
70 scraping https://www.unicef.org/documents/technical-note-protection-children-coronavirus-disease-2019-covid-19-pandemic
url scraped
71 scraping https://www.khshp.fi/ajankohtaista-koronaviruksesta/
url scraped
72 scraping https://triblive.com/local/valley-news-dispatch/palmer-pharmacy-in-west-deer-wants-to-get-children-reading-more/?jwsource=cl
url scraped
73 scraping https://www.nokia.com/blog/network-traffic-insights-in-the-time-of-covid-19-june-4-update/
url scraped
74 scraping https://www.projektimaailma.fi/files/1030/Projek

url scraped
151 scraping https://www.dss.virginia.gov/benefit/pebt.cgi
url scraped
152 scraping https://www.usatoday.com/story/news/world/2020/03/18/coronavirus-did-president-trumps-decision-disband-global-pandemic-office-hinder-response/5064881002/
url scraped
153 scraping https://www.researchgate.net/publication/341184499_A_Commentary_on_Psychological_Factors_Affecting_Pro-Social_Behaviors_What_Can_We_Do_to_Increase_Compliance_with_the_Regulations_of_Physical_Distancing_During_the_COVID-19_Pandemic
url scraped
154 scraping http://www.caislas.name/blog/?paged=2
url scraped
155 scraping https://portal.ct.gov/Office-of-the-Governor/News/Press-Releases/2020/06-2020/Governor-Lamont-Announces-Plans-for-the-2020-21-School-Year-Amid-the-Ongoing-COVID19-Pandemic
url scraped
156 scraping https://en.wikipedia.org/wiki/Wikipedia:Database_reports/Deaths_from_the_COVID-19_pandemic
url scraped
157 scraping https://www.episcopalrelief.org/press-resources/press-releases/2021-press-releases-press-rele

url scraped
227 scraping https://www.sappi.com/sappi-donates-essential-items-and-products-for-covid-19-relief
url scraped
228 scraping https://www.marketresearch.com/Timetric-v3917/Salcomp-GMPG-Guigang-Adapters-Chargers-10955754/
url scraped
229 scraping https://www.who.int/health-topics/coronavirus
url scraped
230 scraping https://www.bloomberg.com/news/articles/2020-09-25/an-american-ceo-living-in-sweden-has-a-covid-lesson-to-share
url scraped
231 scraping https://www.energy.gov/covid/coronavirus-doe-response
url scraped
232 scraping https://kidshealth.org/en/parents/coronavirus-questions-answers.html
url scraped
233 scraping https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7245651/
url scraped
234 scraping https://www.croplife.com/iron/agco-powers-multi-million-investment-in-engine-manufacturing-progresses/
url scraped
235 scraping http://www.wbiw.com/2021/01/05/red-cross-teams-with-nfl-to-urge-donations-during-national-blood-donor-month/
url scraped
236 scraping https://www.hindustanti

url scraped
297 scraping https://www.halton.com/contact-us/covid-19-2/
url scraped
298 scraping http://www.rederiabeckero.ax/files/ekon_oversikt_30_6_2021_slutlig.pdf
url scraped
299 scraping https://www.esrb.europa.eu/pub/pdf/reports/esrb.reports210216_FSI_covid19~cf3d32ae66.en.pdf
url scraped
300 scraping https://www.peikko.com/news/gaining-competitive-advantage-during-the-pandemic-through-a-diversified-supply-chain-and-a-customer-focused-approach/
url scraped
301 scraping https://www.energiasuomi.fi/media/4922/focus_14-2020_fi.pdf
url scraped
302 scraping https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7245651/
url scraped
303 scraping https://www.nutraingredients.com/Article/2017/01/03/Raisio-chief-resigns-Time-for-new-adventures-after-36-quarters-of-continuous-growth
url scraped
304 scraping https://www.wfp.org/news/wfp-chief-warns-hunger-pandemic-covid-19-spreads-statement-un-security-council
url scraped
305 scraping https://www.agnicoeagle.com/English/sustainability/Our-COVID-19-Re

url scraped
372 scraping https://chromaflo.com/blog/
url scraped
373 scraping https://manometcurrent.com/latest-update-2021-global-agricultural-films-market-with-covid-19-impact-analysis-top-key-players-rani-plast-huadun-british-polythene-industries-bpi-polypak/
url scraped
374 scraping https://capebretonpartnership.com/covid-19-resources/webinar-series/
url scraped
375 scraping https://www.fiskarsgroup.com/investors/fiskars-investment/impact-covid-19-pandemic
url scraped
376 scraping https://www.ksbw.com/article/monterey-county-remains-solidly-in-the-purple/34419132
url scraped
377 scraping https://www.santafenewmexican.com/news/coronavirus/new-mexico-bans-nonessential-surgeries-governor-cites-strain-on-hospitals/article_bc6023dc-3b1a-11eb-8a99-6b9437e6299a.html
url scraped
378 scraping https://www.rfdtv.com/story/44002312/Brown-Sugar-Market
url scraped
379 scraping https://www.unilever.com/news/press-releases/2020/helping-to-protect-lives-and-livelihoods-from-the-covid-19-pandemic.ht

An error was thrown: 
442 scraping https://bromangroup.fi/files/BromanGroupVuosikertomus2020_EN-lowres.pdf
An error was thrown: 
443 scraping https://www.theguardian.com/sport/2020/mar/24/international-olympic-committee-hype-thomas-bach-tokyo-games
url scraped
444 scraping https://pubmed.ncbi.nlm.nih.gov/33138722/
url scraped
445 scraping https://www.cnn.com/world/live-news/coronavirus-pandemic-vaccine-updates-04-14-21/index.html
url scraped
446 scraping https://www.polar.com/blog/year-in-review-how-2020-has-changed-our-health/
url scraped
447 scraping https://home.kuehne-nagel.com/-/knowledge/updates-corona-virus-covid-19
url scraped
448 scraping https://www.avanttecno.com/download_file/view/3802/14321
An error was thrown: 
449 scraping https://www.southampton.ac.uk/publicpolicy/support-for-policymakers/policy-projects/perso.page
url scraped
450 scraping https://lancasteronline.com/business/local_business/owners-of-black-olive-family-diner-in-columbia-to-reopen-former-cloister-restaur

url scraped
520 scraping https://pubmed.ncbi.nlm.nih.gov/33041724/
url scraped
521 scraping https://www.pwc.com/us/en/industries/health-industries/library/hri-insight-consumer-health-behavior-and-covid-19-pandemic.html
url scraped
522 scraping https://weartv.com/news/coronavirus
url scraped
523 scraping https://en.wikipedia.org/wiki/COVID-19_pandemic_in_Finland
url scraped
524 scraping https://www.axpo.com/us/en/about-us/media-and-politics/media-releases.detail.html/media-releases/2020/strong-operating-performance---but-earnings-blunted-by-financial.html
url scraped
525 scraping https://dc602r66yb2n9.cloudfront.net/pub/web/attachments/publications/Nokian+Tyres+Financial+Review+2020.pdf
An error was thrown: 
526 scraping https://www.finnair.com/us-en/flight-information/travel-updates
url scraped
527 scraping https://www.hcltech.com/
url scraped
528 scraping https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7376362/
url scraped
529 scraping https://www.bp.com/
url scraped
530 scraping https:/

url scraped
595 scraping https://www.interregeurope.eu/primaas/news/news-article/10737/stakeholder-interview-engineer-karhula-kaisa/
url scraped
596 scraping https://kotkamills.com/world-cup-levi-is-kotkamills-newest-partner-in-responsible-sporting-events/
url scraped
597 scraping https://www.businesswire.com/news/home/20200814005039/en/Insights-on-the-Global-Wind-Turbine-Gearbox-Market-2020-2024-COVID-19-Analysis-Drivers-Restraints-Opportunities-and-Threats-Technavio
url scraped
598 scraping https://www.reuters.com/article/brief-bravida-holding-q1-ebita-rises-to/brief-bravida-holding-q1-ebita-rises-to-sek-272-million-idUSFWN2CP1TW
url scraped
599 scraping https://apnews.com/article/donald-trump-entertainment-coronavirus-pandemic-8f586d687ab332777a7a059457ff818e
url scraped
600 scraping https://www.phillipsmedisize.com/blog/wearable-injectors-to-support-connected-health/
url scraped
601 scraping https://fi.linkedin.com/in/kari-s-lankinen-0b000116
url scraped
602 scraping http://cityofm

url scraped
676 scraping https://www.brown-forman.com/brown-forman-corporation-today-announced-a-donation-of-1-million-to-covid-19-response-funds/
url scraped
677 scraping https://corporate.thermofisher.com/us/en/index/newsroom/covid-19-resource-hub.html
url scraped
678 scraping https://www.solita.fi/en/solita-to-implement-a-mobile-application-for-covid-19-contact-tracing/
url scraped
679 scraping https://www.usatoday.com/story/news/politics/2020/04/28/trump-defends-diamond-and-silk-despite-coronavirus-conspiracies/3039249001/
url scraped
680 scraping https://www.solita.fi/en/solita-to-implement-a-mobile-application-for-covid-19-contact-tracing/
url scraped
681 scraping https://vermontbiz.com/news/2021/march/06/usda-adds-35-million-value-added-producer-grant-program-support-covid-19-relief
url scraped
682 scraping https://www.capgemini.com/fi-en/research/covid-19-and-the-financial-services-consumer/
url scraped
683 scraping https://www.erillisverkot.fi/en/coronavirus-highlights-the-nee

url scraped
752 scraping https://www.workdesign.com/2021/04/take-a-tour-of-reaktor-amsterdams-three-story-canalside-office/
url scraped
753 scraping https://www.onemedical.com/coronavirus/
url scraped
754 scraping https://www.iai.it/en/pubblicazioni/covid-19-palestine-pandemic-face-settler-colonial-erasure
url scraped
755 scraping https://www.cdc.gov/library/covid19/01292021_covidupdate.html
url scraped
756 scraping https://pubmed.ncbi.nlm.nih.gov/32242340/
url scraped
757 scraping https://ysaatio.fi/en/h1-2020
url scraped
758 scraping https://www.globenewswire.com/en/news-release/2020/10/29/2116530/0/en/Teleste-1-9-2020-Net-sales-and-result-decreased-due-to-the-covid-19-pandemic-and-the-technological-transformation-of-access-networks-progress-made-with-the-Cableway-divestment.html
url scraped
759 scraping https://www.reuters.com/article/us-skf-results/swedens-skf-beats-forecasts-demand-outlook-uncertain-on-pandemic-idUSKCN2250GG
url scraped
760 scraping https://www.manpowergroup.com/m

url scraped
827 scraping https://newsroom.medtronic.com/news-releases/news-release-details/medtronic-provides-update-covid-19-pandemic-response-and-impact
url scraped
828 scraping https://www.siili.com/stories/quo-vadis-digital-director-where-is-your-modernization-project-going
url scraped
829 scraping https://www.trustpilot.com/review/www.sembo.se
url scraped
830 scraping https://www.elomatic.com/fi/assets/files/publications/2020/top-engineer/Top-Engineer-2020-1.pdf
An error was thrown: 
831 scraping https://govstatus.egov.com/kycovid19
url scraped
832 scraping https://www.kampcollectionhotels.com/en/news
url scraped
833 scraping https://www.kirjavalitys.fi/en/kirjavalitys-has-published-new-webpages-2/
url scraped
834 scraping https://www.health.pa.gov/topics/disease/coronavirus/Pages/Coronavirus.aspx
url scraped
835 scraping https://www.oecd.org/coronavirus/policy-responses/beyond-containment-health-systems-responses-to-covid-19-in-the-oecd-6ab740c0/
url scraped
836 scraping https://

In [3]:
#df.to_pickle('scraped_text.pkl')

display(df)

Unnamed: 0.1,Unnamed: 0,Id,NAME,WEBSITE,Keywords,keys,titles,links,text
0,0,1,FORTUM OYJ,www.fortum.com,covid virus korona pandemic pandemia koronavirus,www.fortum.com covid virus korona pandemic pan...,Coronavirus Updates | Fordham,https://www.fordham.edu/coronavirus,View All Fordham Community Guidelines We track...
1,1,2,NOKIA OYJ,www.nokia.com,covid virus korona pandemic pandemia koronavirus,www.nokia.com covid virus korona pandemic pand...,Network traffic insights in the time of COVID-...,https://www.nokia.com/blog/network-traffic-ins...,Societies across the world are preparing to re...
2,2,3,AALLON GROUP OYJ,www.aallongroup.fi,covid virus korona pandemic pandemia koronavirus,www.aallongroup.fi covid virus korona pandemic...,Tämä vaikuttaa todella potentiaaliselta yrityk...,https://www.shareville.fi/osakkeet/203-web-gro...,
3,3,4,NESTE OYJ,www.neste.com,covid virus korona pandemic pandemia koronavirus,www.neste.com covid virus korona pandemic pand...,"COVID Live Update: 188,089,101 Cases and 4,055...",https://www.worldometers.info/coronavirus/,The coronavirus COVID-19 is affecting\n220 c...
4,4,5,KESKO OYJ,www.kesko.fi,covid virus korona pandemic pandemia koronavirus,www.kesko.fi covid virus korona pandemic pande...,COVID-19 pandemic in Finland - Wikipedia,https://en.wikipedia.org/wiki/COVID-19_pandemi...,\n The COVID-19 pandemic in Finland is part of...
...,...,...,...,...,...,...,...,...,...
867,995,1081,THE SWITCH DRIVE SYSTEMS OY,www.theswitch.com,covid virus korona pandemic pandemia koronavirus,theswitch.com covid korona virus pandemi,The Switch - Advancing the world with electric...,https://theswitch.com/,
868,996,1082,SUOMEN 3M OY,www.3m.fi,covid virus korona pandemic pandemia koronavirus,3m.fi covid korona virus pandemi,COVID-19 (Novel Coronavirus) | How 3M is Respo...,https://www.3m.com/3M/en_US/company-us/coronav...,
869,997,1083,FCG FINNISH CONSULTING GROUP OY,www.fcg.fi,covid virus korona pandemic pandemia koronavirus,fcg.fi covid korona virus pandemi,COVID-19 highlighted the importance of flexibi...,https://www.fcg.fi/en/blog/covid-19-highlighte...,"In February this year, I was in Hargeisa. I ..."
870,998,1084,PANKABOARD OYJ,www.pankaboard.fi,covid virus korona pandemic pandemia koronavirus,pankaboard.fi covid korona virus pandemi,More news,https://www.valmet.com/media/news/,


In [4]:
short_df = df.drop(columns=['Unnamed: 0','NAME', 'WEBSITE', 'Keywords', 'keys', 'titles', 'links'])
short_df.head()

Unnamed: 0,Id,text
0,1,View All Fordham Community Guidelines We track...
1,2,Societies across the world are preparing to re...
2,3,
3,4,The coronavirus COVID-19 is affecting\n220 c...
4,5,\n The COVID-19 pandemic in Finland is part of...


In [5]:
short_df['text'] = short_df['text'].str[:5000]

In [8]:
short_df

Unnamed: 0,Id,text
0,1,View All Fordham Community Guidelines We track...
1,2,Societies across the world are preparing to re...
2,3,
3,4,The coronavirus COVID-19 is affecting\n220 c...
4,5,\n The COVID-19 pandemic in Finland is part of...
...,...,...
867,1081,
868,1082,
869,1083,"In February this year, I was in Hargeisa. I ..."
870,1084,


In [9]:
import pyarrow.feather as feather

feather.write_feather(short_df, 'new_batch.ft')

In [9]:
#import feather
#short_df.to_feather('scraped_text.ft')

#store = pd.HDFStore('scrapedtext.h5')
#store['shor_df'] = short_df  # save it
#store['df']  # load it

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['text'], dtype='object')]

  exec(code_obj, self.user_global_ns, self.user_ns)


MemoryError: 