<a href="https://colab.research.google.com/github/yakubszatkowski/100_days_python/blob/master/pulls/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This project scrapes data from every page of the [Payscale](https://www.payscale.com/college-salary-report/majors-that-pay-you-back/bachelors) report using BeautifulSoup, builds a DataFrame from it, and picks the majors that are:
* Safest options (highest starting pay)
* Highest potential options (highest mid career pay)
* Lowest risk options (smallest difference between starting and mid career pay, among the 100 majors with the highest early career pay)

In [None]:
# importing libraries
from bs4 import BeautifulSoup
import pandas as pd
import requests

In [None]:
# Download every page of the report and parse the combined HTML.
# The page count is hard-coded; update TOTAL_PAGES if the report grows or shrinks.
TOTAL_PAGES = 34
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout a stalled connection would hang the kernel

page_sources = []
# A session reuses the underlying connection across the sequential requests.
with requests.Session() as session:
  for num in range(1, TOTAL_PAGES + 1):
    URL = f'https://www.payscale.com/college-salary-report/majors-that-pay-you-back/bachelors/page/{num}'
    response = session.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    # Fail loudly on 4xx/5xx instead of silently mixing an error page into the data.
    response.raise_for_status()
    page_sources.append(response.text)

# Later cells expect one soup holding the rows of all pages, so the page
# sources are joined and parsed as a single document.
website_content = ''.join(page_sources)

soup = BeautifulSoup(website_content, 'html.parser')

In [None]:
# Extract the major title, early career pay and mid career pay from every table
# row, then bundle the three parallel lists into a dictionary.
def parse_dollars(cell):
  """Return the integer amount in a table cell, ignoring '$' and ',' (e.g. '$93,200' -> 93200)."""
  return int(cell.getText().replace('$', '').replace(',', ''))


table_rows = soup.select(selector='.data-table .data-table__row')

list_major_titles = []
list_early_career_pay = []
list_mid_career_pay = []

for table_row in table_rows:
  table_row_data = table_row.select(selector='.data-table__value')
  # Positions 1, 3 and 4 of the row's value cells are the ones this notebook uses
  # as title, early career pay and mid career pay.
  list_major_titles.append(table_row_data[1].getText())
  list_early_career_pay.append(parse_dollars(table_row_data[3]))
  list_mid_career_pay.append(parse_dollars(table_row_data[4]))


data_dict = {
    'Major title': list_major_titles,
    'Early career pay': list_early_career_pay,
    'Mid career pay': list_mid_career_pay,
}

# Preview only the first few entries of each column; the full dictionary holds
# one entry per scraped row and would flood the cell output.
{column: values[:5] for column, values in data_dict.items()}

In [None]:
# Build the DataFrame: each key of data_dict becomes a column, each list its values.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,Major title,Early career pay,Mid career pay
0,Petroleum Engineering,93200,187300
1,Operations Research & Industrial Engineering,84800,170400
2,Electrical Engineering & Computer Science (EECS),108500,159300
3,Interaction Design,68300,155800
4,Public Accounting,59800,147700
...,...,...,...
822,Outdoor Education,37400,46300
823,Early Childhood Education,36100,45400
824,Mental Health,36900,45000
825,Medical Assisting,36000,44800


# Safest options (highest starting pay)

In [None]:
# Order all majors from highest to lowest early career pay and preview the top five.
safest_options_careers = df.sort_values(by='Early career pay', ascending=False)
safest_options_careers.head(5)

Unnamed: 0,Major title,Early career pay,Mid career pay
2,Electrical Engineering & Computer Science (EECS),108500,159300
75,Physician Assistant Studies,95900,118500
0,Petroleum Engineering,93200,187300
1,Operations Research & Industrial Engineering,84800,170400
158,Nuclear Engineering Technology (NET),83500,106000


# Highest potential options (highest mid career pay)

In [None]:
# Order all majors from highest to lowest mid career pay and preview the top five.
highest_potential_careers = df.sort_values(by='Mid career pay', ascending=False)
highest_potential_careers.head(5)

Unnamed: 0,Major title,Early career pay,Mid career pay
0,Petroleum Engineering,93200,187300
1,Operations Research & Industrial Engineering,84800,170400
2,Electrical Engineering & Computer Science (EECS),108500,159300
3,Interaction Design,68300,155800
4,Public Accounting,59800,147700


# Lowest risk option (smallest difference between starting and mid career pay)

In [None]:
# Take the 100 majors with the highest mid career pay, compute the gap between
# mid and early career pay for each, and rank them by that gap (smallest first).
# NOTE(review): the notebook introduction describes the pool as the 100 majors
# with the highest *early* career pay, while this cell starts from the
# mid-career ranking — confirm which one is intended.

# .copy() makes an independent frame, so adding a column below never touches a
# slice of highest_potential_careers.
lowest_risk_careers = highest_potential_careers.iloc[:100].copy()
spread_col = lowest_risk_careers['Mid career pay'] - lowest_risk_careers['Early career pay']
lowest_risk_careers.insert(1, 'Spread', spread_col)

# Without this sort the frame stays in mid-career-pay order and does not show
# the smallest spreads. mergesort is stable, so ties keep their mid-career order.
lowest_risk_careers = lowest_risk_careers.sort_values('Spread', kind='mergesort')
lowest_risk_careers.head()

Unnamed: 0,Major title,Spread,Early career pay,Mid career pay
0,Petroleum Engineering,94100,93200,187300
1,Operations Research & Industrial Engineering,85600,84800,170400
2,Electrical Engineering & Computer Science (EECS),50800,108500,159300
3,Interaction Design,87500,68300,155800
4,Public Accounting,87900,59800,147700
...,...,...,...,...
95,Asian Studies,67400,47700,115100
96,Structural Engineering (SE),47300,67800,115100
97,Industrial Distribution,51400,63600,115000
98,Management Information Systems (MIS),49100,65000,114100
