In [1]:
import praw
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from os import getenv

try:
    from dotenv import load_dotenv
    load_dotenv(".env", encoding='utf8')
    print("Environment variables loaded!")
except Exception as e:
    print(e)

Environment variables loaded!


# Data Extraction

In [2]:
reddit = praw.Reddit(client_id=getenv("CLIENT_ID"), client_secret=getenv("CLIENT_SECRET"), user_agent="praw_scraper_1.0")

In [3]:
subreddit = reddit.subreddit("PTOrdenado")

In [4]:
df = pd.DataFrame(columns=["Title", "Age", "Sex", "Experience", "Education", "Labor Hours", "Total Salary", "Salary"])

In [5]:
regex_roles = re.compile(r"data|dados|machine learning|\sai^\w|\sia^\w|inteligência artificial|artificial intelligence")
regex_age_str = re.compile(r"Idade:.*")
regex_sex = re.compile(r"\([MF]\)|\s[FM]")
regex_xp_str = re.compile(r"Experiência profissional :.*")
regex_education = re.compile(r"Formação académica:.*")
regex_hours = re.compile(r"Horas de trabalho:.*")
regex_total = re.compile(r"Salário bruto.*")
regex_salary = re.compile(r"Salário líquido.*")


for i, sub in enumerate(subreddit.new(limit=None)):  
    if sub.link_flair_text == "IT / Programação" and regex_roles.search(sub.title.lower()):
        # Age and Sex
        age_str = regex_age_str.search(sub.selftext)
        if age_str:
            age = re.search(r"\d+", age_str.group(0))
            if age:
                age = int(age.group(0))
            else:
                age = np.nan
        
            # Sex
            sex = np.nan
            sex_str =  regex_sex.search(age_str.group(0))
            if sex_str:
                sex = "M" if "M" in sex_str.group(0) else "F"
        else:
            age = np.nan
            sex = np.nan
        
        # years of experience
        xp_str = regex_xp_str.search(sub.selftext)

        if xp_str:
            xp = re.search("\d+", xp_str.group(0))
            if xp:
                xp = float(xp.group(0))
        else:
            xp = np.nan
            
        # Education: Bachelors or Masters degree
        edu = regex_education.search(sub.selftext)
        mat = "No"
        if edu:
            tmp_edu = edu.group(0).lower()
            if "licenciatura" in tmp_edu:
                edu = "BsC"
            elif "mestrado" in tmp_edu:
                edu = "MsC"
            else:
                edu = np.nan

            # If it is a degree in math and/or statistics
            print(tmp_edu)
            if edu != np.nan and ("matemática" in tmp_edu or "estatística" in tmp_edu):
                mat = "Yes"
        else:
            edu = np.nan
        
        # Labor hours
        h = regex_hours.search(sub.selftext)
        hours = np.nan
        if h:
            hours = re.search(r"\d+", h.group(0))
            if hours:
                hours = float(hours.group(0))
            else:
                hours = np.nan
        
        # salary and total salary per month
        total = regex_total.search(sub.selftext)
        if total:
            total_sal = re.search("\d+", total.group(0))
            if total_sal:
                total_sal = float(total_sal.group(0))
            else:
                total_sal = np.nan
        else:
            total_sal = np.nan
        
        sal = regex_salary.search(sub.selftext)
        salary = np.nan
        if sal:
            salary = re.search("\d+", sal.group(0))
            if salary:
                salary = float(salary.group(0))
            else:
                salary = np.nan
        
        df_tmp = pd.DataFrame({
            "Title": [sub.title,],
            "Age": [age,],
            "Sex": [sex,],
            "Labor Hours": [hours,], 
            "Education": [edu,], 
            "Mathematics": [mat,],
            "Experience": [xp,], 
            "Total Salary": [total_sal,], 
            "Salary": [salary,]
        })
        df = pd.concat([df, df_tmp], ignore_index=True)

  rmação académica: mestrado em engenharia informática
formação académica: mestrado engenharia informática 


  df = pd.concat([df, df_tmp], ignore_index=True)


formação académica: **mestrado em data analytics**
formação académica: mestrado em engenharia informática
formação académica: licenciatura matemática (a fazer mestrado ciência computadores) 
formação académica: mestrado gestão empresas
formação académica: **mestrado em engenharia informática**
  rmação académica: curso nível 5 em redes e sistemas informáticos
formação académica: **licenciatura + pós graduação (tirada enquanto trabalhava)**  
formação académica: **mestrado em engenharia informática**
  rmação académica: **mestrado em engenharia** (com alguma exposição a software, ai e hardware)
formação académica: mestrado em engenharia electrotécnica
formação académica: 12º ano
formação académica: licentiatura em matemática aplicada, mestrado em data science  
formação académica: mestrado integrado em matemática aplicada 
formação académica: mestrado em engenharia informática
formação académica:** mestrado em engenharia informática | pós graduação em dados para o negócio
formação acadé

# Data analysis

**Title**: Name of the job

**Age**: Age of the person

**Sex**: Sex of the person

**Experience**: Years of experience

**Education**: If has Bachelors or Masters degree

**Labor Hours**: Hours of work per week

**Total Salary**: Salary without taxes, in euros

**Salary**: Salary with taxes applied, in euros

**Mathematics**: If the BsC or MsC is in mathematics or not (it's interesting for me)

In [6]:
df

Unnamed: 0,Title,Age,Sex,Experience,Education,Labor Hours,Total Salary,Salary,Mathematics
0,Estagiário Análise de Dados,23.0,M,1.0,MsC,40.0,1150.0,1050.0,No
1,Engenheiro de machine learning,24.0,M,1.0,MsC,40.0,2050.0,1611.0,No
2,Data Analyst,37.0,F,4.0,MsC,40.0,2150.0,1650.0,No
3,Analista de Dados,26.0,M,3.0,MsC,40.0,2600.0,1800.0,No
4,Data Scientist Luxemburgo,24.0,M,3.0,BsC,40.0,4460.0,3500.0,Yes
5,Data Analyst,28.0,F,2.0,MsC,40.0,1620.0,1220.0,No
6,Data Engineer,,,,,,3150.0,2000.0,No
7,Engenheiro de Dados,26.0,M,3.0,MsC,40.0,2000.0,1523.0,No
8,Data Center Engineer,25.0,M,2.0,,40.0,2160.0,1550.0,No
9,Data Engineer,26.0,,4.0,BsC,40.0,5667.0,4233.0,No


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         19 non-null     object 
 1   Age           17 non-null     object 
 2   Sex           14 non-null     object 
 3   Experience    16 non-null     float64
 4   Education     16 non-null     object 
 5   Labor Hours   18 non-null     float64
 6   Total Salary  19 non-null     float64
 7   Salary        19 non-null     float64
 8   Mathematics   19 non-null     object 
dtypes: float64(4), object(5)
memory usage: 1.5+ KB


### Notes
Sometimes salaries are not in decimal representation but in "k" representation, e.g **2k**
Since salaries doesn't pass tens of thousands at maximum, we can verify if salary's remainder by 100 is equal to salary, if yes multiply by 1000, because it didn't pass 100 and anything below 100 can't be a salary represented in decimal.

In [8]:
df["Salary"] = df["Salary"].apply(lambda x: x * 1000 if x % 100 == x else x)
df["Total Salary"] = df["Total Salary"].apply(lambda x: x * 1000 if x % 100 == x else x)

In [9]:
df

Unnamed: 0,Title,Age,Sex,Experience,Education,Labor Hours,Total Salary,Salary,Mathematics
0,Estagiário Análise de Dados,23.0,M,1.0,MsC,40.0,1150.0,1050.0,No
1,Engenheiro de machine learning,24.0,M,1.0,MsC,40.0,2050.0,1611.0,No
2,Data Analyst,37.0,F,4.0,MsC,40.0,2150.0,1650.0,No
3,Analista de Dados,26.0,M,3.0,MsC,40.0,2600.0,1800.0,No
4,Data Scientist Luxemburgo,24.0,M,3.0,BsC,40.0,4460.0,3500.0,Yes
5,Data Analyst,28.0,F,2.0,MsC,40.0,1620.0,1220.0,No
6,Data Engineer,,,,,,3150.0,2000.0,No
7,Engenheiro de Dados,26.0,M,3.0,MsC,40.0,2000.0,1523.0,No
8,Data Center Engineer,25.0,M,2.0,,40.0,2160.0,1550.0,No
9,Data Engineer,26.0,,4.0,BsC,40.0,5667.0,4233.0,No
