In [46]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import datetime
from IPython.display import HTML

# Read the file
file = pd.read_json('../raw_data.json')
series = pd.Series(file['data'])

# convert json to dataframe
# the object contains all posts data
posts_df = pd.json_normalize(series)

# for development, obtain only first 100 results, for better performance
# posts_df = posts_df[:100]

print("Number of posts: ", len(posts_df))

Number of posts:  3768


In [49]:
# salaries DataFrame containing extracted data
salaries_df = pd.DataFrame({'name': [], 'salary': [], 'currency': [], 'date': [], 'period': [], 'url': []})

# keywords, which we search in the post content
salary_keywords = ['Zarobki', 'zarobki', 'ZAROBKI']

currencies ={
    'PLN' : ['ZŁ','ZL', 'Zł', 'zł', 'zl', 'złotych', 'zlotych'],
    'USD' : ['USD','Usd', 'usd', '$', 'DOLARÓW', 'DOLAROW', 'Dolarów', 'Dolarow', 'dolarów', 'dolarow'],
    'EUR' : ['EURO', 'Euro', 'euro', 'EUR', 'Eur', 'eur', '€']
}

thousands = ['k', 'tys', 'tyś', 'tysięcy', 'tysiecy', 'tysiące', 'tysiace']

def extract_salaries(post):
    '''
    Find salaries info in the post
    Args: post, row from posts_df DataFrame
    Returns: new DF object containing extracted data
    '''
    
    soup = BeautifulSoup(post['post_content'], 'html.parser')
    
    # remove blockquotes and avoid repetition
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()
        
    # get clean text without html tags
    clean_text = soup.get_text()
    
    result = pd.DataFrame(columns = salaries_df.columns)
    
    
    new_row = {'name': post['author'], 'salary': 0, 'currency': None, 'date': '' , 'url': post['url']}
    
    match_found = False
    
    for line in clean_text.split('\n'):
        for keyword in salary_keywords:
            # check if salary is present
            if keyword in line:
                
                # regexp rules
                full_salary_match = re.search(r'\d+', line)
                short_salary_match = re.search(r'(\d+[.,]?\d*)\s*k', line)
                
                
                # find salary and convert it to int
                if(full_salary_match):
                    new_row['salary'] = int(full_salary_match.group(0))
                    match_found = True
                    
                elif (short_salary_match):
                    value = short_salary_match.group(0)
                
                    # convert , to . for Python float format
                    if "," in value:
                        value = value.replace(',', '.')
                        
                    # convert to full value, e.g. 6,5k -> 6500
                    if "k" in value:
                        value = value.replace('k', "")
                        value = float(value) * 1000
                        value = int(value)
                    
                    
                    new_row['salary'] = value
                    match_found = True
                    
                # search for currency of salary
                currency_match_found = False
                for currency, currencies_list in currencies.items():
                    #stop searching if match is already found
                    if currency_match_found:
                        break
                    for c in currencies_list:
                        if c in line:
                            new_row['currency'] = currency
                            currency_match_found = True
                            match_found = True
                
                # set default currency to PLN and mark it as (PLN)
                # to distinguish alleged values
                if new_row['currency'] == None:
                    new_row['currency'] = "(PLN)"

                # convert datetime
                try:
                    new_row['date'] = pd.to_datetime(post['date'])
                except:
                    pass
                
    if match_found:
        result.loc[len(salaries_df)] = new_row
                    
    return result
    

# iterate over all posts
# analyze every post
for index, post in posts_df.iterrows():
    
    post_salaries = extract_salaries(post)
    
    salaries_df = pd.concat([salaries_df, post_salaries], ignore_index = True)

In [50]:
print('Processed posts: ', len(salaries_df))

output_df = salaries_df

# Display only foreign currencies
# output_df = salaries_df.loc[~salaries_df['currency'].isin(['PLN', '(PLN)'])]

# display only first 10 and last 10 items 
output_df=output_df[-10:]

HTML(output_df.to_html(render_links=True, escape=False))

Processed posts:  2458


Unnamed: 0,name,salary,currency,date,period,url
2448,Hodor,120.0,(PLN),2023-03-23 07:09:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1901263#id1901263
2449,EndrjuGolara,25.0,(PLN),2023-03-14 08:02:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1899794#id1899794
2450,Skromny_miszcz,6.0,EUR,2023-03-17 13:10:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1900303#id1900303
2451,pabyloo,10000.0,PLN,2023-03-27 10:08:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1901924#id1901924
2452,soulSki,2.0,PLN,2023-03-30 02:57:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1902459#id1902459
2453,Bambo,43.0,(PLN),2023-04-01 12:08:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1902984#id1902984
2454,LukaLs,26.0,(PLN),2023-04-18 13:22:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1905875#id1905875
2455,SharkSamuraj,2.0,PLN,2023-04-09 16:25:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1904496#id1904496
2456,ProgramistaXk,200.0,(PLN),,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1907098#id1907098
2457,mythflame,21.0,(PLN),2023-04-12 13:51:00,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1904930#id1904930
