In [25]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from IPython.display import HTML

# Read the file
file = pd.read_json('../raw_data.json')
series = pd.Series(file['data'])

# convert json to dataframe
# the object contains all posts data
posts_df = pd.json_normalize(series)

# for development, obtain only first 100 results, for better performance
# posts_df = posts_df[:100]

print("Number of posts: ", len(posts_df))

Number of posts:  3768


In [31]:
# salaries DataFrame containing extracted data
salaries_df = pd.DataFrame({'name': [], 'salary': [], 'currency': [], 'date': [], 'period': [], 'url': []})

# keywords, which we search in the post content
salary_keywords = ['Zarobki', 'zarobki', 'ZAROBKI']

currencies ={
    'PLN' : ['ZŁ','ZL', 'Zł', 'zł', 'zl', 'złotych', 'zlotych'],
    'USD' : ['USD','Usd', 'usd', '$', 'DOLARÓW', 'DOLAROW', 'Dolarów', 'Dolarow', 'dolarów', 'dolarow'],
    'EUR' : ['EURO', 'Euro', 'euro', 'EUR', 'Eur', 'eur', '€']
}

thousands = ['k']

def extract_salaries(post):
    '''
    Find salaries info in the post
    Args: post, row from posts_df DataFrame
    Returns: new DF object containing extracted data
    '''
    
    soup = BeautifulSoup(post['post_content'], 'html.parser')
    
    # remove blockquotes and avoid repetition
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()
        
    # get clean text without html tags
    clean_text = soup.get_text()
    
    result = pd.DataFrame(columns = salaries_df.columns)
    
    
    new_row = {'name': post['author'], 'salary': 0, 'currency': None, 'url': post['url']}
    
    match_found = False
    
    for line in clean_text.split('\n'):
        for keyword in salary_keywords:
            # check if salary is present
            if keyword in line:
                
                # regexp rules
                full_salary_match = re.search(r'\d+', line)
                short_salary_match = re.search(r'(\d+[.,]?\d*)\s*k', line)
                
                
                # find salary and convert it to int
                if(full_salary_match):
                    new_row['salary'] = int(full_salary_match.group(0))
                    match_found = True
                    
                elif (short_salary_match):
                    value = short_salary_match.group(0)
                
                    # convert , to . for Python float format
                    if "," in value:
                        value = value.replace(',', '.')
                        
                    # convert to full value, e.g. 6,5k -> 6500
                    if "k" in value:
                        value = value.replace('k', "")
                        value = float(value) * 1000
                        value = int(value)
                    
                    
                    new_row['salary'] = value
                    match_found = True
                    
                # search for currency of salary
                currency_match_found = False
                for currency, currencies_list in currencies.items():
                    #stop searching if match is already found
                    if currency_match_found:
                        break
                    for c in currencies_list:
                        if c in line:
                            new_row['currency'] = currency
                            currency_match_found = True
                            match_found = True
                
                # set default currency to PLN and mark it as (PLN)
                # to distinguish alleged values
                if new_row['currency'] == None:
                    new_row['currency'] = "(PLN)"

                
    if match_found:
        result.loc[len(salaries_df)] = new_row
                    
    return result
    

# iterate over all posts
# analyze every post
for index, post in posts_df.iterrows():
    
    post_salaries = extract_salaries(post)
    
    salaries_df = pd.concat([salaries_df, post_salaries], ignore_index = True)

In [32]:
print('Processed posts: ', len(salaries_df))

# Display only foreign currencies
output_df = salaries_df.loc[~salaries_df['currency'].isin(['PLN', '(PLN)'])]

HTML(output_df.to_html(render_links=True, escape=False))

Processed posts:  2458


Unnamed: 0,name,salary,currency,date,period,url
73,Złoty Szczur,3000.0,USD,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1085070#id1085070
226,Krzywy Orzełee,7000.0,EUR,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1221836#id1221836
256,UrUr,16.0,EUR,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1250592#id1250592
306,Tomasz A,2400.0,EUR,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1268327#id1268327
353,Wielki Kaczor,4100.0,USD,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1294159#id1294159
391,Krwawy Młot,12.0,EUR,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1303861#id1303861
450,Tango,3000.0,EUR,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1332068#id1332068
478,Szalony Kot,36.0,USD,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1339119#id1339119
607,Joyer,16.0,USD,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1371097#id1371097
668,kmle,60000.0,USD,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1381861#id1381861
