In [14]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from IPython.display import HTML

# Load the raw dump and keep only its 'data' column
file = pd.read_json('../raw_data.json')
series = pd.Series(file['data'])

# Flatten the JSON records into a DataFrame (this object holds all posts data).
# During development only the first 100 rows are kept, for better performance.
posts_df = pd.json_normalize(series).head(100)

print("Number of posts: ", len(posts_df))

Number of posts:  100


In [15]:
# Empty accumulator for the extracted data: one (initially empty) column per field
salaries_df = pd.DataFrame({
    column: []
    for column in ('name', 'salary', 'currency', 'date', 'period', 'url')
})

# Keywords searched for in the post content
salary_keywords = [
    'Zarobki',
    'zarobki',
    'ZAROBKI',
]
currency_keywords = [
    'ZŁ', 'Zł', 'zł',
    'USD', 'Usd', 'usd', '$',
    'EURO', 'Euro', 'euro',
    'EUR', 'Eur', 'eur', '€',
]
# Suffixes that denote thousands
thousands = ['k']

def extract_salaries(post):
    '''
    Find salary info in a single post.

    Blockquotes are removed from the post HTML, the remaining text is split
    into lines, and every line containing one of ``salary_keywords`` is
    scanned for an amount and a currency. All matching lines write into the
    same row, so a later matching line overwrites values from an earlier one.

    Amounts are recognised in two forms:
      * shorthand thousands, e.g. ``6k``, ``6,5k``, ``6.5 K`` -> 6000 / 6500
      * a plain run of digits, e.g. ``5500`` (first run of digits in the line)

    Args:
        post: row from the posts_df DataFrame; its 'post_content', 'author'
            and 'url' fields are read.

    Returns:
        New DataFrame with the columns of ``salaries_df``. It holds one row
        when an amount or a currency was found on a keyword line (a row with
        only a currency keeps salary 0), and is empty otherwise.
    '''

    soup = BeautifulSoup(post['post_content'], 'html.parser')

    # remove blockquotes and avoid repetition
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()

    # get clean text without html tags
    clean_text = soup.get_text()

    result = pd.DataFrame(columns = salaries_df.columns)

    new_row = {'name': post['author'], 'salary': 0, 'url': post['url']}
    found = False

    for line in clean_text.split('\n'):
        # only lines mentioning a salary keyword are analysed
        if not any(keyword in line for keyword in salary_keywords):
            continue

        # The shorthand pattern has to be tried first: a plain r'\d+' matches
        # every line the shorthand pattern matches, so testing it first would
        # turn "6,5k" into 6. The trailing \b keeps ordinary words that start
        # with "k" from being read as the thousands suffix.
        short_salary_match = re.search(r'(\d+[.,]?\d*)\s*[kK]\b', line)
        full_salary_match = re.search(r'\d+', line)

        if short_salary_match:
            # convert , to . for Python float format
            value = short_salary_match.group(1).replace(',', '.')

            # convert to full value, e.g. 6,5k -> 6500; round() guards against
            # float error such as 2.3 * 1000 == 2299.9999999999995
            new_row['salary'] = int(round(float(value) * 1000))
            found = True

        elif full_salary_match:
            new_row['salary'] = int(full_salary_match.group(0))
            found = True

        # search for currency of salary; when several keywords occur in the
        # line, the one listed last in currency_keywords is kept
        for currency in currency_keywords:
            if currency in line:
                new_row['currency'] = currency
                found = True

    if found:
        # keys missing from new_row (e.g. date, period) are left empty
        result.loc[len(result)] = new_row

    return result
    

# Analyze every post and collect the per-post results.
# The frames are concatenated once after the loop: calling pd.concat inside
# the loop would re-copy the whole accumulated frame on every iteration.
post_results = []
for index, post in posts_df.iterrows():
    post_results.append(extract_salaries(post))

salaries_df = pd.concat([salaries_df, *post_results], ignore_index = True)

# NOTE: the number printed is the count of extracted rows in salaries_df
print('Processed posts: ', len(salaries_df))

# Escaping is left at its default (enabled) because the table contains text
# scraped from a forum; render_links=True still turns URL cells into links.
HTML(salaries_df.to_html(render_links=True))

Processed posts:  56


Unnamed: 0,name,salary,currency,date,period,url
0,zarobk,6000.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031091#id1031091
1,sssss,5500.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031101#id1031101
2,franck,5000.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031459#id1031459
3,Hejmdndn,8000.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031465#id1031465
4,piotrp,350.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031900#id1031900
5,KtośZKrakowa,3300.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032027#id1032027
6,_noname_,5700.0,zł,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032036#id1032036
7,ne0,9000.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1035672#id1035672
8,kdkrk,12000.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1035685#id1035685
9,post_inkognito,5900.0,,,,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031491#id1031491
