In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re
import datetime
from IPython.display import HTML

# Load the raw dump: pandas parses ../raw_data.json into a DataFrame and the
# 'data' column is pulled out as a Series.
# NOTE(review): presumably each entry of 'data' is one forum post record -
# confirm against the scraper that produced the file.
file = pd.read_json('../raw_data.json')
series = pd.Series(file['data'])

# Flatten the records into a DataFrame: posts_df holds one row per entry of
# 'data' and is the input for the extraction step.
posts_df = pd.json_normalize(series)

# For development: uncomment to work on the first 100 rows only, which makes
# re-running the notebook faster.
# posts_df = posts_df[:100]

# Report how many rows were loaded.
print("Number of posts: ", len(posts_df))

Number of posts:  3761


In [48]:
# Configuration for the salary extraction step.
# salaries_df starts empty and only fixes the column layout; the extracted
# rows are appended to it further down.
salaries_df = pd.DataFrame({'name': [], 'salary': [], 'currency': [], 'exp': [], 'date': [], 'period': [], 'url': []})

# keywords, which we search in the post content; a line is inspected for a
# salary figure only when it contains one of them
salary_keywords = ['Zarobki', 'zarobki', 'ZAROBKI']

# currency code -> spellings that identify it in a line
currencies ={
    'PLN' : ['ZŁ','ZL', 'Zł', 'zł', 'zl', 'złotych', 'zlotych'],
    'USD' : ['USD','Usd', 'usd', '$', 'DOLARÓW', 'DOLAROW', 'Dolarów', 'Dolarow', 'dolarów', 'dolarow'],
    'EUR' : ['EURO', 'Euro', 'euro', 'EUR', 'Eur', 'eur', '€']
}

# unit suffixes meaning "thousands" (e.g. 6,5k)
thousands_keywords = ['k', 'tys', 'tyś', 'tysięcy', 'tysiecy', 'tysiące', 'tysiace']
# markers of an hourly rate; 'godz' and 'GODZINA' are two separate entries
# (without the comma between them Python fuses adjacent string literals into
# the single, never-matching keyword 'godzGODZINA')
hourly_rate_keywords = ['/H', '/h', '/ H', '/ h', 'godz', 'GODZINA', 'Godzina',
                        'godzina', 'GODZINE', "Godzine", 'godzine', 'GODZINĘ', "Godzinę", 'godzinę']
# markers of a man-day (daily) rate
man_day_keywords = ['dzień', 'dzien', 'dziennie', 'MD', 'dniowka', 'dniówki']
# contract-type tokens; they contain a digit and are stripped before numbers
# are parsed
b2b_keywords = ['b2b', 'B2B', 'B2b', 'b2B']
# markers of a line stating professional experience
exp_years_keywords = ['DOSWIADCZENIE', 'Doswiadczenie', 'doswiadczenie', 'DOŚWIADCZENIE', 'Doświadczenie', 'doświadczenie']
# units of experience: years / months
years_keywords = ['ROK', 'Rok', 'rok', 'LAT', 'Lat', 'lat']
months_keywords = ['MIESIAC', 'Miesiac', 'Miesiąc', 'miesiąc', 'MIESIĘCY', 'Miesięcy','miesięcy','miesiac', 'miesiecy', 'mies']

def extract_salaries(post):
    '''
    Find salaries info in the post
    Args: post, row from posts_df DataFrame; the keys 'post_content',
          'author', 'url' and 'date' are read
    Returns: new DF object with the columns of salaries_df; it holds one row
             when a salary figure or a currency was found on a line that
             mentions a salary keyword, and no rows otherwise
    '''

    soup = BeautifulSoup(post['post_content'], 'html.parser')

    # remove blockquotes and tables and avoid repetition
    for blockquote in soup.find_all('blockquote'):
        blockquote.decompose()

    for table in soup.find_all('table'):
        table.decompose()

    # get clean text without html tags
    clean_text = soup.get_text()

    # First number in a line: integer part, optional decimal part and an
    # optional thousands unit directly after it. Longer units are tried first
    # and a unit must not be followed by another word character, so the 'k'
    # of an ordinary word is not mistaken for "thousands".
    units = '|'.join(re.escape(unit) for unit in sorted(thousands_keywords, key=len, reverse=True))
    salary_re = re.compile(r'(\d+)(?:[.,](\d+))?(?:\s*(' + units + r')(?!\w))?', re.IGNORECASE)
    number_re = re.compile(r'\d+(?:[.,]\d+)?')

    # row to append to result; later lines overwrite values set by earlier ones
    new_row = {'name': post['author'], 'salary': 0, 'currency': None, 'exp': '', 'date': '' , 'period': 'monthly', 'url': post['url']}

    match_found = False

    for line in clean_text.split('\n'):

        # ignore b2b tokens; done before any number is parsed so that the
        # digit inside "B2B" is read neither as experience nor as salary
        for token in b2b_keywords:
            line = line.replace(token, "")

        # extract experience: first number of a line mentioning experience
        if any(token in line for token in exp_years_keywords):
            exp_match = number_re.search(line)
            if exp_match:
                new_row['exp'] = exp_match.group(0)

        # only lines mentioning a salary keyword are inspected further
        if not any(keyword in line for keyword in salary_keywords):
            continue

        # find salary and convert it to int
        salary_match = salary_re.search(line)
        if salary_match:
            whole, fraction, unit = salary_match.groups()
            if unit:
                # convert to full value, e.g. 6,5k -> 6500
                new_row['salary'] = int(round(float(whole + '.' + (fraction or '0')) * 1000))
            else:
                # NOTE: only the leading digit run is used here, so "6 000"
                # still parses as 6
                new_row['salary'] = int(whole)
            match_found = True

        # distinct hourly rates
        if any(marker in line for marker in hourly_rate_keywords):
            new_row['period'] = 'hourly'

        # distinct Man Day rates (takes precedence over hourly)
        if any(marker in line for marker in man_day_keywords):
            new_row['period'] = 'MD'

        # search for currency of salary, first currency with a spelling
        # present in the line wins
        # NOTE(review): a currency alone marks the post as matched, so a row
        # with salary 0 can be emitted - kept as before, confirm it is intended
        for currency, currencies_list in currencies.items():
            if any(spelling in line for spelling in currencies_list):
                new_row['currency'] = currency
                match_found = True
                break

        # set default currency to PLN and mark it as (PLN)
        # to distinguish alleged values
        if new_row['currency'] is None:
            new_row['currency'] = "(PLN)"

        # convert datetime; best effort - a missing or unparsable date
        # leaves the field as ''
        try:
            new_row['date'] = pd.to_datetime(post['date'])
        except (KeyError, ValueError, TypeError, OverflowError):
            pass

    # returned result
    result = pd.DataFrame(columns = salaries_df.columns)

    if match_found:
        # the row label is local to this frame; it no longer depends on how
        # many rows the global salaries_df currently has
        result.loc[len(result)] = new_row

    return result
    

# iterate over all posts
# analyze every post; the per-post frames are collected first and
# concatenated once, instead of re-copying the growing salaries_df
# on every iteration
extracted_frames = []
for index, post in posts_df.iterrows():

    post_salaries = extract_salaries(post)

    extracted_frames.append(post_salaries)

salaries_df = pd.concat([salaries_df] + extracted_frames, ignore_index = True)

# the printed number is the count of extracted salary rows
print('Processed posts: ', len(salaries_df))

Processed posts:  2454


In [49]:

# Preview: first 100 extracted rows
output_df = salaries_df[:100]

# Alternative views, uncomment one to use it instead.

# Man Day rates only
# output_df = salaries_df[salaries_df['period'] == 'MD']


# Display only foreign currencies
# output_df = salaries_df.loc[~salaries_df['currency'].isin(['PLN', '(PLN)'])]

# display only first 10 and last 10 items 
# output_df=pd.concat([output_df[:10], output_df[-10:]])

# hourly rates only, and the share of hourly rates above 1000
# (the Polish label below reads "share greater than 1000")
# output_df = output_df[output_df['period'] == 'hourly']
# output_df1 = output_df.loc[(output_df['period'] == 'hourly') & (output_df['salary'] > 1000)]

# print('procentowo większych niż 1000:', len(output_df1)/len(output_df))

# Cell values originate from scraped forum posts, so they stay HTML-escaped
# (the to_html default) instead of being injected as raw markup;
# render_links=True alone is what turns the url column into clickable links.
HTML(output_df.to_html(render_links=True))

Unnamed: 0,name,salary,currency,exp,date,period,url
0,zarobk,6000.0,(PLN),4.0,2014-05-05 10:09:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031091#id1031091
1,sssss,5500.0,(PLN),3.0,2014-05-05 10:30:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031101#id1031101
2,franck,5000.0,(PLN),2.0,2014-05-06 18:16:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031459#id1031459
3,Hejmdndn,8000.0,(PLN),,2014-05-06 18:52:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031465#id1031465
4,anonim_wrocław,6000.0,(PLN),15.0,2014-05-11 12:50:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032680#id1032680
5,piotrp,350.0,(PLN),6.0,2014-05-08 11:20:00,MD,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1031900#id1031900
6,majk88,1800.0,(PLN),1.0,2014-05-11 16:27:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032728#id1032728
7,r,3900.0,(PLN),15.0,2014-05-11 16:44:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032733#id1032733
8,junior java dev,4000.0,(PLN),5.0,2014-05-11 17:00:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032740#id1032740
9,CppWroc,2700.0,PLN,10.0,2014-05-11 18:28:00,monthly,https://4programmers.net/Forum/Kariera/233131-ile_zarabiacie?p=1032756#id1032756
