# Slovak Jobs Data Preparation pipeline

**`Table of contents:`**
- <a href = '#intro'>Short Introduction</a>
- <a href = '#import'>Data import</a>
- <a href = '#cr_imp'>Creating or importing functions</a>
     1. <a href = '#rewr_v'>Rewriting missing values</a>
     2. <a href = '#salary'>Creating float type salaries</a>
     3. <a href = '#loc_job'>Working with job location</a>
     4. <a href = ''></a>

<a id = 'intro'></a>

## Short Introduction

This is ***the third part*** of my project, in which I analyze the **Slovak job market**. <br>
In this part I will assemble the `data preparation pipeline` for the raw data scraped from the website, turning the algorithms I used in ***the second part*** into functions.

<a id = 'import'></a>

## Data import

In [1]:
import numpy as np
import pandas as pd
# Load the job postings table from the CSV file into a DataFrame
data = pd.read_csv('Scrapper/prsk_jobs.csv', sep = ',')
# Print the column names, non-null counts and dtypes of the loaded frame
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133169 entries, 0 to 133168
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Job                133169 non-null  object
 1   Employer           133169 non-null  object
 2   Location           133138 non-null  object
 3   Salary             133169 non-null  object
 4   WO_CV              133169 non-null  int64 
 5   Paid_ride_to_work  133169 non-null  int64 
 6   Housing            133169 non-null  int64 
 7   Available_for_UKR  133169 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 8.1+ MB


In [2]:
# One city table per country, read in the order the pipeline expects:
# Slovakia, Hungary, Czechia, Poland, Austria
city_files = [
    'Cities/sk.csv',
    'Cities/hu.csv',
    'Cities/cz.csv',
    'Cities/pl.csv',
    'Cities/at.csv',
]
countries = [pd.read_csv(city_file, sep = ',') for city_file in city_files]

# Keep the individual tables reachable under their own names as well
sk_cities, hu_cities, cz_cities, pl_cities, at_cities = countries

<a id = 'cr_imp'></a>

## Creating or importing functions

<a id = 'rewr_v'></a>

### Rewriting missing values

In [3]:
#Collecting the text columns that contain the 'None' placeholder
def get_cols_with_None(data_jobs):
    """Return the names of object-dtype columns holding at least one 'None' string.

    data_jobs: DataFrame to inspect (it is not modified).
    Returns a list of column names, in the frame's column order.
    """
    text_columns = data_jobs.select_dtypes(include = 'object').columns
    return [
        name for name in text_columns
        if (data_jobs[name] == 'None').sum() != 0
    ]

In [4]:
#Changing None to float NaN
def None_to_NaN(data_jobs, cols_with_None):
    """Replace every 'None' string in the given columns with float NaN, in place.

    data_jobs: DataFrame to clean; it is modified in place.
    cols_with_None: iterable of column names to process.
    Returns the same DataFrame object, so the call can be chained.
    """
    for col in cols_with_None:
        # A positional boolean mask rewrites all placeholders of the column
        # in one step and does not depend on the index labels being unique.
        is_placeholder = (data_jobs[col] == 'None').to_numpy()
        data_jobs.loc[is_placeholder, col] = float("NaN")
    return data_jobs

<a id = 'salary'></a>

### Creating float type salaries

In [5]:
#Extracting the currency from a typical salary string
#Digits, separators and the words 'From', 'month', 'hour' surround the
# currency in the data, so everything except the currency is stripped away
def get_currency(text):
    """Return what remains of *text* once numbers and salary wording are removed."""
    # Drop digits, dashes, dots and spaces in a single pass
    stripped = text.translate(str.maketrans('', '', '0123456789-. '))
    # Then drop the wording around the amount: from, the slash, a month, an hour
    for noise in ('From', '/', 'month', 'hour'):
        stripped = stripped.replace(noise, '')
    return stripped

In [6]:
#Conversion to EUR
# 1 Kč = 0.041 EUR
# 1 Ft = 0.0026 EUR
def conversion_toEUR(money, currency):
    """Convert an amount in the given currency to EUR.

    money: numeric amount expressed in *currency*.
    currency: 'EUR', 'Kč' or 'Ft'.
    Returns the amount in EUR; converted amounts are rounded to 2 decimals,
    while EUR amounts are returned unchanged.
    Raises TypeError when the currency has no conversion rate defined.
    """
    rates_to_eur = {'Kč': 0.041, 'Ft': 0.0026}

    if currency == 'EUR':
        return money
    if currency in rates_to_eur:
        return round(rates_to_eur[currency] * money, 2)

    # Adjacent literals keep the message on one clean line (a backslash
    # continuation inside the string would embed the source indentation).
    raise TypeError(
        f'Currency {currency!r} does not correspond to conversion function. '
        'Either update the function or change currency manually.'
    )

In [7]:
#An hourly wage is scaled by the average of 173.33 working hours per month
def conversion_toMonthly(money_hourly):
    """Return the monthly equivalent of an hourly wage, rounded to 2 decimals."""
    hours_per_month = 173.33
    monthly = money_hourly * hours_per_month
    return round(monthly, 2)

In [8]:
#Turning a salary string into a [starting salary, max salary] pair
# (a fixed salary yields the same number twice)

def get_salary(text):
    """Parse a raw salary string into monthly EUR amounts.

    text: salary string, or a null value.
    Returns [start_salary, end_salary]; both are NaN when *text* is null.
    Hourly wages (strings containing 'hour') are converted to monthly ones
    first, then every amount is converted to EUR.
    """
    #a missing salary stays missing on both ends
    if pd.isnull(text):
        return [float('NaN'), float('NaN')]

    currency = get_currency(text)
    #the word 'hour' in the original string marks an hourly wage
    is_hourly = 'hour' in text

    #stripping everything that is not a number or the range dash
    cleaned = text
    for token in ('From', '/', ' ', currency, 'month'):
        cleaned = cleaned.replace(token, '')
    if is_hourly:
        cleaned = cleaned.replace('hour', '')

    #a dash separates the lower and the upper bound
    parts = cleaned.split('-')
    if len(parts) == 2:
        bounds = [float(parts[0]), float(parts[1])]
    else:
        #otherwise only the first number is used, for both bounds
        single = float(parts[0])
        bounds = [single, single]

    if is_hourly:
        bounds = [conversion_toMonthly(amount) for amount in bounds]

    #converting both bounds to EUR and returning them
    return [conversion_toEUR(amount, currency) for amount in bounds]

In [9]:
def get_salaries(data_jobs):
    """Parse the 'Salary' column into two float arrays.

    data_jobs: DataFrame with a 'Salary' column of raw salary strings.
    Returns [start_salaries, end_salaries], two float64 arrays with one
    entry per row, in row order.
    """
    salary_column = data_jobs['Salary']

    #creating initial float arrays
    start_salaries = np.zeros(len(salary_column), dtype = 'float64')
    end_salaries = np.zeros(len(salary_column), dtype = 'float64')

    #The arrays are filled by row position, not by index label, so the
    # result is correct for any index (filtered, shuffled, non-integer).
    for position, instance in enumerate(salary_column):
        start_salaries[position], end_salaries[position] = get_salary(instance)
    return [start_salaries, end_salaries]

In [10]:
#Swapping the raw Salary column for two float salary columns
def new_salary_cols(data_jobs, salaries):
    """Insert the parsed salary columns after 'Salary' and drop 'Salary'.

    data_jobs: DataFrame with a 'Salary' column; modified in place.
    salaries: sequence whose first item holds the starting salaries and
        whose second item holds the end salaries, one value per row.
    Returns the same DataFrame object.
    """
    first_slot = data_jobs.columns.get_loc("Salary") + 1
    new_columns = (
        ('Start_salary_EUR_mon', salaries[0]),
        ('End_salary_EUR_mon', salaries[1]),
    )
    for offset, (label, values) in enumerate(new_columns):
        data_jobs.insert(first_slot + offset, label, values)

    #the raw string column is no longer needed
    data_jobs.drop(columns = 'Salary', inplace = True)
    return data_jobs

<a id = 'loc_job'></a>

### Working with job location

In [11]:
#Preparing the empty location columns right after 'Location'
def add_cols_job_loc(data_jobs):
    """Insert 'fully_remote', 'half_remote', 'Country' and 'City' after 'Location'.

    data_jobs: DataFrame with a 'Location' column; modified in place.
    The remote flags start at 0 and the text columns as empty strings.
    Returns the same DataFrame object.
    """
    anchor = data_jobs.columns.get_loc('Location')
    placeholders = (
        ('fully_remote', 0),
        ('half_remote', 0),
        ('Country', ''),
        ('City', ''),
    )
    for shift, (label, default) in enumerate(placeholders, start = 1):
        data_jobs.insert(anchor + shift, label, default)
    return data_jobs

In [12]:
#processing by city
def process_loc_by_city(data_jobs, countries):
    """Fill 'Country', 'City' and the remote flags from the 'Location' text.

    data_jobs: DataFrame with 'Location', 'Country', 'City', 'fully_remote'
        and 'half_remote' columns; modified in place. A trailing space is
        appended to every non-null 'Location' so that whole city names can
        be matched.
    countries: iterable of DataFrames, each with 'city' and 'country'
        columns; the country name is read from the row labelled 0.
    Returns the same DataFrame object.

    Countries and their cities are searched in the given order and the
    first city whose name (case-insensitive, followed by a space) occurs in
    the location wins.
    """
    #Add a space in the end in order to compare full city names
    data_jobs.loc[:, 'Location'] += ' '

    #The lowercase search keys do not depend on the location, so they are
    # built once instead of once per location; non-text entries are skipped
    catalogue = [
        (country,
         [(city, city.lower() + ' ') for city in country['city'] if isinstance(city, str)])
        for country in countries
    ]

    #In order to avoid duplicate calculations, work only with unique locations
    for instance in data_jobs['Location'].unique():

        #skip every NaN value
        if pd.isnull(instance):
            continue

        #all rows sharing this location are updated together
        rows = (data_jobs['Location'] == instance).to_numpy()
        lowered = instance.lower()

        for country, city_keys in catalogue:
            found = next((city for city, key in city_keys if key in lowered), None)
            if found is not None:
                data_jobs.loc[rows, 'Country'] = country.loc[0, 'country'] #write the country
                data_jobs.loc[rows, 'City'] = found #write the city
                break #the city was found, skip all the other countries

        # if there is a part that tells about occasional work from home
        if 'občasnú prácu z domu' in instance or 'home office' in instance:
            data_jobs.loc[rows, 'half_remote'] = 1

        #Else - check for full remote; such jobs get no country and no city
        elif 'Remote' in instance or 'z domu' in instance:
            data_jobs.loc[rows, 'fully_remote'] = 1
            data_jobs.loc[rows, 'Country'] = float('NaN')
            data_jobs.loc[rows, 'City'] = float('NaN')
    return data_jobs

In [13]:
#a function which processes location by district
def process_loc_by_district(data_jobs, countries):
    """Fill 'City' and 'Country' for rows without a city, using district names.

    data_jobs: DataFrame with 'Location', 'City' and 'Country' columns;
        modified in place. Only locations of rows whose 'City' is still an
        empty string are examined.
    countries: iterable of DataFrames, each with 'admin_name', 'city' and
        'country' columns.
    Returns the same DataFrame object.

    Countries are searched in the given order; the first administrative name
    that occurs in the location (followed by a space) decides the city and
    the country, and the remaining countries are not searched.
    """
    no_city = data_jobs[data_jobs['City']=='']

    for instance in no_city["Location"].unique():
        #If NaN - go to the next iteration
        if pd.isnull(instance):
            continue

        for country in countries:
            #position of the first administrative name found in the location;
            # missing administrative names cannot match and are skipped
            match_position = next(
                (position for position, region in enumerate(country['admin_name'])
                 if not pd.isnull(region) and region + ' ' in instance),
                None,
            )
            if match_position is None:
                continue

            #write the city and country of the matched row to every job
            # posting with this location
            rows = (data_jobs['Location'] == instance).to_numpy()
            data_jobs.loc[rows, 'City'] = country['city'].iloc[match_position]
            data_jobs.loc[rows, 'Country'] = country['country'].iloc[match_position]
            #the location is resolved - later countries must not overwrite it
            break
    return data_jobs

In [14]:
#a function which processes location by country
def process_loc_by_country(data_jobs, countries):
    """Fill 'Country' for rows where only a country name appears in 'Location'.

    data_jobs: DataFrame with 'Location', 'Country' and 'City' columns;
        modified in place. Only locations of rows whose 'Country' is still
        an empty string are examined.
    countries: the five city tables in the order Slovakia, Hungary, Czechia,
        Poland, Austria; each must have a 'country' column whose row
        labelled 0 holds the country name.
    Returns the same DataFrame object.

    The country names taken from the tables are tried first, then their
    Slovak spellings; matching is case-insensitive and the name must be
    followed by a space. The first match wins. Matched rows get the country
    name and a NaN city, since no city is known for them.
    """
    sk_cities, hu_cities, cz_cities, pl_cities, at_cities = countries
    #Creating lists of country names (both lists share the same order)
    # NOTE(review): 'Rakusko' is spelled without the diacritic - confirm
    # that this is how the scraped locations spell it
    sk_countries_names = ['Slovensko', 'Česko', 'Rakusko', 'Poľsko', 'Maďarsko']
    countries_names = [sk_cities.loc[0, 'country'], cz_cities.loc[0, 'country'],
                       at_cities.loc[0, 'country'], pl_cities.loc[0, 'country'],
                       hu_cities.loc[0, 'country']]

    #Search keys paired with the country name to write: the names from the
    # tables first, then the Slovak names "translated" to the table names
    lookup = [(name.lower() + ' ', name) for name in countries_names]
    lookup += [(sk_name.lower() + ' ', name)
               for sk_name, name in zip(sk_countries_names, countries_names)]

    no_country = data_jobs[data_jobs['Country'] == '']
    for instance in no_country['Location'].unique():
        if pd.isnull(instance):
            continue

        lowered = instance.lower()
        resolved = next((name for key, name in lookup if key in lowered), None)
        if resolved is None:
            continue

        #all job postings with this location are updated together
        rows = (data_jobs['Location'] == instance).to_numpy()
        data_jobs.loc[rows, 'Country'] = resolved
        #a real missing value, not the 'None' string, marks the unknown city
        data_jobs.loc[rows, 'City'] = float('NaN')

    return data_jobs

In [15]:
def fill_loc_NaN(data_jobs):
    """Mark still-unresolved locations as missing, in place.

    data_jobs: DataFrame with 'Country' and 'City' columns.
    Every row whose 'Country' is an empty string gets NaN in both 'Country'
    and 'City'; all other rows are left untouched.
    Returns the same DataFrame object.
    """
    #A positional mask touches exactly the unresolved rows, even when the
    # index labels are not unique
    unresolved = (data_jobs['Country'] == '').to_numpy()
    data_jobs.loc[unresolved, 'Country'] = float('NaN')
    data_jobs.loc[unresolved, 'City'] = float('NaN')
    return data_jobs

In [16]:
#Removing the processed Location column, then the duplicated rows
def final_step(data_jobs):
    """Drop 'Location' and fully duplicated rows, in place.

    data_jobs: DataFrame with a 'Location' column.
    Rows are compared on all remaining columns and the first occurrence of
    each duplicate group is kept; the index is not reset.
    Returns the same DataFrame object.
    """
    data_jobs.drop(columns = 'Location', inplace = True)
    remaining_columns = data_jobs.columns
    data_jobs.drop_duplicates(subset = remaining_columns, keep = 'first', inplace = True)
    return data_jobs

In [17]:
def save_data(data_jobs, path):
    """Write the frame to *path* as a comma-separated, UTF-8 encoded CSV file.

    data_jobs: DataFrame to save.
    path: destination of the CSV file.
    Returns None.
    """
    csv_options = {'sep': ',', 'encoding': 'utf-8'}
    data_jobs.to_csv(path, **csv_options)

In [18]:
def prep_pipeline(df, all_countries, path = 'Prepared Data/prsk_jobs_pr.csv'):
    """Run every preparation step on the raw jobs data and save the result.

    df: raw jobs DataFrame; the steps modify it in place.
    all_countries: city tables, passed on to the location-processing steps.
    path: where the prepared data is written as CSV; defaults to
        'Prepared Data/prsk_jobs_pr.csv'.
    Returns the prepared DataFrame.

    Steps, in order: 'None' placeholders to NaN, salary parsing, location
    columns, location resolution by city, by district and by country,
    NaN for unresolved locations, removal of 'Location' and of duplicated
    rows, saving to CSV.
    """
    #Each step receives the output of the previous one, so the result does
    # not rely on the steps sharing one object that is mutated in place
    df = None_to_NaN(df, get_cols_with_None(df))
    #salaries are parsed after the 'None' placeholders became NaN
    df = new_salary_cols(df, get_salaries(df))
    df = add_cols_job_loc(df)
    df = process_loc_by_city(df, all_countries)
    df = process_loc_by_district(df, all_countries)
    df = process_loc_by_country(df, all_countries)
    df = fill_loc_NaN(df)
    df = final_step(df)
    save_data(df, path)
    return df

In [19]:
# Run the whole preparation pipeline on the loaded data and the city tables
saved_data = prep_pipeline(data, countries)
# Preview the first 10 rows of the prepared data
saved_data.head(10)

Unnamed: 0,Job,Employer,fully_remote,half_remote,Country,City,Start_salary_EUR_mon,End_salary_EUR_mon,WO_CV,Paid_ride_to_work,Housing,Available_for_UKR
0,.NET Programmer,Metrohm Research Slovakia s. r. o.,0,0,Slovakia,Petržalka,2500.0,2500.0,0,0,0,0
1,.NET Programmer,Metrohm Research Slovakia s. r. o.,0,0,Slovakia,Petržalka,2300.0,3800.0,0,0,0,0
2,.NET Programmer,CRIF - Slovak Credit Bureau s. r. o.,0,1,Czechia,Prague,2300.0,2300.0,0,0,0,0
3,.NET Programmer,develogics k. s.,0,1,Slovakia,Bratislava,2000.0,2000.0,0,0,0,0
4,.NET Programmer,Sygic a. s.,0,1,Slovakia,Bratislava,2000.0,4000.0,0,0,0,0
6,.NET Programmer,ESET spol. s r.o.,1,0,,,3200.0,3200.0,0,0,0,1
7,.NET Programmer,R-DAS s. r. o.,0,1,Slovakia,Bratislava,1200.0,2500.0,0,0,0,0
8,.NET Programmer,Takeda,0,1,Slovakia,Bratislava,1850.0,1850.0,0,0,0,1
9,.NET Programmer,KODYS SLOVENSKO s r.o.,1,0,,,1500.0,3000.0,0,0,0,1
10,.NET Programmer,Deutsche Telekom IT Solutions Slovakia,0,0,Slovakia,Košice,2000.0,2650.0,0,0,0,1
