## Cleaning raw computer jobs dataset

In [1]:
#importing libraries
import numpy as np
import pandas as pd
import seaborn as sns
import re
from datetime import date
from collections import Counter, OrderedDict
pd.options.mode.chained_assignment = None  # default='warn'

### Functions used in the project

In [2]:
#function for clearing strange ranges of values
def strange_range_func(x, l_range, h_range):
    """Return ``x`` when it lies inside ``[l_range, h_range]``, else ``pd.NA``.

    Parameters
    ----------
    x : scalar
        Value to validate (one cell of a column when used with ``Series.apply``).
    l_range, h_range : scalar
        Inclusive lower and upper bounds of the accepted range.

    Returns
    -------
    ``x`` itself when it is inside the range or is already missing,
    ``pd.NA`` when it falls outside the range.
    """
    # Missing values cannot be ordered: `pd.NA < n` is NA (ambiguous in `if`)
    # and `None < n` raises, so hand them back untouched instead of crashing.
    if pd.isna(x):
        return x
    if x < l_range or x > h_range:
        return pd.NA
    return x
    
#function for clearing different unnecessary symbols
def strange_symbols_func(x, symbol_before, symbol_after):
    """Replace every occurrence of ``symbol_before`` in ``x`` with ``symbol_after``.

    Parameters
    ----------
    x : object
        Cell value to clean. Only strings are modified.
    symbol_before : str
        Substring to look for.
    symbol_after : str
        Replacement text.

    Returns
    -------
    The cleaned string, or ``x`` unchanged when it is not a string
    (e.g. NaN / ``pd.NA`` in a column with missing values).
    """
    # Missing cells are floats / NA objects; `in` and `.replace` would raise on them.
    if not isinstance(x, str):
        return x
    # str.replace already returns the text unchanged when nothing matches.
    return x.replace(symbol_before, symbol_after)
    
#function responsible for combining patterns
def combined_pattern_func(patterns_list):
    """Join regex fragments into a single alternation pattern (``a|b|c``).

    The fragments are joined as-is, without capture groups; a grouped variant
    would be ``"(" + ")|(".join(patterns_list) + ")"``.
    """
    alternation_separator = "|"
    return alternation_separator.join(patterns_list)

### General clearing operations

In [3]:
#mapping from the raw column headers to the snake-case names used below
column_renames = {
    "Job Title": "Job_Title",
    "Salary Estimate": "Salary_Estimate_Thousands_of_USD",
    "Job Description": "Job_Description",
    "Company Name": "Company_Name",
    "Founded": "Year_Founded",
    "Size": "Number_of_Employees",
    "Type of ownership": "Type_of_Ownership",
    "Revenue": "Revenue_USD",
}

#importing the raw dataset, using its 'index' column as the frame index
#and renaming the columns in one pass
df = (
    pd.read_csv('uncleaned_computer_jobs_original.csv', encoding="latin-1")
    .set_index('index')
    .rename(columns=column_renames)
)

#clearing rating column (rating on glassdoor site is from range 0 to 5)
df['Rating'] = df['Rating'].apply(strange_range_func, l_range = 0, h_range = 5)

#deleting second part of the company name, which is a rating after a new line symbol
#na=False keeps the mask strictly boolean when a company name is missing
n_line_comp_name_bmask = df['Company_Name'].str.contains('\n', na=False)
#splitting name at new line symbol and keeping only the first element;
#a single df.loc[rows, column] assignment writes into df itself, whereas the
#chained df[column].loc[rows] = ... form may only modify a temporary copy
df.loc[n_line_comp_name_bmask, 'Company_Name'] = (
    df.loc[n_line_comp_name_bmask, 'Company_Name'].str.split('\n').str.get(0)
)

#clearing strange years of company foundation (1936 first computer was invented)
#NOTE(review): this also blanks every company founded before 1936 - confirm that is intended
df['Year_Founded'] = df['Year_Founded'].apply(strange_range_func, l_range = 1936, h_range = date.today().year)

#changing '-1' industry, sector and competitors type as N/A
#df.loc[rows, column] assigns into df directly; the chained form
#df[column].loc[rows] = ... may write to a temporary copy and be lost
for sentinel_column in ['Industry', 'Sector', 'Competitors']:
    df.loc[df[sentinel_column] == '-1', sentinel_column] = pd.NA

#clearing ownership types: unknown markers become N/A, near-duplicate labels are merged
ownership_replacements = {
    '-1': pd.NA,
    'Private Practice / Firm': 'Company - Private',
    'Unknown': pd.NA,
    'Contract': 'Self-employed',
}
for old_ownership, new_ownership in ownership_replacements.items():
    df.loc[df['Type_of_Ownership'] == old_ownership, 'Type_of_Ownership'] = new_ownership

#clearing revenue
#suffixes that expand a "million"/"billion" amount into a full USD figure
#(kept as underscore-separated digit strings, e.g. '5_000_000')
revenue_unit_suffixes = {'million': '_000_000', 'billion': '_000_000_000'}

def _revenue_bounds(raw_revenue):
    """Return the (low, high) USD bounds of one raw revenue label, as strings.

    Handles the label shapes the cleaning targets: '$A to $B million (USD)',
    '$A million to $B billion (USD)', 'Less than $A million (USD)' and
    '$A+ billion (USD)'. Labels without a dollar amount ('-1',
    'Unknown / Non-Applicable') and missing cells give (pd.NA, pd.NA).
    """
    if not isinstance(raw_revenue, str):
        return pd.NA, pd.NA
    #each match is (digits, unit); unit is '' when the amount carries none
    amounts = re.findall(r'\$(\d+)\+?\s*(million|billion)?', raw_revenue)
    if not amounts:
        return pd.NA, pd.NA
    #in '$1 to $5 million' only the last amount names the unit, so an amount
    #without its own unit inherits it (otherwise the low bound would be '1')
    shared_unit = amounts[-1][1]
    bounds = [number + revenue_unit_suffixes.get(unit or shared_unit, '')
              for number, unit in amounts]
    if raw_revenue.startswith('Less than'):
        return '1', bounds[-1]
    if '+' in raw_revenue:
        #open-ended range: no known upper bound
        return bounds[0], pd.NA
    return bounds[0], bounds[-1]

revenue_bounds = [_revenue_bounds(raw_revenue) for raw_revenue in df['Revenue_USD']]
df['Revenue_USD_Low'] = [low for low, _ in revenue_bounds]
df['Revenue_USD_High'] = [high for _, high in revenue_bounds]

#clearing number of employees
def _employee_bounds(raw_size):
    """Return the (low, high) head-count bounds of one raw size label, as strings.

    'A to B employees' gives (A, B); 'A+ employees' gives (A, pd.NA);
    '-1', 'Unknown' and missing cells give (pd.NA, pd.NA).
    """
    if not isinstance(raw_size, str) or raw_size.strip() == '-1':
        return pd.NA, pd.NA
    counts = re.findall(r'\d+', raw_size)
    if not counts:
        return pd.NA, pd.NA
    if '+' in raw_size:
        #open-ended range, decided from the size label itself
        #(not from the revenue column)
        return counts[0], pd.NA
    return counts[0], counts[-1]

employee_bounds = [_employee_bounds(raw_size) for raw_size in df['Number_of_Employees']]
df['Number_of_Employees_Low'] = [low for low, _ in employee_bounds]
df['Number_of_Employees_High'] = [high for _, high in employee_bounds]

#splitting headquarters location into city and state
#df.loc[rows, column] is used throughout so the assignment lands in df itself;
#the chained df[column].loc[rows] = ... form may only change a temporary copy
df.loc[df['Headquarters'] == '-1', 'Headquarters'] = pd.NA
df['Headquarters'] = df['Headquarters'].str.split(',')
#first comma-separated part is taken as the city, last part as the state/country
df['Headquarters_City'] = df['Headquarters'].str[0].str.title()
df['Headquarters_State/Country'] = df['Headquarters'].str[-1].str.strip()
#'061' is not a usable state/country value
df.loc[df['Headquarters_State/Country'] == '061', 'Headquarters_State/Country'] = pd.NA

#splitting location into city and state
df.loc[df['Location'] == '-1', 'Location'] = pd.NA
df['Location'] = df['Location'].str.split(',')
df['Location_City'] = df['Location'].str[0].str.title()
df['Location_State'] = df['Location'].str[-1].str.strip()
#'United States' and 'Remote' have no comma, so the "state" would just repeat them
df.loc[df['Location_City'].isin(['United States', 'Remote']), 'Location_State'] = pd.NA

#cleaning salary estimate: collect every run of digits in the estimate text,
#then use the first run as the low bound and the second as the high bound
salary_numbers = df['Salary_Estimate_Thousands_of_USD'].str.findall(r'(\d+)')
df['Salary_Estimate_Thousands_of_USD'] = salary_numbers
df['Salary_Range_Low_Thousands_of_USD'] = salary_numbers.str.get(0)
df['Salary_Range_High_Thousands_of_USD'] = salary_numbers.str.get(1)

#clearing Job titles



#looking for required skills and qualifications - proposed list: Python, Excel, Cloud systems (AWS, GCP), SQL, Tableau, Power BI, ML, PhD
#each indicator column maps to the regex fragments that signal the skill in a job description
#(matched case-insensitively). Short tokens are wrapped in \b word boundaries so that they
#do not fire inside ordinary words ('AI' in "maintain", 'ML' in "html", 'AWS' in "laws",
#'excel' in "excellent"); 'SQL' is left unbounded so MySQL / NoSQL / PostgreSQL still count.
skill_patterns = {
    #the posting asks for Excel
    'Excel_Skills': [r'\bexcel\b'],
    #the posting asks for Python or its data libraries
    'Python_Skills': ['Python', 'Pandas', 'Numpy', 'Scikit', 'scipy'],
    #the posting asks for cloud systems like AWS or GCP
    'Cloud_System_Skills': [r'\bAWS\b', 'Amazon Web Services', 'Amazon Web Server', r'\bGCP\b', 'Google Cloud Platform'],
    #the posting asks for SQL
    'SQL_Skills': ['SQL', 'Structured Query Language'],
    #the posting asks for Tableau or Power BI (also written "PowerBI")
    'Tableau_Power_BI_Skills': ['Tableau', r'power\s?BI'],
    #the posting asks for machine learning
    'ML_Skills': [r'\bML\b', 'machine learning', r'\bDL\b', 'deep learning', 'Scikit-learn', r'\bAI\b', 'Artificial Intelligence'],
    #the posting asks for a PhD (also written "Ph.D")
    'PhD': [r'\bPh\.?D\b'],
}

for skill_column, skill_fragments in skill_patterns.items():
    #every column is built from its OWN mask; na=False treats a missing
    #description as "skill not mentioned" instead of producing a NaN mask
    skill_mask = df['Job_Description'].str.contains(
        combined_pattern_func(skill_fragments), flags = re.I, na = False)
    df[skill_column] = skill_mask.astype(int)

#obtaining a sum of required skills for 7 defined categories
df['Skills_Sum'] = df[list(skill_patterns)].sum(axis=1)

#dropping redundant columns: the raw/intermediate columns that were split
#into dedicated *_Low / *_High / *_City / *_State / *_Skills columns above
redundant_columns = [
    'Headquarters',
    'Number_of_Employees',
    'Revenue_USD',
    'Location',
    'Salary_Estimate_Thousands_of_USD',
    'Job_Description',
]
df = df.drop(columns=redundant_columns)

#job titles ordered from most to least frequent (inspection only, result is not stored)
OrderedDict(Counter(df['Job_Title']).most_common())
# df['Job_Title'].nunique()
df.head()
#writing the cleaned dataset (index included) next to the notebook
df.to_csv('cleaned_computer_jobs.csv')