# Data Parser for LinkedIn Job postings

In [30]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
import math

In [4]:
# Reading the postings.csv dataset

df = pd.read_csv("../data/postings.csv")
df.head()

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,views,med_salary,...,skills_desc,listed_time,posting_domain,sponsored,work_type,currency,compensation_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,...,Requirements: \n\nWe are seeking a College or ...,1713398000000.0,,0,FULL_TIME,USD,BASE_SALARY,38480.0,8540.0,34021.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,1.0,,...,,1712858000000.0,,0,FULL_TIME,USD,BASE_SALARY,83200.0,80521.0,8069.0
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,8.0,,...,We are currently accepting resumes for FOH - A...,1713278000000.0,,0,FULL_TIME,USD,BASE_SALARY,55000.0,45202.0,39061.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,16.0,,...,This position requires a baseline understandin...,1712896000000.0,,0,FULL_TIME,USD,BASE_SALARY,157500.0,11040.0,36059.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,3.0,,...,,1713452000000.0,,0,FULL_TIME,USD,BASE_SALARY,70000.0,52601.0,19057.0


### Question 1: Are certain skills more in demand than others? If certain skills are closely related, should we put them together into a single resume? What are some skills that are closely related?

To address this question, we need to perform some semantic analysis of keywords in job postings. In order to do this, we would need to:
- Find a list of keywords in job postings. 
- Filter company_name, description, company_id, skills_desc

In [5]:
df.keys()

Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips'],
      dtype='object')

In [11]:
# FIltering out the useful columns
filtered_columns  =['job_id', 'company_name', 'description', 'skills_desc', 'zip_code']
filtered_df = df[filtered_columns]
filtered_df.head()

Unnamed: 0,job_id,company_name,description,skills_desc,zip_code
0,921716,Corcoran Sawyer Smith,Job descriptionA leading real estate firm in N...,Requirements: \n\nWe are seeking a College or ...,8540.0
1,1829192,,"At Aspen Therapy and Wellness , we are committ...",,80521.0
2,10998357,The National Exemplar,The National Exemplar is accepting application...,We are currently accepting resumes for FOH - A...,45202.0
3,23221523,"Abrams Fensterman, LLP",Senior Associate Attorney - Elder Law / Trusts...,This position requires a baseline understandin...,11040.0
4,35982263,,Looking for HVAC service tech with experience ...,,52601.0


In [12]:
# Looking at the descriptions
print(filtered_df["description"][0])

Job descriptionA leading real estate firm in New Jersey is seeking an administrative Marketing Coordinator with some experience in graphic design. You will be working closely with our fun, kind, ambitious members of the sales team and our dynamic executive team on a daily basis. This is an opportunity to be part of a fast-growing, highly respected real estate brokerage with a reputation for exceptional marketing and extraordinary culture of cooperation and inclusion.Who you are:You must be a well-organized, creative, proactive, positive, and most importantly, kind-hearted person. Please, be responsible, respectful, and cool-under-pressure. Please, be proficient in Adobe Creative Cloud (Indesign, Illustrator, Photoshop) and Microsoft Office Suite. Above all, have fantastic taste and be a good-hearted, fun-loving person who loves working with people and is eager to learn.Role:Our office is a fast-paced environment. You’ll work directly with a Marketing team and communicate daily with oth

In [54]:
# Reading the skills.json file
with open("../skills_lowercase.json", "r") as f:
    skills_data = json.load(f)

general_skills = skills_data["general_skills"]
tech_skills = skills_data["tech_skills"]

print(type(tech_skills))
print(tech_skills)

<class 'list'>
['programming', 'software-development', 'cybersecurity', 'cloud-computing', 'data-analysis', 'machine-learning', 'devops', 'networking', 'database-management', 'automation', 'version-control', 'agile', 'testing', 'debugging', 'encryption', 'api-integration', 'virtualization', 'containerization', 'ux-design', 'artificial-intelligence', 'data-visualization', 'blockchain', 'web-development', 'mobile-development', 'microservices', 'algorithms', 'data-mining', 'big-data', 'distributed-systems', 'hardware', 'scripting', 'python', 'javascript', 'java', 'c++', 'c#', 'ruby', 'php', 'swift', 'go', 'kotlin', 'rust', 'r', 'sql', 'nosql', 'html', 'css', 'typescript', 'bash', 'perl', 'react', 'angular', 'vue', 'django', 'flask', 'spring', 'ruby-on-rails', 'node.js', 'express', 'asp.net', 'laravel', 'bootstrap', 'tensorflow', 'pytorch', 'keras', 'hadoop', 'spark', 'kafka', 'terraform', 'kubernetes', 'docker', 'ansible', 'jenkins', 'gitlab-ci', 'next.js', 'nuxt.js', 'svelte']


In [55]:
# Creating a new column if any of the skills exist in the dataset
job_descriptions = df["description"]
skill_descriptions = df["skills_desc"]

assert(len(job_descriptions) == len(skill_descriptions))

# Creating a new array of skills
skills = []

for i in range(len(job_descriptions)):
    print(f"Completed {i}/{len(job_descriptions)} iterations. ")
    job_description = job_descriptions[i]
    skill_description = skill_descriptions[i]

    # Array for skills required on the job
    skills_required = []
    if type(job_description) == str:
        job_description = job_description.split()
        # Looping through job description
        for word in job_description:
            word = word.lower()
            if word in general_skills or word in tech_skills:
                skills_required.append(word)

    # Looping through skills description
    if type(skill_description) == str:
        skill_description = skill_description.split()
        for word in skill_description:
            word = word.lower()
            if word in general_skills or word in tech_skills:
                skills_required.append(word)
    print(skills_required)
    skills.append(skills_required)

# Adding the skills array as a new field of the original data frame
filtered_df["skills"] = skills
filtered_df.head()
    
    

Completed 0/123849 iterations. 
['planning']
Completed 1/123849 iterations. 
['planning', 'communication', 'interpersonal']
Completed 2/123849 iterations. 
['organization', 'teamwork', 'communication']
Completed 3/123849 iterations. 
['problem-solving', 'planning', 'communication', 'planning', 'problem-solving', 'communication', 'presentation']
Completed 4/123849 iterations. 
[]
Completed 5/123849 iterations. 
['planning', 'planning', 'planning', 'planning', 'planning', 'planning', 'communication', 'planning']
Completed 6/123849 iterations. 
[]
Completed 7/123849 iterations. 
['communication', 'leadership', 'communication']
Completed 8/123849 iterations. 
['writing']
Completed 9/123849 iterations. 
['leadership', 'planning', 'planning', 'networking']
Completed 10/123849 iterations. 
['communication', 'leadership', 'communication', 'presentation']
Completed 11/123849 iterations. 
['leadership', 'collaboration']
Completed 12/123849 iterations. 
['planning', 'organization', 'leadership', 

In [45]:
# New dataset with job descriptions
filtered_df.head()