In [2]:
import numpy as np
import pandas as pd
import re
import csv
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from collections import defaultdict, Counter

### Read CSV

In [3]:
df = pd.read_csv("job2.csv", sep='\s*;\s*', engine='python')
df.head(2)

Unnamed: 0,job_title,company,company_size,industry,required_experience,posting_date,closing_date,link,job_description
0,Data Scientist / Analyst ( UP $7K/ JURONG/ PhD...,Search Index Pte Ltd \t\t\t\t\t\t\t \t\...,1 - 50 Employees,Human Resources Management/Consulting,Min 2 years (Manager),04-July-2018,03-August-2018,https://www.jobstreet.com.sg/en/job/data-scien...,UP $7K / MNCÂ Jurong AreaÂ AWS + BONUSÂ PHD / ...
1,"VP/AVP, Data Scientist, Big Data Analytics Gro...",United Overseas Bank Limited (UOB),More than 5000 Employees,Banking/Financial Services,Min 5 years (Senior Executive),04-July-2018,03-August-2018,https://www.jobstreet.com.sg/en/job/vpavp-data...,TheÂ Data Management Office (DMO)Â is a busine...


As we can see in the column "company", there are some unwanted \t. and in the "job_description" column, there are some weird characters. The next step is to remove these unwanted whitespaces and characters.

In [4]:
for column in df.columns:
    df[column] = df[column].str.replace("\t","")

In [5]:
df["cleaned_des"] = df.job_description.str.replace('[^ a-zA-Z0-9/*",.]+', ' ').str.replace(' +', ' ')
df.head()

Unnamed: 0,job_title,company,company_size,industry,required_experience,posting_date,closing_date,link,job_description,cleaned_des
0,Data Scientist / Analyst ( UP $7K/ JURONG/ PhD...,Search Index Pte Ltd (Recruitment Firm),1 - 50 Employees,Human Resources Management/Consulting,Min 2 years (Manager),04-July-2018,03-August-2018,https://www.jobstreet.com.sg/en/job/data-scien...,UP $7K / MNCÂ Jurong AreaÂ AWS + BONUSÂ PHD / ...,UP 7K / MNC Jurong Area AWS BONUS PHD / Master...
1,"VP/AVP, Data Scientist, Big Data Analytics Gro...",United Overseas Bank Limited (UOB),More than 5000 Employees,Banking/Financial Services,Min 5 years (Senior Executive),04-July-2018,03-August-2018,https://www.jobstreet.com.sg/en/job/vpavp-data...,TheÂ Data Management Office (DMO)Â is a busine...,The Data Management Office DMO is a business f...
2,(Master in Com Science/No Exp Ok!) Data Scient...,Achieve Career Consultant Pte Ltd(Recruitment ...,51 - 200 Employees,Human Resources Management/Consulting,Min 2 years (Junior Executive),03-July-2018,02-August-2018,https://www.jobstreet.com.sg/en/job/master-in-...,"S$6,000 to S$8,500 + AWS + Performance BonusGl...","S 6,000 to S 8,500 AWS Performance BonusGlobal..."
3,Data Scientist,Capita Pte Ltd(Recruitment Firm),51 - 200 Employees,Human Resources Management/Consulting,Min 5 years (Senior Executive),03-July-2018,02-August-2018,https://www.jobstreet.com.sg/en/job/data-scien...,URGENT! Data ScientistÂ Â Â Job DescriptionThe...,URGENT Data Scientist Job DescriptionThe RoleA...
4,(Analysis) Data Scientist - (Listed IT MNC/$6K...,Achieve Career Consultant Pte Ltd(Recruitment ...,51 - 200 Employees,Human Resources Management/Consulting,Min 2 years (Junior Executive),03-July-2018,02-August-2018,https://www.jobstreet.com.sg/en/job/analysis-d...,"S$6,000 to S$8,500 + AWS + Performance BonusGl...","S 6,000 to S 8,500 AWS Performance BonusGlobal..."


I want to analyze the required qualifications in the job descriptions
* Find out how many times each of the skills appears in each of the job descriptions
* Find out how many job listings require each of the skills (multiple appearances in the same job listing is counted as one)
* Required years of experience, check whether the required experience matches the description in the job description.
* Levels (entry, junior, senior, manager)

It looks like Achieve Career Consultant has a lot of listings, there might be duplicated listings. It could be interesting to explore their listings seperately.

In [123]:
users_interests = {"hadoop", "big data", "java", "spark", "nosql", "mongodb", "postgres", "python", "scikit-learn", "numpy", "statsmodels", "pandas", "r", "python", "statistics", "probability", "machine learning", "c++", "neural networks", "deep learning", "big data", "artificial intelligence", "mapreduce", "databases", "mysql"}


In [126]:
users_interests.update(["pytorch","sklearn","seaborn","matlab","sas","aws","azure", "matplotlib","sql", "database","tensorflow","caffee"])


In [127]:
users_interests

{'artificial intelligence',
 'aws',
 'azure',
 'big data',
 'c++',
 'caffee',
 'database',
 'databases',
 'deep learning',
 'hadoop',
 'java',
 'machine learning',
 'mapreduce',
 'matlab',
 'matplotlib',
 'mongodb',
 'mysql',
 'neural networks',
 'nosql',
 'numpy',
 'pandas',
 'postgres',
 'probability',
 'python',
 'pytorch',
 'r',
 'sas',
 'scikit-learn',
 'seaborn',
 'sklearn',
 'spark',
 'sql',
 'statistics',
 'statsmodels',
 'tensorflow'}

In [128]:
df["des_counter"] = df["cleaned_des"].str.lower().str.split().apply(Counter)

In [129]:
df["des_counter"] = df["des_counter"].apply(lambda x: sorted(x.items(), key=lambda x: (-x[1], x[0])))

In [130]:
df["keywords"] = [[] for _ in range(len(df))]


If a kw appears in the description, add the kw into the keywords column.

In [131]:
for idx, text in df["cleaned_des"].iteritems():
    for item in users_interests:
        if item in text.lower():
            df["keywords"][idx].append(item)


In [132]:
df[["required_experience", "keywords"]].head()

Unnamed: 0,required_experience,keywords
0,Min 2 years (Manager),"[aws, r]"
1,Min 5 years (Senior Executive),"[big data, spark, hadoop, sql, machine learnin..."
2,Min 2 years (Junior Executive),"[java, big data, spark, aws, mapreduce, hadoop..."
3,Min 5 years (Senior Executive),"[hadoop, python, r]"
4,Min 2 years (Junior Executive),"[java, big data, spark, aws, mapreduce, hadoop..."


In [133]:
kw_list = []
for item in df["keywords"]:
    kw_list = kw_list + item


In [134]:
WordCount = Counter(kw_list)

In [135]:
WordCount.most_common(200)

[('r', 58),
 ('python', 52),
 ('machine learning', 49),
 ('statistics', 40),
 ('hadoop', 36),
 ('big data', 34),
 ('spark', 28),
 ('sql', 24),
 ('java', 24),
 ('deep learning', 23),
 ('database', 17),
 ('aws', 12),
 ('mapreduce', 12),
 ('databases', 11),
 ('matlab', 8),
 ('sas', 7),
 ('nosql', 7),
 ('tensorflow', 6),
 ('artificial intelligence', 4),
 ('pytorch', 4),
 ('neural networks', 3),
 ('postgres', 2),
 ('matplotlib', 2),
 ('mysql', 2),
 ('azure', 1),
 ('mongodb', 1),
 ('numpy', 1),
 ('seaborn', 1),
 ('pandas', 1),
 ('sklearn', 1)]

In [96]:
for word, count in WordCount.most_common(200):
    print(word, count)

r 58
python 52
machine learning 49
statistics 40
hadoop 36
big data 34
spark 28
java 24
deep learning 23
aws 12
mapreduce 12
databases 11
matlab 8
sas 7
nosql 7
artificial intelligence 4
pytorch 4
neural networks 3
postgres 2
matplotlib 2
mysql 2
azure 1
mongodb 1
numpy 1
seaborn 1
pandas 1
sklearn 1
