# Feature Engineer

In [15]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import nltk

In [42]:
jobs = pd.read_csv('../data/job_postings.csv')
jobs = jobs.drop(columns=['date_added', 'organization', 'skills_len', 'job_type', 'location'])
jobs.fillna('', inplace=True)

In [43]:
jobs

Unnamed: 0,job_description,job_title,skills
0,n edi analyst with experience please read on ...,Analyst,edi trustedlink as van
1,informatica etl developerst petersburg fl only...,Developer,etl informatica b data exchange netezza oracle...
2,this nationally recognized microsoft gold part...,Manager,microsoft dynamics ax project manager - toront...
3,.net developer with experience please read on...,Developer,c asp.net sql javascript mvc
4,hatstand a global financial consultancy is see...,Developer,java linux unix sdlc; multi-threaded or concur...
...,...,...,...
16427,jpmorgan chase co. (nyse: jpm) is a leading g...,Developer,.net architecture developer development git ht...
16428,seeking jr. systems administrators with experi...,Administrator,jr. linux administrator
16429,senior lead devops engineer with a desired to...,Developer,amazon web services linux bash ruby python agile
16430,headquartered in downtown san francisco ca we ...,Developer,javascript react.js golang startup ror iot ana...


---

# Remove words with low counts

In [5]:
# Look at both job_description and skills columns
jobs['text'] = jobs['job_description'] + ' ' + jobs['skills']

## Using nltk

In [6]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
print(len(stop_words))

179


In [8]:
text

0         an EDI Analyst with experience please read on...
1        Informatica ETL DeveloperSt Petersburg FL Only...
2        This nationally recognized Microsoft Gold Part...
3         a .NET Developer with experience please read ...
4        Hatstand a global financial consultancy is see...
                               ...                        
16431      JPMorgan Chase & Co. (NYSE: JPM) is a leadin...
16432    Seeking Jr. Systems Administrators with experi...
16433     a Senior Lead Devops Engineer with a desired ...
16434    Headquartered in downtown San Francisco CA we ...
16435     an AppSec or Hardware Security Engineer who h...
Name: text, Length: 16436, dtype: object

In [13]:
from nltk.tokenize import word_tokenize
tokens = jobs['text'].map(word_tokenize)

In [14]:
jobs['text'].head(5).str.lower()

0     an edi analyst with experience please read on...
1    informatica etl developerst petersburg fl only...
2    this nationally recognized microsoft gold part...
3     a .net developer with experience please read ...
4    hatstand a global financial consultancy is see...
Name: text, dtype: object

In [None]:
# convert to lower case
tokens = jobs['text'].str.lower()

In [None]:
# split into words
from nltk.tokenize import word_tokenize
tokens = jobs['text'].map(word_tokenize)

# convert to lower case
tokens = [w.lower() for w in tokens]

# remove punctuation from each word
import string
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]

# remove remaining tokens that are not alphabetic
words = [word for word in stripped if word.isalpha()]

# filter out stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words[:100])

## Using countvectorizer

In [193]:
# Vectorize the training data
cvec = CountVectorizer(stop_words='english')

cvec_matrix = cvec.fit_transform(jobs['text'])

cvec_matrix.shape

(16428, 88015)

86,767 words with no min_df  
9943 words with min_df = 10

In [162]:
df = pd.DataFrame(cvec_matrix.todense(), columns=cvec.get_feature_names())

In [163]:
counts = [word for word in df if df[word].sum() < 10]

In [165]:
words_to_drop = counts

In [166]:
len(words_to_drop)

75747

In [169]:
from sklearn.feature_extraction import text 
len(text.ENGLISH_STOP_WORDS)

318

In [170]:
to_drop = text.ENGLISH_STOP_WORDS.union(words_to_drop)
len(to_drop)

76065

In [200]:
jobs['text'][0]

" an EDI Analyst with experience, please read on We are a strong, long standing company looking for an EDI Analyst for our team. You must have + years of EDI experience in a TrustedLink for i environment. Your role will work with our finance department identifying trading partners, work closely with external customers and be the EDI liaison across the company. You will also monitor AS and VAN communications, correct errors and incoming data.What You Need for this PositionRequirements: + years of EDI experience + years of TrustedLink for i Experience, iSeries AS Experience with VAN and AS communicationsWhat's In It for YouWe offer a strong compensation package and benefits Local Candidates ONLY please.So, n EDI Analyst with experience, please apply today Applicants must be authorized to work in the U.S.Please apply directly to by clicking 'Click Here to Apply' with your Word resume Looking forward to receiving your resume and going over the position in more detail with you.- Not a fit f

In [207]:
test = jobs['text'][0]

for word in to_drop:
    pattern = f'\s{word}\s'
    
    if  in jobs['text'][0]:
        test = test.replace(word, ' ')

In [208]:
test

"   EDI An l t   ex  ce,     d   We        , l  st d g  mp y  ok g     EDI An l t     t . You m t   + y    EDI ex  ce     Tr tedL k       n nt. Y  r e   w      f  ce         y g   ng p   , w   sely   e e l c   rs  d   t  EDI l       s t   mp y. You    o m    AS  d VAN  m    ,     e    d   g d t .Wh t You N    t  Pos  R u    : + y    EDI ex  ce + y    Tr tedL k     Ex  ce,  Ser s AS Ex  ce   VAN  d AS  m    Wh t's In It   YouWe  r       mp    p   ge  d b f s L l C d d tes ONLY  .So, n EDI An l t   ex  ce,    p  t  y Appl c ts m t    d   w    t  U.S.Ple se  p    ly     cl   g 'Cl   He    Ap '   y  W  r   L  g fo rd     v g y  r    d g ng  er t        e   l    .- Not   f    t     Cl   t  l k  t t      t  em  l   se  h  ll      n   .L  g fo rd     v g y  r   Cy rC rsCy rC rs, Inc   p         E  l Oppo   y E o rAll   l  d  ppl c ts     ve         y nt w  ut   rd   r ce, c  ,    , sex, n    n, d  b  , p  ted v r  st t ,    y ot r c      p  ted   l w.Y  R     W  - In    ce   fe r l l w,  ll p

In [135]:
# Lots of 'aa'
df['aa'].sum()

444