# Exploring Data

In [1]:
import pandas as pd

In [2]:
g = open('data_postings.txt','r')
postings = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('data_labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [3]:
# data we will never use until the end of the project
post_valid, labels_valid = postings[10000:], labels[10000:]

# data that we will use
postings, labels = postings[:10000], labels[:10000]

In [4]:
len(post_valid), len(labels_valid), len(postings), len(labels)

(526, 526, 10000, 10000)

In [5]:
postings[0]

'"Bachelor’s degree, or equivalent experience, in Computer Science, Engineering, Mathematics or a related field5+ years’ experience of Data platform implementation, including 3+ years of hands-on experience in implementation and performance tuning Kinesis/Kafka/Spark/Storm implementations.Experience with analytic solutions applied to the Marketing or Risk needs of enterprisesBasic understanding of machine learning fundamentals.Ability to take Machine Learning models and implement them as part of data pipeline5+ years of IT platform implementation experience.Experience with one or more relevant tools ( Flink, Spark, Sqoop, Flume, Kafka, Amazon Kinesis ).Experience developing software code in one or more programming languages (Java, JavaScript, Python, etc).Current hands-on implementation experience requiredAt Amazon Web Services (AWS), we’re hiring highly technical cloud computing architects and engineers to collaborate with our customers and partners on key engagements. Our consultants

In [6]:
labels[0]

'DS'

In [7]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + postings[i][:80] + "...")
    
print("labels \t: \tpostings\n")
pretty_print_review_and_label(0)
pretty_print_review_and_label(1)
pretty_print_review_and_label(2)

labels 	: 	postings

DS	:	"Bachelor’s degree, or equivalent experience, in Computer Science, Engineering, ...
SE	:	"Bachelor’s degree or higher in Computer Science, Computer Engineering, Electric...
SE	:	"Purpose of the Role:The mission for this role is to follow established best eng...


# Finding correlations and removing meaningless words

In [8]:
from collections import Counter
import numpy as np

In [9]:
se_counts = Counter()
ds_counts = Counter()
total_counts = Counter()

In [10]:
for posting in range(len(postings)):
    if labels[posting] == 'SE':
        for word in postings[posting].split(" "):
            se_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in postings[posting].split(" "):
            ds_counts[word] += 1
            total_counts[word] += 1

In [11]:
# the counts of the most common words in se_counts
se_counts.most_common()[:10]

[('and', 146581),
 ('to', 79877),
 ('the', 64847),
 ('of', 61033),
 ('a', 46998),
 ('in', 45100),
 ('with', 40955),
 ('for', 29800),
 ('or', 25571),
 ('is', 21473)]

In [12]:
# the counts of the most common words in ds_counts
ds_counts.most_common()[:10]

[('and', 172711),
 ('to', 93667),
 ('the', 72643),
 ('of', 71550),
 ('in', 51420),
 ('a', 48498),
 ('with', 42729),
 ('data', 35041),
 ('for', 32343),
 ('or', 30099)]

In [13]:
se_ds_ratios = Counter()

for term, count in list(total_counts.most_common()):
    if count > 100:
        se_ds_ratio = se_counts[term] / float(ds_counts[term]+1)
        se_ds_ratios[term] = se_ds_ratio

In [14]:
print("se-to-ds ratio for 'and' = {}".format(se_ds_ratios["and"]))
print("se-to-ds ratio for 'to' = {}".format(se_ds_ratios["to"]))
print("se-to-ds ratio for 'analytical' = {}".format(se_ds_ratios["analytical"]))
print("se-to-ds ratio for 'Software' = {}".format(se_ds_ratios["Software"]))
print("se-to-ds ratio for 'Statistics' = {}".format(se_ds_ratios["Statistics"]))

se-to-ds ratio for 'and' = 0.8487018852193247
se-to-ds ratio for 'to' = 0.8527672203954392
se-to-ds ratio for 'analytical' = 0.213768115942029
se-to-ds ratio for 'Software' = 11.8188202247191
se-to-ds ratio for 'Statistics' = 0.009174311926605505


In [15]:
for word, ratio in list(se_ds_ratios.most_common()):
    if(ratio > 2):
        se_ds_ratios[word] = np.log(ratio)
    else:
        se_ds_ratios[word] = -np.log(1 / (ratio+0.01))

In [16]:
print("se-to-ds ratio for 'and' = {}".format(se_ds_ratios["and"]))
print("se-to-ds ratio for 'to' = {}".format(se_ds_ratios["to"]))
print("se-to-ds ratio for 'analytical' = {}".format(se_ds_ratios["analytical"]))
print("se-to-ds ratio for 'Software' = {}".format(se_ds_ratios["Software"]))
print("se-to-ds ratio for 'Statistics' = {}".format(se_ds_ratios["Statistics"]))

se-to-ds ratio for 'and' = -0.15233346587193464
se-to-ds ratio for 'to' = -0.14761035731282415
se-to-ds ratio for 'analytical' = -1.4971449600006292
se-to-ds ratio for 'Software' = 2.4696931952129306
se-to-ds ratio for 'Statistics' = -3.954183816252424


In [17]:
se_ds_ratios.most_common()[:30] # 30 most important words for SE

[('Tanzu', 5.25227342804663),
 ('Relic', 5.236441962829949),
 ('Cox', 5.017279836814924),
 ('Macy’s', 4.718498871295094),
 ('REST,', 4.363098624788363),
 ('observability', 4.02535169073515),
 ('.Net', 3.9186675481468147),
 ('Observability', 3.7534179752515073),
 ('Selenium', 3.506557897319982),
 ('.NET', 3.492169159867882),
 ('Android', 3.4790398684260895),
 ('test-driven', 3.3843902633457743),
 ('frontend', 3.332204510175204),
 ('TypeScript,', 3.3232358401924436),
 ('Node.js', 3.2088254890146994),
 ('Angular', 3.169685580677429),
 ('MVC,', 3.068052935133617),
 ('JavaScript', 3.057261463500853),
 ('Ruby', 3.0492730404820207),
 ('client-side', 2.9856819377004897),
 ('iOS', 2.9719216248602724),
 ('bug', 2.89591193827178),
 ('microservice', 2.820055259478705),
 ('CIS,', 2.816263785742443),
 ('C#', 2.739548868161581),
 ('React', 2.7365296724240786),
 ('HTML,', 2.719100037288795),
 ('pair', 2.719100037288795),
 ('JS', 2.70805020110221),
 ('Embedded', 2.633547673804289)]

In [18]:
list(reversed(se_ds_ratios.most_common()))[0:30] # 30 most important words for DS

[('AstraZeneca', -4.605170185988092),
 ('hypotheses', -4.605170185988092),
 ('descriptive', -4.605170185988092),
 ('Biostatistics,', -4.605170185988092),
 ('ADaM', -4.605170185988092),
 ('SDTM', -4.605170185988092),
 ('CDISC', -4.605170185988092),
 ('GSK', -4.605170185988092),
 ('Statistician', -4.605170185988092),
 ('Bayesian', -4.605170185988092),
 ('Scientist,', -4.605170185988092),
 ('Economics,', -4.356369901080518),
 ('Scientist"', -4.283444256995934),
 ('Scientist', -4.1339849679051675),
 ('statistical,', -4.109848748758066),
 ('"Data', -4.072059517432666),
 ('semester', -3.991359747664667),
 ('Statistical', -3.9806176556315447),
 ('SPSS,', -3.9753926193607354),
 ('sentiment', -3.958543021063039),
 ('Statistics', -3.954183816252424),
 ('Analyst"', -3.954183816252424),
 ('statistical', -3.8331932879237374),
 ('supervised', -3.7423380326387035),
 ('causal', -3.687376824229401),
 ('Statistics,', -3.646799368074074),
 ('Science"""', -3.619102576314776),
 ('unsupervised', -3.56317679

In [19]:
#! pip install nltk

import nltk
import re
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def preprocess_data(data):
    """
    This function removes the noise from the job postings.
    
    INPUT: text data that has one posting per line
    OUTPUT: text data that has one posting per line
    """
    new_data = []
    for line in range(len(data)):
        word_list = ''
        for word in data[line].split(" "):
            if se_ds_ratios[word] > 1 or se_ds_ratios[word] < -1:
                word_list += word + ' '
        word_list = re.sub(r"[^a-zA-Z0-9]", " ", word_list.lower())
        word_list = list(set(word_tokenize(word_list)))
        sentence = " ".join(word_list)
        new_data.append(sentence)
    return new_data
        
new_data = preprocess_data(postings)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tigra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Final input

In [20]:
new_data[0]

'kafka professional consultants analytics engineer models risk scientists learning tensorflow javascript code pytorch ml java software emr specialist physics sets applied sql answer scalability engagements phd analytic data mathematics web deep machine spark marketing'

In [21]:
labels[0]

'DS'

In [22]:
new_data[1]

'models scientists c approaches iot learning algorithms mobile data ml novel environmental processing machine train embedded software engineer'

In [23]:
labels[1]

'SE'

# Building the model

In [24]:
post_train, labels_train = new_data[:7000], labels[:7000]
post_test, labels_test = new_data[7000:], labels[7000:]

In [25]:
len(post_train), len(post_test), len(post_valid)

(7000, 3000, 526)

In [26]:
len(labels_train), len(labels_test), len(labels_valid)

(7000, 3000, 526)

In [27]:
#Build a counter based on the training dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

counter = CountVectorizer()
counter.fit(new_data)

#count the number of times each term appears in a document and transform each doc into a count vector

counts_train = counter.transform(post_train)#transform the training data
counts_test = counter.transform(post_test)#transform the testing data

# Decision Tree Classifier

In [28]:
from sklearn.tree import DecisionTreeClassifier

#train classifier
clf = DecisionTreeClassifier()

#train classifier on the same datasets
clf.fit(counts_train,labels_train)

#use hard voting to predict (majority voting)
pred=clf.predict(counts_test)

#print accuracy
print (accuracy_score(pred,labels_test))

0.9453333333333334


# K Neighbors Classifier

In [29]:
from sklearn.neighbors import KNeighborsClassifier

#train classifier
KNN_classifier=KNeighborsClassifier()

#train classifier on the same datasets
KNN_classifier.fit(counts_train,labels_train)

#use hard voting to predict (majority voting)
pred=KNN_classifier.predict(counts_test)

#print accuracy
print (accuracy_score(pred,labels_test))

0.943


In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
import csv

counter = CountVectorizer()
counts_all = counter.fit_transform(new_data)

def classify(data, labels):
    """
    This function classifies unseen dataset.
    
    INPUT: list of job ostings where each element is a posting
    OUTPUT: prints the accuracy of the classification
    """
    
    data_prep = preprocess_data(data)
    cv_valid = CountVectorizer(vocabulary=counter.get_feature_names())
    valid_data = cv_valid.fit_transform(data_prep)
    pred = clf.predict(valid_data)
    print(accuracy_score(pred,labels))
    with open('pred_labels.txt', 'w', newline ='') as f:
        # using csv.writer method from CSV package
        write = csv.writer(f)
        for i in pred:
            write.writerows([[i]])
    
classify(post_valid, labels_valid)

0.9581749049429658
