# Exploring Data

In [77]:
import pandas as pd

In [78]:
g = open('data_postings.txt','r')
postings = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('data_labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [79]:
len(postings)

10526

In [80]:
postings[0]

'"Bachelor’s degree, or equivalent experience, in Computer Science, Engineering, Mathematics or a related field5+ years’ experience of Data platform implementation, including 3+ years of hands-on experience in implementation and performance tuning Kinesis/Kafka/Spark/Storm implementations.Experience with analytic solutions applied to the Marketing or Risk needs of enterprisesBasic understanding of machine learning fundamentals.Ability to take Machine Learning models and implement them as part of data pipeline5+ years of IT platform implementation experience.Experience with one or more relevant tools ( Flink, Spark, Sqoop, Flume, Kafka, Amazon Kinesis ).Experience developing software code in one or more programming languages (Java, JavaScript, Python, etc).Current hands-on implementation experience requiredAt Amazon Web Services (AWS), we’re hiring highly technical cloud computing architects and engineers to collaborate with our customers and partners on key engagements. Our consultants

In [81]:
labels[0]

'DS'

In [28]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + postings[i][:80] + "...")
    
print("labels \t: \tpostings\n")
pretty_print_review_and_label(2137)
pretty_print_review_and_label(5816)
pretty_print_review_and_label(0)
pretty_print_review_and_label(5934)
pretty_print_review_and_label(5297)
pretty_print_review_and_label(4998)

labels 	: 	postings

DS	:	"City: Palo AltoState/Province: CaliforniaCountry: United StatesDepartment: Rese...
DS	:	"When you join VerizonVerizon is a leading provider of technology, communication...
DS	:	"Bachelor’s degree, or equivalent experience, in Computer Science, Engineering, ...
DS	:	"Company Overview: Lennox International (LII) is a leading global provider of in...
DS	:	"Data ScientistDo you want to build your brand by working for a leading consulti...
DS	:	"Job OverviewProvides expertise in statistical methodologies and project managem...


# Finding correlations and removing meaningless words

In [29]:
from collections import Counter
import numpy as np

In [30]:
se_counts = Counter()
ds_counts = Counter()
total_counts = Counter()

In [31]:
for posting in range(len(postings)):
    if labels[posting] == 'SE':
        for word in postings[posting].split(" "):
            se_counts[word] += 1
            total_counts[word] += 1
    else:
        for word in postings[posting].split(" "):
            ds_counts[word] += 1
            total_counts[word] += 1

In [32]:
# the counts of the most common words in se_counts
se_counts.most_common()[:10]

[('and', 154509),
 ('to', 84117),
 ('the', 68386),
 ('of', 64404),
 ('a', 49559),
 ('in', 47594),
 ('with', 43243),
 ('for', 31435),
 ('or', 26971),
 ('is', 22593)]

In [33]:
# the counts of the most common words in ds_counts
ds_counts.most_common()[:10]

[('and', 181858),
 ('to', 98522),
 ('the', 76322),
 ('of', 75216),
 ('in', 54041),
 ('a', 50989),
 ('with', 45047),
 ('data', 36878),
 ('for', 33969),
 ('or', 31630)]

In [34]:
se_ds_ratios = Counter()

for term, count in list(total_counts.most_common()):
    if count > 100:
        se_ds_ratio = se_counts[term] / float(ds_counts[term]+1)
        se_ds_ratios[term] = se_ds_ratio

In [35]:
print("se-to-ds ratio for 'and' = {}".format(se_ds_ratios["and"]))
print("se-to-ds ratio for 'to' = {}".format(se_ds_ratios["to"]))
print("se-to-ds ratio for 'analytical' = {}".format(se_ds_ratios["analytical"]))
print("se-to-ds ratio for 'Software' = {}".format(se_ds_ratios["Software"]))
print("se-to-ds ratio for 'Statistics' = {}".format(se_ds_ratios["Statistics"]))

se-to-ds ratio for 'and' = 0.849608762832744
se-to-ds ratio for 'to' = 0.8537803355561645
se-to-ds ratio for 'analytical' = 0.2119972733469666
se-to-ds ratio for 'Software' = 11.89622641509434
se-to-ds ratio for 'Statistics' = 0.008771929824561403


In [36]:
for word, ratio in list(se_ds_ratios.most_common()):
    if(ratio > 2):
        se_ds_ratios[word] = np.log(ratio)
    else:
        se_ds_ratios[word] = -np.log(1 / (ratio+0.01))

In [37]:
print("se-to-ds ratio for 'and' = {}".format(se_ds_ratios["and"]))
print("se-to-ds ratio for 'to' = {}".format(se_ds_ratios["to"]))
print("se-to-ds ratio for 'analytical' = {}".format(se_ds_ratios["analytical"]))
print("se-to-ds ratio for 'Software' = {}".format(se_ds_ratios["Software"]))
print("se-to-ds ratio for 'Statistics' = {}".format(se_ds_ratios["Statistics"]))

se-to-ds ratio for 'and' = -0.15127792018390745
se-to-ds ratio for 'to' = -0.14643678375730732
se-to-ds ratio for 'analytical' = -1.505090179406156
se-to-ds ratio for 'Software' = 2.4762212418528526
se-to-ds ratio for 'Statistics' = -3.9753926193607354


In [38]:
se_ds_ratios.most_common()[:30]

[('Tanzu', 5.262690188904886),
 ('Relic', 5.247024072160486),
 ('Cox', 5.087596335232384),
 ('Macy’s', 4.787491742782046),
 ('REST,', 4.42484663185681),
 ('.Net', 3.982792076316353),
 ('ASP.NET', 3.979681653901961),
 ('observability', 3.8110970868381857),
 ('Observability', 3.784189633918261),
 ('Android', 3.5094109663023882),
 ('Framework,', 3.506557897319982),
 ('.NET', 3.46989391294839),
 ('test-driven', 3.4231762883809305),
 ('TypeScript,', 3.349904087274605),
 ('frontend', 3.2843754224853225),
 ('Node.js', 3.2771447329921766),
 ('Selenium', 3.2386784521643803),
 ('MVC', 3.228826155721369),
 ('MVC,', 3.164067588373206),
 ('Angular', 3.0719214119115374),
 ('JavaScript', 3.0529898888225215),
 ('client-side', 3.044522437723423),
 ('iOS', 3.030417831541881),
 ('Ruby', 3.0226397264739155),
 ('microservice', 2.8779492378976075),
 ('CIS,', 2.849880396541428),
 ('bug', 2.817040484810615),
 ('React', 2.799311792380535),
 ('C#', 2.7653510549395506),
 ('JS', 2.7454377331738304)]

In [39]:
list(reversed(se_ds_ratios.most_common()))[0:30]

[('PCG', -4.605170185988092),
 ('AstraZeneca', -4.605170185988092),
 ('hypotheses', -4.605170185988092),
 ('descriptive', -4.605170185988092),
 ('ADaM', -4.605170185988092),
 ('Biostatistics,', -4.605170185988092),
 ('SDTM', -4.605170185988092),
 ('CDISC', -4.605170185988092),
 ('GSK', -4.605170185988092),
 ('Statistician', -4.605170185988092),
 ('Bayesian', -4.605170185988092),
 ('Scientist,', -4.605170185988092),
 ('Scientist"', -4.297181714570609),
 ('Economics,', -4.176796587083057),
 ('Scientist', -4.152888017834711),
 ('statistical,', -4.126766354617089),
 ('"Data', -4.096997083671232),
 ('SPSS,', -4.02440725496528),
 ('semester', -4.006513848849068),
 ('Statistical', -3.9984927586164183),
 ('Analyst"', -3.98214867143405),
 ('Statistics', -3.9753926193607354),
 ('sentiment', -3.9670827826111736),
 ('statistical', -3.8395106934183167),
 ('supervised', -3.787232232614358),
 ('causal', -3.730284369082059),
 ('Science"""', -3.6545547115728616),
 ('Statistics,', -3.619854608717692),
 

In [40]:
#! pip install nltk

import nltk
import re
from nltk.tokenize import word_tokenize
nltk.download('punkt')

new_data = []

for line in range(len(postings)):
    
    word_list = ''
    
    for word in postings[line].split(" "):
        if se_ds_ratios[word] > 1 or se_ds_ratios[word] < -1:
            word_list += word + ' '
    word_list = re.sub(r"[^a-zA-Z0-9]", " ", word_list.lower())
    word_list = list(set(word_tokenize(word_list)))
    sentence = " ".join(word_list)
    new_data.append(sentence)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tigra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Final input

In [70]:
new_data[0]

'scalability learning applied professional engagements machine kafka phd physics ml javascript sql scientists emr pytorch data risk analytics specialist sets software mathematics analytic models engineer answer tensorflow code marketing deep consultants web java spark'

In [71]:
labels[0]

'DS'

In [72]:
new_data[1]

'software learning ml iot scientists approaches data models machine environmental engineer c algorithms train embedded novel processing mobile'

In [73]:
labels[1]

'SE'

In [44]:
len(new_data)

10526

# Building the model

In [63]:
post_train, labels_train = new_data[:10000], labels[:10000]
post_test, labels_test = new_data[10000:], labels[10000:]

In [64]:
len(post_train)

10000

In [65]:
len(post_test)

526

In [83]:
#Build a counter based on the training dataset
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

counter = CountVectorizer()
counter.fit(new_data)

CountVectorizer()

In [84]:
#count the number of times each term appears in a document and transform each doc into a count vector
counts_train = counter.transform(post_train)#transform the training data
counts_test = counter.transform(post_test)#transform the testing data

# Decision Tree Classifier

In [85]:
from sklearn.tree import DecisionTreeClassifier

#train classifier
clf = DecisionTreeClassifier()

#train all classifier on the same datasets
clf.fit(counts_train,labels_train)

#use hard voting to predict (majority voting)
pred=clf.predict(counts_test)

#print accuracy
print (accuracy_score(pred,labels_test))

0.9562737642585551


# K Neighbors Classifier

In [86]:
from sklearn.neighbors import KNeighborsClassifier
KNN_classifier=KNeighborsClassifier()
KNN_classifier.fit(counts_train,labels_train)
pred=KNN_classifier.predict(counts_test)
print (accuracy_score(pred,labels_test))

0.9524714828897338
