In [None]:
import pandas as pd
import numpy as np
import requests
import bs4
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import tqdm
from tqdm import tqdm_notebook
import time
from time import sleep
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer

In [None]:
# Search locations, already URL-encoded for the query string
# ("%2C" is an encoded comma and "+" stands for a space).
cities = [
    'Toronto%2C+ON',
    'Waterloo%2C+ON',
    'Montreal%2C+QC',
    'Vancouver%2C+BC',
    'Calgary%2C+AB',
    'Ottawa%2C+ON',
    'Québec+City%2C+QC',
    'Brampton%2C+ON',
    'Halifax%2C+NS',
    'Hamilton%2C+ON',
    'Surrey%2C+BC',
    'Victoria%2C+BC',
    'Kitchener%2C+ON',
    'Winnipeg%2C+MB',
    'Edmonton%2C+AB',
]
len(cities)

In [None]:
# Scrape Indeed result cards for every entry in `cities`, paging 100 results at a
# time up to `max_results_per_city`, and collect the raw card elements in `results`.

# `tqdm.notebook` is a submodule; a plain `import tqdm` is not guaranteed to load
# it, so `tqdm.notebook.tqdm` can fail with AttributeError on a fresh kernel.
# Importing the progress bar explicitly avoids that.
from tqdm.notebook import tqdm as notebook_tqdm

max_results_per_city = 1000
header = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36" ,'referer':'https://www.google.com/'}

results = []
for city in notebook_tqdm(cities):
    for start in range(0, max_results_per_city, 100):
        url = "https://ca.indeed.com/jobs?as_and=data+scientist&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&salary=&radius=25&l="+city+"&fromage=any&limit=100&start="+str(start)+"&sort=&psf=advsrch"
        # Without a timeout a single stalled connection would hang the whole scrape.
        html = requests.get(url, headers=header, timeout=30)
        soup = BeautifulSoup(html.content, 'html.parser')
        # A page that loads the reCAPTCHA script means we were blocked: give up on
        # this city (the outer loop moves on to the next one).
        if len(soup.find_all('script', {'src':'https://www.google.com/recaptcha/api.js'})) == 1:
            print('Captcha error(', city,'): Retry in some time or use proxy.')
            break
        results.extend(soup.find_all('div', {'class':'jobsearch-SerpJobCard unifiedRow row result'}))
        # Pause between requests to stay polite to the server.
        sleep(2)

In [None]:
# Keep a shallow copy of the scraped cards. A plain assignment would only alias
# the same list, so any in-place change to `results` would alter the "backup" too.
results_backup = list(results)

In [None]:
# Empty table that will hold one row per scraped posting.
jobs = pd.DataFrame(
    columns=[
        'location',
        'title',
        'company',
        'salary',
        'summary',
    ]
)

In [None]:
def _find_text(row, tag, attrs, strip_newlines=True):
    """Return the text of the first `tag` element matching `attrs` inside `row`.

    Returns the string 'NA' when the element is missing (``row.find`` gives
    ``None``, so the attribute access raises ``AttributeError``). Any other
    exception is allowed to propagate instead of being silently swallowed.
    """
    try:
        text = row.find(tag, attrs).text
        return text.replace('\n', '') if strip_newlines else text
    except AttributeError:
        return 'NA'


def get_loc(row):
    """Location text of a result card, or 'NA' if absent (newlines are kept)."""
    return _find_text(row, 'span', {'class':'location'}, strip_newlines=False)


def get_comp(row):
    """Company name of a result card with newlines removed, or 'NA' if absent."""
    return _find_text(row, 'span', {'class':'company'})


def get_job(row):
    """Job title of a result card with newlines removed, or 'NA' if absent."""
    return _find_text(row, 'a', {'data-tn-element':'jobTitle'})


def get_sal(row):
    """Salary text of a result card with newlines removed, or 'NA' if absent."""
    return _find_text(row, 'span', {'class':'salaryText'})


def get_desc(row):
    """Summary text of a result card with newlines removed, or 'NA' if absent."""
    return _find_text(row, 'div', {'class':'summary'})


In [None]:
# Parse every scraped card into one record and build the table in a single step.
# Growing a DataFrame with `jobs.loc[len(jobs)] = ...` is quadratic in the number
# of rows and appends duplicates whenever the cell is re-run; constructing the
# frame from a list of records is linear and idempotent.
job_records = [
    [get_loc(result), get_job(result), get_comp(result), get_sal(result), get_desc(result)]
    for result in results
]
jobs = pd.DataFrame(job_records, columns=['location', 'title', 'company', 'salary', 'summary'])

In [None]:
# Preview the first five parsed postings.
jobs.head(5)

In [None]:
# (rows, columns) of the parsed postings table.
jobs.shape

In [None]:
# Remove rows that are identical across every column.
jobs = jobs.drop_duplicates()

In [None]:
# (rows, columns) of the postings table at this point in the notebook.
jobs.shape

In [None]:
# Persist the parsed postings (without the index column) as UTF-8 CSV.
jobs.to_csv('scraped_results.csv', index=False, encoding='utf-8')

In [None]:
# Reload the saved postings from disk into `salaries`.
salaries = pd.read_csv('scraped_results.csv')

In [None]:
# Keep only postings that list a salary. The frame loaded from the CSV is named
# `salaries`; the previous reference to `scrapped_data` was an undefined name.
# `.copy()` gives an independent frame so later column assignments do not raise
# SettingWithCopyWarning.
salaries = salaries[salaries.salary.notnull()].copy()

In [None]:
# (rows, columns) of the `salaries` frame.
salaries.shape

In [None]:
# Display the `salaries` frame for manual inspection.
salaries

In [None]:
# Exclude postings whose location is exactly 'Münchberg'.
salaries = salaries.loc[salaries['location'] != 'Münchberg']

In [None]:
# (substring, multiplier) pairs, tested in this order, used to convert a quoted pay
# period into a yearly figure. The hourly factor keeps the original 8 * 300
# (presumably 8 hours a day over 300 working days -- confirm this assumption).
_PERIOD_MULTIPLIERS = (
    ('an hour', 8 * 300),
    ('a week', 52),
    ('a month', 12),
    ('a year', 1),
)


def _annualise_salary(text):
    """Convert one salary string such as '$20 - $25 an hour' to a yearly amount.

    The pay-period phrase, '$' signs and the words 'from'/'From' are removed and
    the remainder is split on '-'. Exactly two parts are treated as a range and
    averaged; otherwise the first part is used on its own.

    Raises:
        ValueError: if the text contains none of the known pay periods, or a
            part that should be numeric cannot be parsed as a float. Skipping
            such rows silently would leave the result list shorter than the
            frame it is later assigned to.
    """
    for period, multiplier in _PERIOD_MULTIPLIERS:
        if period not in text:
            continue
        cleaned = (text.replace(period, '').replace('$', '')
                       .replace('from', '').replace('From', ''))
        bounds = cleaned.split('-')
        if len(bounds) == 2:
            amount = np.mean([float(bound.strip().replace(',', '')) for bound in bounds])
        else:
            amount = float(bounds[0].strip().replace(',', ''))
        return amount * multiplier
    raise ValueError('Unrecognised salary period in: {!r}'.format(text))


# One annualised value per row of `salaries`, in row order.
preprocess_salary = [_annualise_salary(text) for text in salaries.salary]


In [None]:
# Print every salary string that does not split on '-' into one or two parts.
for salary_text in salaries.salary:
    n_parts = len(salary_text.split('-'))
    if n_parts not in (1, 2):
        print(salary_text)

In [None]:
# Number of salary values that were parsed.
len(preprocess_salary)

In [None]:
# Replace the raw salary strings with the parsed numeric values. `assign` returns
# a new frame, which avoids the SettingWithCopyWarning that attribute assignment
# on a filtered slice triggers; a length mismatch still raises.
salaries = salaries.assign(salary=preprocess_salary)

In [None]:
# (rows, columns) of the `salaries` frame at this point.
salaries.shape

In [None]:
# Count rows that duplicate an earlier row across all columns.
salaries.duplicated().sum()

In [None]:
# Discard postings that have no location.
preprocessed_data = salaries.dropna(subset=['location'])

In [None]:
# Persist the cleaned data (without the index column) as UTF-8 CSV.
preprocessed_data.to_csv('final_data.csv', index=False, encoding='utf-8')

In [None]:
# Reload the cleaned data from disk; `salaries` now refers to this frame.
salaries = pd.read_csv('final_data.csv')

In [None]:
# Preview the first rows of the reloaded data.
salaries.head()

In [None]:
# Median of the salary column; the last line displays the value.
median_salary = np.median(salaries.salary)
median_salary

In [None]:
# Binary target: 1 when a posting pays strictly more than the median, else 0.
salaries['high_salary'] = [int(amount > median_salary) for amount in salaries['salary']]

In [None]:
# Print the row count first and leave `head()` as the final expression: only the
# last expression of a cell is rendered, so a `head()` call placed before the
# print was computed and then thrown away.
print(len(salaries))
salaries.head()

#### Only Location

In [None]:
# Number of postings per distinct location string.
salaries.location.value_counts()

In [None]:
# Split each location on commas; the first piece is the city and the second piece
# is the raw state/province text. Note that `cities` is rebound here.
location_parts = [loc.split(',') for loc in salaries.location]
cities = [parts[0] for parts in location_parts]
states = [parts[1] for parts in location_parts]


In [None]:
# Keep just the first run of word characters from each raw state string.
only_states = [re.search(r'\w+', state).group(0) for state in states]
print(len(only_states))

In [None]:
# Attach the parsed city names and state/province codes as new columns.
salaries['city'] = cities
salaries['state'] = only_states

In [None]:
# Build the "City, ST" label from the parsed `cities` / `only_states` lists rather
# than from `salaries.city` itself, so re-running this cell cannot append the
# state code a second time (e.g. "Toronto, ON, ON").
salaries['city'] = [city + ", " + state for city, state in zip(cities, only_states)]

In [None]:
# Print any state value that is longer than two characters.
for state_code in salaries['state']:
    if len(state_code) > 2:
        print(state_code)

In [None]:
# Preview the frame with the new city/state columns.
salaries.head()

In [None]:
# Number of distinct city labels.
salaries.city.nunique()

In [None]:
# Number of postings per city label.
salaries.city.value_counts()

In [None]:
# One indicator column per city label is the feature matrix; the high-salary flag
# is the target.
city_dummies = pd.get_dummies(salaries['city'])
X_city, y_city = city_dummies, salaries['high_salary']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_city, y_city, test_size=0.3, random_state=90)

In [None]:
# Fit a random forest on the city indicators and report hold-out accuracy plus a
# 10-fold cross-validation score over the full city matrix.
rfc = RandomForestClassifier(n_estimators=300, random_state=90)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
# Built-in round() works whether accuracy_score hands back a NumPy scalar or a
# plain Python float (which has no `.round` method).
print("Accuracy Score:", round(acc, 3))

s = cross_val_score(rfc, X_city, y_city, cv=10, n_jobs=-1)
print("Cross Validation Score:\t{:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3)))

In [None]:
# Pair each city indicator with its importance in the fitted forest.
feature_importances = pd.DataFrame({
    'feature': X_city.columns,
    'importance': rfc.feature_importances_,
})

# Median salary of the postings in each city, in column order.
feature_medians = [
    np.median(salaries[salaries.city == city_label].salary)
    for city_label in X_city.columns
]

feature_importances['median_salary'] = feature_medians
# 1 when the city's median is strictly above the overall median, else 0.
feature_importances['over_or_under'] = [
    int(city_median > median_salary)
    for city_median in feature_importances.median_salary
]

feature_importances.sort_values('importance', ascending=False).head(15)

#### Only summary

In [None]:
# Restrict to postings that have a summary; the summary text is the model input
# and the high-salary flag is the target.
salaries_w_desc = salaries.loc[salaries['summary'].notna()]

X_summ, y_summ = salaries_w_desc['summary'], salaries_w_desc['high_salary']

In [None]:
# Learn a bag-of-words vocabulary (English stop words removed) from the summaries.
# `fit` returns the vectorizer itself, so construction and fitting can be chained.
cv = CountVectorizer(stop_words="english").fit(X_summ)
cv

In [None]:
# Vocabulary size. `vocabulary_` maps each term to its column index, so its length
# is the feature count; unlike `get_feature_names()` (removed from current
# scikit-learn) this attribute is available in old and new releases alike.
len(cv.vocabulary_)

In [None]:
# Document-term matrix of the summaries as a DataFrame with one column per token.
# - The fitted vectorizer is `cv`; the previous `count_vec` was an undefined name.
# - `get_feature_names()` was removed from current scikit-learn, so prefer
#   `get_feature_names_out()` and fall back only on releases that lack it.
# - `toarray()` yields a plain ndarray instead of the `np.matrix` that `todense()` returns.
summ_feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                      else cv.get_feature_names())
X_summ_trans = pd.DataFrame(cv.transform(X_summ).toarray(), columns=summ_feature_names)

In [None]:
# Stratified 70/30 split of the summary token counts. Pass a plain ndarray
# (`.values`, as the cross-validation and combined-model cells already do):
# `np.asmatrix` produces an `np.matrix`, which current scikit-learn estimators
# reject when the training data is later passed to `fit`.
X_train, X_test, y_train, y_test = train_test_split(X_summ_trans.values, y_summ, test_size=0.3,
                                                    random_state=59, stratify=y_summ)

In [None]:
# Column-wise totals of the summary token matrix, then the 20 largest totals.
word_counts = X_summ_trans.sum(axis=0)
word_counts.sort_values(ascending = False).head(20)

In [None]:
# Save the per-token totals as UTF-8 CSV.
word_counts.to_csv('indeed-words.csv', encoding='utf-8')

In [None]:
# Fit a random forest on the summary token counts and report hold-out accuracy
# plus a 10-fold cross-validation score over the full token matrix.
rfc = RandomForestClassifier(n_estimators=200, random_state=59)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
# Built-in round() works whether accuracy_score hands back a NumPy scalar or a
# plain Python float (which has no `.round` method).
print("Accuracy Score:", round(acc, 3))

s = cross_val_score(rfc, X_summ_trans.values, y_summ.values, cv=10, n_jobs=-1)
print("Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3)))

In [None]:
# Pair each summary token with its importance in the fitted forest.
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_summ_trans.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

# Lower-case the summaries once. The loop previously re-lowered the whole column
# and rebuilt the same mask twice for every token.
summary_lower = salaries_w_desc.summary.str.lower()

feature_medians = []
feature_means = []
for token in X_summ_trans.columns:
    # `str.contains` is a pattern/substring search, so a token also matches
    # summaries where it appears inside a longer word.
    matching_salaries = salaries_w_desc[summary_lower.str.contains(token)].salary
    feature_medians.append(np.median(matching_salaries))
    feature_means.append(np.mean(matching_salaries))


feature_importances['median_salary'] = feature_medians
feature_importances['mean_salary'] = feature_means
# 1 when the token's median salary is strictly above the overall median, else 0.
feature_importances['over_or_under'] = [1 if i > median_salary else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)

#### Only title

In [None]:
# Same row selection as the summary model (rows with a non-null summary); here the
# job title is the model input and the high-salary flag is the target.
has_summary = salaries['summary'].notna()
salaries_w_desc = salaries[has_summary]

X_title, y_title = salaries_w_desc['title'], salaries_w_desc['high_salary']

In [None]:
# Learn a bag-of-words vocabulary (English stop words removed) from the titles.
# `fit` returns the vectorizer itself, so construction and fitting can be chained.
cv = CountVectorizer(stop_words="english").fit(X_title)
cv

In [None]:
# Document-term matrix of the titles as a DataFrame with one column per token.
# - `get_feature_names()` was removed from current scikit-learn, so prefer
#   `get_feature_names_out()` and fall back only on releases that lack it.
# - `toarray()` yields a plain ndarray instead of the `np.matrix` that `todense()` returns.
title_feature_names = (cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
                       else cv.get_feature_names())
X_title_trans = pd.DataFrame(cv.transform(X_title).toarray(), columns=title_feature_names)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_title_trans, y_title, test_size=0.3, random_state=59)

In [None]:
# Fit a random forest on the title token counts and report hold-out accuracy plus
# a 10-fold cross-validation score over the full token matrix.
rfc = RandomForestClassifier(n_estimators=200, random_state=59)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
# Built-in round() works whether accuracy_score hands back a NumPy scalar or a
# plain Python float (which has no `.round` method).
print("Accuracy Score:", round(acc, 3))

s = cross_val_score(rfc, X_title_trans.values, y_title.values, cv=10, n_jobs=-1)
print("Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3)))

In [None]:
# Pair each title token with its importance in the fitted forest.
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X_title_trans.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

# Lower-case the titles once. The loop previously re-lowered the whole column and
# rebuilt the same mask twice for every token.
title_lower = salaries_w_desc.title.str.lower()

feature_medians = []
feature_means = []
for token in X_title_trans.columns:
    # `str.contains` is a pattern/substring search, so a token also matches
    # titles where it appears inside a longer word.
    matching_salaries = salaries_w_desc[title_lower.str.contains(token)].salary
    feature_medians.append(np.median(matching_salaries))
    feature_means.append(np.mean(matching_salaries))


feature_importances['median_salary'] = feature_medians
feature_importances['mean_salary'] = feature_means
# 1 when the token's median salary is strictly above the overall median, else 0.
feature_importances['over_or_under'] = [1 if i > median_salary else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)

#### Combining Title CV, Summary CV, and Location

In [None]:
# Combine city indicators with the title and summary token counts.
# The source frame is `salaries`; the previous `data` was an undefined name.
# `reset_index()` gives these rows a 0..n-1 index, matching the default index of
# the two token-count frames so that `concat(axis=1)` lines the rows up.
salaries_w_desc = salaries[salaries.summary.notnull()].reset_index()
city_dummies = pd.get_dummies(salaries_w_desc.city)

X = pd.concat([city_dummies, X_title_trans, X_summ_trans], axis=1)
y = salaries_w_desc.high_salary

In [None]:
# Show the dimensions of the combined feature matrix and of the target.
for frame in (X, y):
    print(frame.shape)

In [None]:
# Stratified split holding out 30% of the rows; features are passed as a plain array via `.values`.
X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=0.3, random_state=1234, stratify=y)

In [None]:
# Fit a random forest on the combined features and report hold-out accuracy plus
# a 10-fold cross-validation score over the full combined matrix.
rfc = RandomForestClassifier(n_estimators=500, random_state=59)
rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
acc = accuracy_score(y_test, rfc_pred)
# Built-in round() works whether accuracy_score hands back a NumPy scalar or a
# plain Python float (which has no `.round` method).
print("Accuracy Score:", round(acc, 3))

s = cross_val_score(rfc, X.values, y.values, cv=10, n_jobs=-1)
print("Cross Validation Score: {:0.3} ± {:0.3}".format(s.mean().round(3), s.std().round(3)))

In [None]:
# Pair every combined feature (city, title token, summary token) with its
# importance in the fitted forest.
feature_importances = pd.DataFrame(rfc.feature_importances_,
                                   index = X.columns).reset_index()
feature_importances.columns = ['feature', 'importance']

# Lower-case each text column once rather than once per token inside the loops.
title_lower = salaries_w_desc.title.str.lower()
summary_lower = salaries_w_desc.summary.str.lower()

# Medians are appended in the same order the three blocks were concatenated into X.
feature_medians = []
for city_label in city_dummies.columns:
    feature_medians.append(np.median(salaries[salaries.city == city_label].salary))
for token in X_title_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[title_lower.str.contains(token)].salary))
for token in X_summ_trans.columns:
    feature_medians.append(np.median(salaries_w_desc[summary_lower.str.contains(token)].salary))

feature_importances['median_salary'] = feature_medians
# 1 when the feature's median salary is strictly above the overall median, else 0.
feature_importances['over_or_under'] = [1 if i > median_salary else 0 for i in feature_importances.median_salary]

feature_importances.sort_values('importance', ascending=False).head(20)