In [178]:
# imports
import pandas as pd
import numpy as np
from google.cloud import storage
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

import nltk

import geopy
from geopy.geocoders import Nominatim

import folium
from folium import FeatureGroup, LayerControl, Map, Marker
from folium.plugins import HeatMap
from folium.plugins import TimestampedGeoJson
from folium.plugins import MarkerCluster

%matplotlib inline

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (8,6)

In [None]:
#!pip install geopy
#!pip install folium
#!pip install nltk

# Load data

In [179]:
# Get files from GCS bucket

BUCKET_NAME = 'salary-data'

client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

blobs = bucket.list_blobs()
files = []
for blob in blobs:
    files.append(blob.name)

print(files)

['Location_Tree.csv', 'Test_rev1.csv', 'Train_rev1.csv', 'Valid_rev1.csv', 'mean_benchmark.csv', 'random_forest_benchmark_test_rev1.csv', 'test.csv']


In [180]:
# Read train set from bucket
train_set_name = files[2]

df_train = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,train_set_name))

In [None]:
df_benchmark = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[4]))
df_location_tree = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[0]))
df_rf = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[-2]))

In [None]:
df_train.head()
#df_location_tree

# Visualise and analyse

## Check salary distribution

In [None]:
fig = df_train.hist(column = 'SalaryNormalized', bins = 10)


## Check for any missing values

In [None]:
# Check for NaN

df_train.isnull().sum()
#df_train.isna().sum()

#df_train.info()

## Check the locations

In [None]:
raw_cities = df_train.LocationRaw.unique()
norm_cities = df_train.LocationNormalized.unique()

print('Number of unique raw locations = {}'.format(len(raw_cities)))
print('Number of unique normalised locations = {}'.format(len(norm_cities)))

In [None]:
raw_counts = df_train.LocationRaw.value_counts()
norm_counts = df_train.LocationNormalized.value_counts()

print('Top raw locations \n')
print(raw_counts[0:10] )
print('\n')
print('Top normalised locations \n')
print(norm_counts[0:10])


In [None]:
geolocator = Nominatim(user_agent='myapplication')


In [None]:
location = geolocator.geocode("Surrey")
coordinates = [location.latitude,location.longitude]
print(coordinates)

In [None]:
drop_map = folium.Map(location = [51.50,-0.13],zoom_start = 5,)

folium.CircleMarker(coordinates,
                    radius = 1.5,                    
                    color = 'red', 
                    fill_opacity=0.5
                   ).add_to(drop_map)
drop_map

## Check the Job titles/ company/ category

In [None]:
uni_title = df_train.Title.unique()
uni_cat = df_train.Category.unique()
uni_comp = df_train.Company.unique()

print('Number of unique titles= {}'.format(len(uni_title)))
print('Number of unique categories = {}'.format(len(uni_cat)))
print('Number of unique companies = {}'.format(len(uni_comp)))

In [None]:
title_counts = df_train.Title.value_counts()
category_counts = df_train.Category.value_counts()
company_counts = df_train.Company.value_counts()

# norm_counts = df_train.LocationNormalized.value_counts()


print('Top titles \n')
print(title_counts[0:10] )
print('\n')
print('Top categories \n')
print(category_counts[0:10])
print('\n')
print('Top companies \n')
print(company_counts[0:10])

## Check the difference between contract time and types

In [None]:
type_contract = df_train.ContractType.unique()
time_contract = df_train.ContractTime.unique()

print(type_contract)
print(time_contract)

In [None]:
df_train.columns

In [None]:
# Remove nans and check if there is a difference in average salary between contracts

df_no_nan_type = df_train[df_train.ContractType.notna()]
df_no_nan_time = df_train[df_train.ContractTime.notna()]

df_avg_type = df_no_nan_type.groupby(['ContractType']).mean()
df_avg_time = df_no_nan_time.groupby(['ContractTime']).mean()


In [None]:
df_avg_type = df_avg_type.reset_index()
df_avg_time = df_avg_time.reset_index()

In [None]:
plt.rcParams["figure.figsize"] = (15,5)
fig, axes = plt.subplots(nrows=1, ncols=2)

df_avg_type.plot.bar(x ='ContractType',y='SalaryNormalized', ax = axes[0])

df_avg_time.plot.bar(x ='ContractTime',y='SalaryNormalized', ax = axes[1])


 No big difference between contract or permanent. There does seem to be a difference between full time and partime.

## Filling in NaN values

In [None]:
df_train.ContractType = df_train.ContractType.fillna('Missing')
df_train.ContractTime = df_train.ContractTime.fillna('Missing')
df_train.Company = df_train.Company.fillna('Missing')
df_train = df_train.dropna()
df_train.isnull().sum()


# NLP of the description

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer 

stop_words = stopwords.words('english')
stop_words.append('k')
tokenizer = RegexpTokenizer(r'\w+')

## Define functions for preprocessing

In [181]:
def make_train_test(df_train, size, split):
    
    train_size =slice( 0,round(size*split))
    test_size = slice(round(size*split)+1 ,size+1)

    description = df_train.FullDescription
    Salary = df_train.SalaryNormalized

    train = description[train_size]
    test = description[test_size]

    y_train = Salary[train_size]
    y_test = Salary[test_size]
    
    return( train, test , y_train, y_test)

In [None]:
#remove stop words
def remove_stop_words(bag, unique):
    # tokenize
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(bag)
    
    words = []
    for word in word_tokens:
        words.append(word.lower())
    # Get unique words only
    if unique:
        words_set = set(words)
        words = list(words_set)
    
    no_stop_words = [w for w in words if not w in stop_words] 
    
    return no_stop_words

In [None]:
# Lemmatize the words  

def lemmatize_words(no_stop_words):
    
    lemma = WordNetLemmatizer() 
    lemma_words = []

    for w in no_stop_words:
        lemma_words.append(lemma.lemmatize(w))
    
    return lemma_words

In [None]:
def create_vocab(train):
    vocab = []
    for index, row in train.items():

        no_stop_words = remove_stop_words(row, unique=False)
        lemma_words = lemmatize_words(no_stop_words)

        vocab.append(lemma_words)
    return vocab

In [183]:
def get_max_length(X_train, X_test):
    lengths =[]
    for x in X_test:
        lengths.append(len(x))
    for x in X_train:
        lengths.append(len(x))

    max_length = max(lengths)
    
    return max_length

In [235]:
def prep_data(train,test): 
    # Preprocess train set and make a vocabulary
    vocab = create_vocab(train)

    # create dictionary of text
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(vocab)

    # Preprocess train and test set
    pre_train = create_vocab(train)
    pre_test = create_vocab(test)

    # Convert text to indexed list
    X_train = tokenizer.texts_to_sequences(pre_train)
    X_test = tokenizer.texts_to_sequences(pre_test)
    vocab = tokenizer.word_index
    
    return X_train, X_test, vocab

In [236]:
def padding(*argv): # X_train, X_test
    if len(argv) == 1:
        X_train = argv[0]
        
        lengths =[]
        for x in X_train:
            lengths.append(len(x))
            
        max_length = max(lengths)
            
        X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train,maxlen=max_length, padding='post')
        
        return X_train_padded
      
    else:
        X_train = argv[0]
        X_test = argv[1]
        max_length = get_max_length(X_train, X_test)
        
        X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train,maxlen=max_length, padding='post')
        X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_length, padding='post')
        
        return X_train_padded, X_test_padded

## Prepare input data

In [237]:
# Create train and test sets

size = 100
split = 0.8

train, test, y_train, y_test = make_train_test(df_train, size, split)

X_train, X_test, vocab = prep_data(train,test)

X_train_padded, X_test_padded = padding(X_train, X_test)

## Prepare target data

In [239]:
# Train on whether salary is > 25000 or not

y_binary_train = y_train < 25000
y_binary_train = (y_binary_train*1).values

y_binary_test = y_test < 25000
y_binary_test = (y_binary_test*1).values

In [240]:
# Check ratio of 0s and 1s

print('Train ratio is {}'.format(sum(y_binary_train)/len(y_binary_train)))
print('Test ratio is {}'.format(sum(y_binary_test)/len(y_binary_test)))

Train ratio is 0.4
Test ratio is 0.65


# Classification model

In [241]:
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Flatten
from keras.layers import Dense

max_len = X_train_padded.shape[1]
vocab_size = len(vocab) + 1

In [242]:
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_len))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 390, 8)            17672     
_________________________________________________________________
flatten_14 (Flatten)         (None, 3120)              0         
_________________________________________________________________
dense_13 (Dense)             (None, 1)                 3121      
Total params: 20,793
Trainable params: 20,793
Non-trainable params: 0
_________________________________________________________________
None


## Train model

In [243]:
# fit the model
model.fit(X_train_padded, y_binary_train, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_binary_test, verbose=0)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 89.999998


# Regression Model

In [186]:
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [216]:
size = 1000
split = 1  # use a k-fold cross validation instead of a simple split

train, test, y_train, y_test = make_train_test(df_train, size, split)

X_train = prep_data(train)

X_train_padded = padding(X_train)

In [None]:
def regression_model(vocab_size = vocab_size,max_len = max_len):
    
    model = Sequential()
    model.add(Embedding(vocab_size, 8, input_length=max_len))
    model.add(Flatten())
    model.compile(loss='mean_squared_error', optimizer='adam')
    
    return model


In [None]:
# evaluate model
estimator = KerasRegressor(build_fn=regression_model, epochs=50, batch_size=1, verbose=0)
kfold = KFold(n_splits=10)

results = cross_val_score(estimator, X, Y, cv=kfold)

print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))