In [4]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import storage
import nltk
from nltk.tokenize import RegexpTokenizer

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

from keras.layers import Dense
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras import layers
from keras import Model
from keras import Sequential
from keras import Input

import functions_14 as fn

%matplotlib inline

Using TensorFlow backend.


# Use multiple features for ML model

## First prep the feature data

In [5]:
# Load data
BUCKET_NAME = 'salary-data'
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
df_train = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,'data_location_buckets.csv'))

In [6]:
df_train.columns

Index(['Unnamed: 0', 'Id', 'Title', 'FullDescription', 'LocationRaw',
       'LocationNormalized', 'ContractType', 'ContractTime', 'Company',
       'Category', 'SalaryRaw', 'SalaryNormalized', 'SourceName',
       'TTWA_County'],
      dtype='object')

In [7]:
df_sub = df_train[['Title', 'FullDescription','ContractType', 'ContractTime', 'Company',
       'Category', 'TTWA_County','SalaryNormalized']]
df_sub.ContractType = df_sub.ContractType.fillna('Missing')
df_sub.ContractTime = df_sub.ContractTime.fillna('Missing')
df_sub.Company = df_sub.Company.fillna('Missing')
# Drop other NaN values
df_sub = df_sub.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [8]:
df_sub.columns

Index(['Title', 'FullDescription', 'ContractType', 'ContractTime', 'Company',
       'Category', 'TTWA_County', 'SalaryNormalized'],
      dtype='object')

## NLP of Description

In [9]:
X_words = fn.first_NLP(df_sub)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_words)
word_dictionary = tokenizer.word_index
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding
X_words_seq = tokenizer.texts_to_sequences(X_words)

lengths = []
for x in X_words_seq:
    lengths.append(len(x))
max_length = max(lengths)

X_words_padded = tf.keras.preprocessing.sequence.pad_sequences(X_words_seq,maxlen=max_length, padding='post')

In [10]:
X_words_padded.shape

(195896, 1267)

## Hot encode location

In [11]:
enc = OneHotEncoder()
X_location = df_sub.TTWA_County.values
X_location = enc.fit_transform(X_location.reshape(-1, 1))

## Create train/test split

In [12]:
y = df_sub.SalaryNormalized

In [13]:
X_train_full, X_test, y_train_full, y_test1 = train_test_split( X_words_padded, y, test_size=0.2, random_state=42)

X_train_words, X_val_words, y_train, y_val = train_test_split( X_train_full, y_train_full, test_size=0.2, random_state=42)

In [14]:
X_train_full, X_test, y_train_full, y_test2 = train_test_split( X_location, y, test_size=0.2, random_state=42)

X_train_location, X_val_location, y_train, y_val = train_test_split( X_train_full, y_train_full, test_size=0.2, random_state=42)

## Build functional Model with multiple inputs

In [15]:
from keras.layers import Embedding, Flatten, concatenate

In [16]:
Input_embed = Input(shape = (X_train_words.shape[1],))
Input_location = Input(shape = (X_train_location.shape[1],))

# First build the Embedding branch

x = Embedding(size_of_vocabulary, 25,  input_length=max_length)(Input_embed)
x = Conv1D(filters=100, kernel_size=5, activation='relu')(x)
x = MaxPooling1D(pool_size=2)(x)
x = Flatten()(x)
x = Dense(30, activation = 'relu')(x)
x = Model(inputs = Input_embed, outputs = x)

# Build the location branch

y = Dense(10, activation='relu')(Input_location)
y = Dense(1, activation='linear')(y)
y = Model(inputs = Input_location, outputs = y)

# Combine the models

combined = concatenate([x.output, y.output])

z = Dense(2, activation="relu")(combined)
z = Dense(1, activation="linear")(z)

model = Model(inputs=[x.input, y.input], outputs=z)

In [17]:
model.compile(loss="mean_squared_error", optimizer='Adam')

model.fit(
    [X_train_words, X_train_location.todense()], y_train,
    validation_data=([X_val_words, X_val_location.todense()], y_val),
    epochs=10, batch_size=8)

Epoch 1/10
 2646/15672 [====>.........................] - ETA: 20:03 - loss: 243563728.0000

KeyboardInterrupt: 