In [None]:
#!pip install --upgrade gensim

In [6]:
# imports
import pandas as pd
import numpy as np
from google.cloud import storage
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
import gensim.downloader as api

%matplotlib inline

In [7]:
# Get files from GCS bucket
BUCKET_NAME = 'salary-data'
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

blobs = bucket.list_blobs()
files = []
for blob in blobs:
    files.append(blob.name)
    
print(files)

['2011_Travel_to_Work_Areas_summary_statistics_V5.csv', 'Location_Tree.csv', 'Test_rev1.csv', 'Train_rev1.csv', 'Valid_rev1.csv', 'X_test_padded', 'X_train_padded', 'X_val_padded', 'data_location_buckets.csv', 'mean_benchmark.csv', 'random_forest_benchmark_test_rev1.csv', 'test.csv', 'vocab_size']


In [8]:
# Read data
df_TTWA = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[0]))
df_train = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,files[3]))

In [None]:
from functions_6 import TTWA_county_feature
df_loc = TTWA_county_feature(df_train,df_TTWA,True) 


In [None]:
TTWA_county = df_loc.TTWA_County
df_all = pd.concat([df_train, TTWA_county], axis = 1)
df_all

In [None]:
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)
bucket.blob('data_location_buckets.csv').upload_from_string(df_all.to_csv(), 'text/csv')

## Deal with NaN values

In [10]:
# Fill in missing values for Contract and Company
df_all =df_train
df_all.ContractType = df_train.ContractType.fillna('Missing')
df_all.ContractTime = df_train.ContractTime.fillna('Missing')
df_all.Company = df_train.Company.fillna('Missing')
# Drop other NaN values
df_all = df_all.dropna()
df_all.isnull().sum()

Id                    0
Title                 0
FullDescription       0
LocationRaw           0
LocationNormalized    0
ContractType          0
ContractTime          0
Company               0
Category              0
SalaryRaw             0
SalaryNormalized      0
SourceName            0
dtype: int64

## NLP of Description

In [9]:
import functions_12 as fn
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

In [11]:
df_train = df_all

In [12]:
def first_NLP(df):
    word_list = []
    for index, row in df.iterrows():
        bag = row.FullDescription
        ns = fn.remove_stop_words(bag, unique = False)
        lemma = fn.lemmatize_words(ns)
        word_list.append(lemma)
    return word_list

In [13]:
# Create a train, val, test split

X_train_full, X_test, y_train_full, y_test = train_test_split(
                                df_train.drop(['SalaryNormalized','SalaryRaw'], axis =1),
                                df_train.SalaryNormalized, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(
                                                X_train_full,
                                                y_train_full, test_size=0.2, random_state=42)

In [14]:
# Remove grammer, capitals, and stop words
X_train_words = first_NLP(X_train)
X_val_words = first_NLP(X_val)
X_test_words = first_NLP(X_test)

In [15]:
# create dictionary of only the TRAIN text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_words)
word_dictionary = tokenizer.word_index
size_of_vocabulary=len(tokenizer.word_index) + 1 #+1 for padding

In [None]:
# Convert text to indexed list
X_train_seq = tokenizer.texts_to_sequences(X_train_words)
X_val_seq = tokenizer.texts_to_sequences(X_val_words)
X_test_seq = tokenizer.texts_to_sequences(X_test_words)

In [17]:
# Pad sequences

max_length = fn.get_max_length(X_train_seq, X_test_seq, X_val_seq)

X_train_padded = tf.keras.preprocessing.sequence.pad_sequences(X_train_seq,maxlen=max_length, padding='post')
X_test_padded = tf.keras.preprocessing.sequence.pad_sequences(X_test_seq,maxlen=max_length, padding='post')
X_val_padded = tf.keras.preprocessing.sequence.pad_sequences(X_val_seq,maxlen=max_length, padding='post')

## Save/ Load data

In [20]:
import json
data = tokenizer.word_index
file = './word_index.json' 

In [None]:
with open(file, 'w') as f: 
    json.dump(data, f)

In [21]:
with open(file) as f:
    data = json.load(f)

In [32]:
# df_X_train =  pd.DataFrame(X_train_padded)
# df_X_test = pd.DataFrame(X_test_padded)
# df_X_val = pd.DataFrame(X_val_padded)
# df_size_vocab = pd.DataFrame([size_of_vocabulary])

# bucket.blob('X_train_padded').upload_from_string(df_X_train.to_csv())
# bucket.blob('X_test_padded').upload_from_string(df_X_test.to_csv())
# bucket.blob('X_val_padded').upload_from_string(df_X_val.to_csv())
# bucket.blob('vocab_size').upload_from_string(df_size_vocab.to_csv())

df_X_train = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,'X_train_padded'))
df_X_test = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,'X_test_padded'))
df_X_val = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,'X_val_padded'))
df_size_vocab = pd.read_csv('gs://{}/{}'.format(BUCKET_NAME,'vocab_size'))

X_train_padded = df_X_train.to_numpy()[:,1:]
X_test_padded = df_X_test.to_numpy()[:,1:]
X_val_padded = df_X_val.to_numpy()[:,1:]
size_of_vocabulary = df_size_vocab.to_numpy()[0][1]

## Load the GloVe embeddings and make a dictionary

In [51]:
# Load word embeddings from GloVe
glove_model = api.load('glove-twitter-25')

In [110]:
embeddings_index[word] = dict()
glove_dict = glove_model.vocab

for key, value in glove_dict.items():
    word = key
    coef = glove_model[key]
    embeddings_index[word] = coef

In [114]:
# print(glove_model.vectors[value.index]) 
# print(glove_model[key]) # array
# print(key)# word
# print(value.index) # 
# print(embeddings_index['i'])
# print(glove_model['i'])

In [158]:
# create a weight matrix for words in training docs
embedding_matrix = np.zeros((size_of_vocabulary, 25))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Create model

In [184]:
from tensorflow import keras
import tensorflow as tf
from keras import layers

In [3]:
model =Sequential()
model.add(layers.Embedding(size_of_vocabulary, 25, weights=[embedding_matrix], input_length=max_length, trainable=False))
model.add(layers.Flatten())
#model.add(layers.Dense(10, activation = 'relu'))
model.add(layers.Dense(1, activation='linear'))
model.compile(loss='mean_squared_error', optimizer='adam')
print(model.summary())

NameError: name 'Sequential' is not defined

## Train model 

In [2]:
# fit the model
model.fit(X_train_padded, y_train.values, epochs=10, verbose=0,
         validation_data=(X_val_padded, y_val.values))

NameError: name 'model' is not defined

## Evaluate model 

In [None]:
# evaluate the model
loss = model.evaluate(X_test_padded, y_val.values, verbose=0)
print('MSE: {}'.format(loss))