In [None]:
# Colab setup commands, disabled by wrapping them in a string literal:
# running this cell installs nothing.
'''!pip install tensorflow-gpu==2.0
!pip install tensorflow_hub
!pip install bert-for-tf2
!pip install sentencepiece'''

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
# Record the library versions this run uses.
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

In [None]:
import math
import os
import pickle

import bert
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow

In [None]:
# File locations (absolute Colab paths) and model dimensions shared by the cells below.
train_file = "/content/trainNew.csv"  # training CSV; its job_description, category and job_type columns are read
test_file = '/content/test.csv'  # test CSV; its job_description and job_no columns are read
jdFile = '/content/trainJD.txt'  # cleaned training job descriptions, one per line (written by saveJD)
checkpoint_path = '/content/ckpt/'  # reassigned in the training cell before it is used
max_seq_length = 512  # length every BERT input is truncated/padded to, [CLS] and [SEP] included
bertOutDim = 768  # number of embedding columns; input width of the classifier head
outDim = 11  # number of softmax outputs and width of the one-hot labels
#epochs = 1000

In [None]:
# Disabled helper (kept as a string literal, so it does nothing): extracts
# every .zip listed in /content/ into /content/.
'''import zipfile
import os
for file_name in os.listdir('/content/'):
  if file_name.endswith('.zip'):
    with zipfile.ZipFile(file_name,'r') as zip_dir:
      zip_dir.extractall(path='/content/')'''

In [None]:
# Disabled helper (kept as a string literal, so it does nothing): extracts
# every .zip found in the Drive temp folder into /content/.
'''import zipfile
import os
for file_name in os.listdir('/content/drive/My Drive/temp/'):
  if file_name.endswith('.zip'):
    with zipfile.ZipFile('/content/drive/My Drive/temp/'+file_name,'r') as zip_dir:
      zip_dir.extractall(path='/content/')'''

In [None]:
def getBERTModel():
    """Build a frozen BERT encoder and the tokenizer that matches it.

    Three int32 inputs of length ``max_seq_length`` (word ids, mask, segment
    ids) are fed to the TF-Hub BERT layer, which is loaded with
    ``trainable=False``. The Keras model exposes both outputs of that layer.

    Returns:
        dict: ``{'model': <keras Model>, 'tokenizer': <FullTokenizer>}``.
    """
    def _int_input(layer_name):
        # All three encoder inputs share the same shape and dtype.
        return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,name=layer_name)

    encoder_inputs = (
        _int_input("input_word_ids"),
        _int_input("input_mask"),
        _int_input("segment_ids"),
    )

    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
    #bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1",trainable=False)

    pooled_output, sequence_output = bert_layer(list(encoder_inputs))
    bert_model = Model(inputs=list(encoder_inputs), outputs=[pooled_output, sequence_output])

    # The tokenizer is built from the vocab file and casing flag shipped with the hub module.
    resolved = bert_layer.resolved_object
    tokenizer = bert.bert_tokenization.FullTokenizer(
        resolved.vocab_file.asset_path.numpy(),
        resolved.do_lower_case.numpy(),
    )

    return {'model':bert_model,'tokenizer':tokenizer}

In [None]:
def get_masks(tokens, max_seq_length):
    """Return the attention mask: 1 per real token, then 0 for each padding slot.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Return segment ids padded with 0 up to ``max_seq_length``.

    Tokens up to and including the first "[SEP]" get id 0; every token after
    it gets id 1.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_padding = max_seq_length - len(tokens)
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    separator_seen = False
    for tok in tokens:
        segment_ids.append(1 if separator_seen else 0)
        # The switch happens after the separator itself has been labelled.
        separator_seen = separator_seen or tok == "[SEP]"

    segment_ids += [0] * n_padding
    return segment_ids


def get_ids(tokens, tokenizer, max_seq_length):
    """Return the vocabulary ids of ``tokens``, right-padded with 0.

    Args:
        tokens: list of token strings.
        tokenizer: object providing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: length of the returned list.

    Returns:
        list: exactly ``max_seq_length`` ids.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This
            matches get_masks/get_segments; without the check the list
            would be returned unpadded and too long.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [None]:
def getEmbeddings(model,tokenizer,sentence):
    """Return the pooled BERT embedding of one sentence.

    The sentence is tokenized, truncated so that it fits in
    ``max_seq_length`` together with "[CLS]" and "[SEP]", converted to the
    three encoder inputs and run through ``model.predict`` as a batch of one.

    Args:
        model: object whose ``predict`` returns ``(pooled, sequence)`` outputs.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.
        sentence: text to embed.

    Returns:
        The first output of ``model.predict`` (the pooled embedding for the
        single-sentence batch).
    """
    stokens = tokenizer.tokenize(sentence)
    # Keep room for the two special tokens.
    stokens = ["[CLS]"] + stokens[:max_seq_length - 2] + ["[SEP]"]

    features = (
        get_ids(stokens, tokenizer, max_seq_length),
        get_masks(stokens, max_seq_length),
        get_segments(stokens, max_seq_length),
    )
    # Explicit int32 arrays of shape (1, max_seq_length): one array per model
    # input, rather than nested Python lists that Keras has to interpret.
    batch = [np.asarray([feature], dtype=np.int32) for feature in features]

    # pool_embs is the embedding of the CLS token;
    # all_embs contains the embeddings for the words of the input sentence.
    pool_embs, all_embs = model.predict(batch)
    return pool_embs

In [None]:
# Disabled wrapper around getEmbeddings (kept as a string literal; defines nothing).
'''def getSTrData(model,tokenizer,sentence):
    embs = getEmbeddings(model,tokenizer,sentence)
    return embs'''

In [None]:
def getTrainingData(model,tokenizer,nSentences):
    """Embed every sentence and stack the results row-wise.

    Args:
        model: encoder passed through to getEmbeddings.
        tokenizer: tokenizer passed through to getEmbeddings.
        nSentences: sequence of sentences.

    Returns:
        numpy.ndarray: the per-sentence embeddings concatenated along axis 0,
        or an empty 1-D array when ``nSentences`` is empty.
    """
    n = len(nSentences)
    rows = []
    for i in range(n):
        if i%100 == 0:
            print( 'Processing ',i,' out of ',n)
        rows.append(getEmbeddings(model,tokenizer,nSentences[i]))

    if not rows:
        return np.asarray([])
    # One concatenate at the end instead of re-copying the growing matrix on
    # every iteration.
    return np.concatenate(rows, axis=0)

In [None]:
def saveJD(df,fileName):
    """Write the job descriptions of ``df`` to ``fileName``, one per line.

    Each description is lower-cased and every run of whitespace, including
    line breaks and other Unicode line separators, is collapsed to a single
    space. Line breaks are replaced rather than deleted so that the words on
    either side are not fused, and no separator is left that a later
    ``splitlines()`` would treat as a row boundary.

    Args:
        df: object with a ``job_description`` attribute holding strings.
        fileName: path of the UTF-8 text file to (over)write.
    """
    JDs = list(df.job_description)
    print(len(JDs))
    with open(fileName, 'w',encoding="utf-8") as f:
        for strEle in JDs:
            ele = " ".join(strEle.split()).lower()
            f.write(ele)
            f.write('\n')

In [None]:
# Use the backend that belongs to tf.keras: the models in this notebook are
# built with tensorflow.keras, and the standalone `keras` package is a
# separate installation that should not be mixed with it.
from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    """Recall: rounded true positives over rounded actual positives (plus epsilon)."""
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: rounded true positives over rounded predicted positives (plus epsilon)."""
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: twice precision*recall over (precision + recall + epsilon)."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Earlier, smaller version of createModel (400-150-softmax, no batch norm),
# disabled by keeping it in a string literal; it defines nothing.
'''def createModel():
    model = keras.models.Sequential()
    model.add(keras.layers.Dense(400,input_dim=bertOutDim,activation="relu"))
    model.add(keras.layers.Dense(150,activation="relu"))
    #model.add(keras.layers.Dense(256,activation="relu"))
    model.add(keras.layers.Dense(outDim,activation="softmax"))
    model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=['accuracy'])
    return model'''

In [None]:
def createModel():
    """Build and compile the classifier head that sits on the BERT embeddings.

    Architecture: four ReLU Dense layers (500, 256, 128, 64 units), each
    followed by batch normalization, then a softmax layer with ``outDim``
    units. The first layer takes ``bertOutDim`` input features.

    Returns:
        A Sequential model compiled with categorical cross-entropy, Adam
        (learning rate 0.0001) and the metrics ``'accuracy'`` and ``f1_m``.
    """
    hidden_units = (500, 256, 128, 64)

    net = keras.models.Sequential()
    for position, units in enumerate(hidden_units):
        if position == 0:
            # Only the first layer declares the input width.
            dense = keras.layers.Dense(units,input_dim=bertOutDim,activation="relu")
        else:
            dense = keras.layers.Dense(units,activation="relu")
        net.add(dense)
        net.add(keras.layers.BatchNormalization())

    net.add(keras.layers.Dense(outDim,activation="softmax"))

    net.compile(
        loss="categorical_crossentropy",
        optimizer=keras.optimizers.Adam(learning_rate=0.0001),
        metrics=['accuracy',f1_m],
    )
    return net

In [None]:
def getGroundTruth(id):
    """Return the one-hot vector of length ``outDim`` with a 1 at index ``id``.

    Args:
        id: integer class index, ``0 <= id < outDim``.

    Returns:
        numpy.ndarray: 1-D integer array of length ``outDim``.

    Raises:
        ValueError: if ``id`` is outside ``[0, outDim)``. Without this check
            the list arithmetic below would silently build a vector of the
            wrong length.
    """
    if not 0 <= id < outDim:
        raise ValueError("class id %r is outside [0, %d)" % (id, outDim))
    gt = [0]*(id) + [1] + [0]*(outDim-id-1)
    return np.asarray(gt)

In [None]:
def getGTMat(labels):
    """Stack the one-hot vectors of ``labels`` into a matrix.

    Args:
        labels: 1-D array of integer class ids (anything with ``.shape`` and
            integer indexing).

    Returns:
        numpy.ndarray: one row per label as produced by getGroundTruth, or an
        empty 1-D array when ``labels`` is empty.
    """
    # Collect the rows first and convert once; concatenating inside the loop
    # re-copies the whole matrix for every label.
    rows = [getGroundTruth(labels[i]) for i in range(labels.shape[0])]
    return np.asarray(rows)

In [None]:
def getMax(modelOutInst):
    """Return the first index along axis 0 at which ``modelOutInst`` equals its maximum."""
    is_peak = modelOutInst == np.max(modelOutInst)
    first_axis_hits = np.nonzero(is_peak)[0]
    return first_axis_hits[0]

In [None]:
def getAllpd(modelOut):
    """Return an array holding getMax of every row of ``modelOut``."""
    row_count = modelOut.shape[0]
    return np.asarray([getMax(modelOut[row]) for row in range(row_count)])

# **Embedding Generation**

In [None]:
# Load the training table and dump its cleaned job descriptions to jdFile.
df = pd.read_csv(train_file, index_col=False)
saveJD(df,jdFile)

In [None]:
# Build the BERT encoder and tokenizer.
# NOTE: later cells rebind `model` (classifier head, random forest); the BERT
# model itself stays reachable as outModel['model'].
outModel = getBERTModel()
model = outModel['model']
tokenizer = outModel['tokenizer']

In [None]:
#nSentence = ["short cuts make long delays","short cuts make long delays"]

In [None]:
# Read the cleaned job descriptions back, one per line. The file is opened
# with the UTF-8 encoding saveJD wrote it with, and the context manager
# closes it even if reading fails. Lines are split on '\n' only, so each
# line written by saveJD maps to exactly one entry.
with open(jdFile, 'r', encoding="utf-8") as f:
    nSentence = [line.rstrip('\n') for line in f]

In [None]:
# Embed every training description with BERT (one predict call per sentence, so this is slow).
trainX = getTrainingData(model,tokenizer,nSentence)

In [None]:
# Sanity check: one row per job description.
print(trainX.shape)

In [None]:
# Cache the embeddings on disk. If trainX.csv is missing (fresh run) it is
# written from the embeddings computed above; an existing file is reused
# as-is. Either way trainX ends up being the values stored in the file.
if not os.path.exists('trainX.csv'):
    np.savetxt('trainX.csv',trainX, delimiter=',',fmt='%8.4f')
trainX = np.loadtxt('trainX.csv',delimiter=',')

# **Dataset Generation**

In [None]:
# Fit the category label encoder and persist it for the prediction cells.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.category)
# The context manager closes (and flushes) the pickle file deterministically.
with open('/content/catlabelFit.sav', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

In [None]:
# Encode the categories as integers, expand them to one-hot rows and store
# [embedding columns | one-hot label columns] per sample in catTrain.csv.
cat = le.transform(df.category)
print(cat.shape)
cat = getGTMat(cat)
print(cat.shape)
print(trainX.shape)
saveData = np.concatenate((trainX,cat),axis=1)
print(saveData.shape)
np.savetxt('/content/catTrain.csv',saveData, delimiter=',',fmt='%8.4f')

In [None]:
# Refit the same encoder object on job_type (the category encoder has
# already been pickled) and persist it under its own file name.
le.fit(df.job_type)
# The context manager closes (and flushes) the pickle file deterministically.
with open('/content/typlabelFit.sav', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

In [None]:
# Integer-encode job_type (fit_transform refits `le` on the same column) and
# store [embedding columns | integer label] per sample in typTrain.csv.
typ = le.fit_transform(df.job_type)
print(typ.shape)
saveData = np.concatenate((trainX,np.asarray([typ]).T),axis=1)  # label becomes the last column
print(saveData.shape)
np.savetxt('/content/typTrain.csv',saveData, delimiter=',',fmt='%8.4f')

# **Training**

In [None]:
# Reload the category dataset: the first bertOutDim columns are the
# embedding features, the remaining columns are the one-hot label.
loadData = np.loadtxt('/content/catTrain.csv',delimiter=',')
labels = loadData[:,bertOutDim:]
dataset = loadData[:,:bertOutDim]

In [None]:
# Sanity check of the feature / label split.
print(dataset.shape)
print(labels.shape)

In [None]:
# 80/20 split with a fixed seed; the one-hot label rows are passed as the
# stratification target.
from sklearn.model_selection import train_test_split
trainX,testX, trainY, testY = train_test_split(dataset, labels,train_size = 0.8, test_size=0.2, random_state=4,stratify = labels)

In [None]:
# NOTE(review): this overrides the training part of the split above, so the
# network is fitted on the full dataset, including the rows kept in
# testX/testY. The evaluate() cell below therefore scores data that was seen
# during training. Presumably intentional for the final submission run; confirm.
trainX = dataset
trainY = labels

In [None]:
# Disabled (string literal): builds the classifier head and prints its summary.
'''model = createModel()
model.summary()'''

In [None]:
# Train the category classifier, resuming from the previous checkpoint when
# one exists, and checkpoint the weights after every epoch.
epochs = 500
model = createModel()

checkpoint_path = '/content/ckpt/model.ckpt'
#checkpoint_path = '/content/drive/My Drive/temp/model.ckpt'
# A TensorFlow-format weights checkpoint is stored as "<prefix>.index" plus
# data shards; only resume when it is present so that a fresh environment
# trains from scratch instead of failing in load_weights.
if os.path.exists(checkpoint_path + '.index'):
    model.load_weights(checkpoint_path)
else:
    print('No checkpoint at', checkpoint_path, '- training from scratch')
# From here on checkpoint_path is the SAVE location; the prediction cells
# load the trained weights from this path.
checkpoint_path = '/content/drive/My Drive/temp/model.ckpt'
#checkpoint_path = '/content/ckpt/model.ckpt'

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,save_weights_only=True,verbose=1)
history_callback = model.fit(trainX,trainY,epochs=epochs,callbacks=[cp_callback])
loss_history = history_callback.history["loss"]
np.savetxt("loss_history.txt", np.array(loss_history), delimiter=",",fmt='%8.4f')

In [None]:
# Score the network on testX/testY. Because the model is compiled with two
# metrics, `acc` holds the loss followed by accuracy and f1_m, not a single
# accuracy value.
acc = model.evaluate(testX, testY)
print(acc)

In [None]:
# Reload the job_type dataset: the last column is the integer label, all
# other columns are embedding features.
loadData = np.loadtxt('/content/typTrain.csv',delimiter=',')
labels = loadData[:,-1]
dataset = loadData[:,:-1]

In [None]:
# Sanity check of the feature / label split.
print(dataset.shape)
print(labels.shape)

In [None]:
# 80/20 stratified split with a fixed seed; train_test_split is imported in the category-split cell above.
trainX,testX, trainY, testY = train_test_split(dataset, labels,train_size = 0.8, test_size=0.2, random_state=42,stratify = labels)

In [None]:
# Hyper-parameters for the job_type classifier. The svm_* values are only
# referenced by the commented-out SVC lines in the next cell.
svm_c = 0.01
svm_gamma = 10
svm_kernal = 'linear'

estimators = 100  # number of trees for the RandomForestClassifier

In [None]:
#model = svm.SVC(C=10,kernel='rbf',gamma=0.01)
#model = svm.SVC(C=svm_c,kernel=svm_kernal,gamma=svm_gamma,class_weight='balanced')
#model = svm.SVC(C=svm_c,kernel=svm_kernal,class_weight='balanced')

# Random forest for job_type. A fixed random_state (same seed as the split
# above) makes the fitted forest and its score reproducible across runs.
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=estimators, random_state=42)

In [None]:
# Fit the job_type classifier and persist it for the prediction cells.
print("training started...")
model.fit(trainX,trainY)
# The context manager closes (and flushes) the pickle file deterministically.
with open('/content/typModel.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Score of the job_type classifier on the held-out 20% split.
acc = model.score(testX,testY)
print(acc)

# **Submission Predictor**

In [None]:
# Load the test table (this rebinds `df`) and dump its cleaned job
# descriptions to testJd.
df = pd.read_csv(test_file, index_col=False)
#df.job_no
testJd = '/content/testJd.txt'
saveJD(df,testJd)

In [None]:
# Read the cleaned test descriptions back, one per line. The file is opened
# with the UTF-8 encoding saveJD wrote it with, and the context manager
# closes it even if reading fails. Lines are split on '\n' only, so each
# line written by saveJD maps to exactly one entry.
with open(testJd, 'r', encoding="utf-8") as f:
    nSentence = [line.rstrip('\n') for line in f]

In [None]:
# `model` has been rebound to the job_type classifier by the training cells,
# so the BERT encoder is taken from outModel, where it was originally stored.
testX = getTrainingData(outModel['model'],tokenizer,nSentence)

In [None]:
# Sanity check: one row per test job description.
print(testX.shape)

In [None]:
# Cache the test embeddings on disk. If testX.csv is missing (fresh run) it
# is written from the embeddings computed above; an existing file is reused
# as-is. Either way testX ends up being the values stored in the file.
if not os.path.exists('testX.csv'):
    np.savetxt('testX.csv',testX, delimiter=',',fmt='%8.4f')
testX = np.loadtxt('testX.csv',delimiter=',')

In [None]:
# Rebuild the classifier head, restore the weights saved by the training
# cell (checkpoint_path still holds that save location) and reload the
# category label encoder.
model = createModel()
model.load_weights(checkpoint_path)
# The context manager closes the pickle file as soon as it has been read.
with open('/content/catlabelFit.sav', 'rb') as encoder_file:
    le = pickle.load(encoder_file)  # le = preprocessing.LabelEncoder()

In [None]:
# Predict the softmax scores, reduce each row to the index of its maximum
# and map the indices back to the original category values.
catPredTemp = model.predict(testX)
catPredTemp = getAllpd(catPredTemp)
#print(catPredTemp[0])
catPred = le.inverse_transform(catPredTemp)

In [None]:
# Reload the job_type classifier and its label encoder. The context managers
# close the pickle files as soon as they have been read.
with open('/content/typModel.sav', 'rb') as model_file:
    model = pickle.load(model_file)
with open('/content/typlabelFit.sav', 'rb') as encoder_file:
    le = pickle.load(encoder_file)

In [None]:
# Predict the job_type class per row and map it back to the original values.
# The cast to int is presumably needed because the classifier was fitted on
# labels read back from typTrain.csv by np.loadtxt; confirm.
typPredTemp = model.predict(testX).astype(int)
typPred = le.inverse_transform(typPredTemp)

In [None]:
# Disabled (string literal): earlier column order with category before job_type.
'''dict = {'job_no':df.job_no, 
        'category': catPred, 
        'job_type': typPred}'''

In [None]:
# Columns of the submission file, in output order.
# NOTE(review): the name `dict` shadows the builtin for the rest of the
# session; it is kept because the next cell reads it.
dict = {'job_no':df.job_no,  
        'job_type': typPred,
        'category': catPred}

In [None]:
# Assemble the submission table and write it without the index column.
dfSub = pd.DataFrame(dict)
dfSub.to_csv('/content/submission.csv',index=False)

In [None]:
# Preview the first rows of the submission.
dfSub.head()