In [5]:
'''!pip install tensorflow==2.0
!pip install tensorflow_hub
!pip install bert-for-tf2
!pip install sentencepiece'''

'!pip install tensorflow==2.0\n!pip install tensorflow_hub\n!pip install bert-for-tf2\n!pip install sentencepiece'

In [6]:
import tensorflow as tf
import tensorflow_hub as hub
print("TF version: ", tf.__version__)
print("Hub version: ", hub.__version__)

TF version:  2.0.0
Hub version:  0.8.0


In [0]:
import bert
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math
import numpy as np
import pandas as pd
import pickle

In [0]:
train_file = "/content/trainNew.csv"
test_file = '/content/test.csv'
jdFile = '/content/trainJD.txt'
max_seq_length = 512

In [0]:
'''import zipfile
import os
for file_name in os.listdir('/content/'):
  if file_name.endswith('.zip'):
    with zipfile.ZipFile(file_name,'r') as zip_dir:
      zip_dir.extractall(path='/content/')'''

In [0]:
'''import zipfile
import os
for file_name in os.listdir('/content/drive/My Drive/temp/'):
  if file_name.endswith('.zip'):
    with zipfile.ZipFile('/content/drive/My Drive/temp/'+file_name,'r') as zip_dir:
      zip_dir.extractall(path='/content/')'''

In [0]:
def getBERTModel():
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,name="segment_ids")

    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",trainable=False)
    #bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1",trainable=False)
    
    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

    vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
    do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
    FullTokenizer = bert.bert_tokenization.FullTokenizer
    tokenizer = FullTokenizer(vocab_file, do_lower_case)

    return {'model':model,'tokenizer':tokenizer}

In [0]:
def get_masks(tokens, max_seq_length):
    #print('len(tokens),max_seq_length)
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))


def get_segments(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "[SEP]":
            current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))


def get_ids(tokens, tokenizer, max_seq_length):
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

In [0]:
def getEmbeddings(model,tokenizer,sentence): 
    stokens = tokenizer.tokenize(sentence)

    if len(stokens) > (max_seq_length - 2):
      stokens = stokens[:max_seq_length-2]

    stokens = ["[CLS]"] + stokens + ["[SEP]"]

    input_ids = get_ids(stokens, tokenizer, max_seq_length)
    #print(len(input_ids))
    input_masks = get_masks(stokens, max_seq_length)
    input_segments = get_segments(stokens, max_seq_length)

    '''print(input_masks)
    print(input_segments)'''

    pool_embs, all_embs = model.predict([[input_ids],[input_masks],[input_segments]])
    '''print('see')
    print(all_embs.shape)
    print(pool_embs.shape)'''
    # pool_ebmbs is an embeding of CLS token
    # all_embs contains embeding for words of input sentence.
    return pool_embs

In [0]:
def getTrainingData(model,tokenizer,nSentences):
    trainX = np.asarray([])
    #trainY = np.asarray([])
    n = len(nSentences)
    for i in range(n): 
        if i%500 == 0:
          print( 'Processing ',i,' out of ',n)

        senLen = len(nSentences[i].split())
        embs = getEmbeddings(model,tokenizer,nSentences[i])
        '''print(embs.shape)
        x = embs'''
        if trainX.shape[0] == 0:
            trainX = embs
            #trainY = y
        else:
            trainX = np.concatenate((trainX, embs), axis=0)
            #trainY = np.concatenate((trainY, y), axis=0)
    return trainX

In [0]:
def saveJD(df,fileName):
    JDs = list(df.job_description)
    print(len(JDs))
    with open(fileName, 'w',encoding="utf-8") as f:
        for strEle in JDs:
            ele = strEle.replace("\r", "")
            ele =  ele.replace("\n", "")
            ele = ele.strip().lower()
            f.write(ele)
            f.write('\n')

# **Embedding Generation**

In [16]:
df = pd.read_csv(train_file, index_col=False)
saveJD(df,jdFile)

44325


In [0]:
outModel = getBERTModel()
model = outModel['model']
tokenizer = outModel['tokenizer']

In [0]:
#nSentence = ["short cuts make long delays","short cuts make long delays"]

In [0]:
f = open(jdFile, 'r')
nSentence = f.read().splitlines()
f.close()

In [0]:
trainX = getTrainingData(model,tokenizer,nSentence)

In [0]:
print(trainX.shape)

(47532, 768)


In [0]:
np.savetxt('trainX.csv',trainX, delimiter=',',fmt='%8.4f')
#trainX = np.loadtxt('trainX.csv',delimiter=',')

# **Dataset Generation**

In [0]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df.category)
pickle.dump(le, open('/content/catlabelFit.sav', 'wb'))

In [19]:
cat = le.transform(df.category)
print(cat.shape)
saveData = np.concatenate((trainX,np.asarray([cat]).T),axis=1)
print(saveData.shape)
np.savetxt('/content/catTrain.csv',saveData, delimiter=',',fmt='%8.4f')

(44325,)
(44325, 769)


In [0]:
le.fit(df.job_type)
pickle.dump(le, open('/content/typlabelFit.sav', 'wb'))

In [21]:
typ = le.fit_transform(df.job_type)
print(typ.shape)
saveData = np.concatenate((trainX,np.asarray([typ]).T),axis=1)
print(saveData.shape)
np.savetxt('/content/typTrain.csv',saveData, delimiter=',',fmt='%8.4f')

(44325,)
(44325, 769)


# **Training**

In [0]:
loadData = np.loadtxt('/content/catTrain.csv',delimiter=',')
labels = loadData[:,-1]
dataset = loadData[:,:-1]

In [23]:
print(dataset.shape)
print(labels.shape)

(44325, 768)
(44325,)


In [0]:
from sklearn.model_selection import train_test_split
trainX,testX, trainY, testY = train_test_split(dataset, labels,train_size = 0.8, test_size=0.2, random_state=42,stratify = labels)

In [0]:
trainX = dataset
trainY = labels

In [0]:
svm_c = 0.001
svm_gamma = 1
svm_kernal = 'linear'

estimators = 1000

In [0]:
'''from sklearn import svm
#model = svm.SVC(C=10,kernel='rbf',gamma=0.01)
#model = svm.SVC(C=svm_c,kernel=svm_kernal,gamma=svm_gamma)
model = svm.SVC(C=svm_c,kernel=svm_kernal)'''

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=estimators)

In [28]:
print("training started...")
model.fit(trainX,trainY)
pickle.dump(model, open('/content/catModel.sav', 'wb'))

training started...


In [31]:
acc = model.score(testX,testY)
print(acc)

0.8544839255499154


In [0]:
loadData = np.loadtxt('/content/typTrain.csv',delimiter=',')
labels = loadData[:,-1]
dataset = loadData[:,:-1]

In [0]:
print(dataset.shape)
print(labels.shape)

(47532, 768)
(47532,)


In [0]:
trainX,testX, trainY, testY = train_test_split(dataset, labels,train_size = 0.8, test_size=0.2, random_state=42,stratify = labels)

In [0]:
svm_c = 0.01
svm_gamma = 10
svm_kernal = 'linear'

estimators = 1000

In [0]:
#model = svm.SVC(C=10,kernel='rbf',gamma=0.01)
#model = svm.SVC(C=svm_c,kernel=svm_kernal,gamma=svm_gamma,class_weight='balanced')
#model = svm.SVC(C=svm_c,kernel=svm_kernal,class_weight='balanced')

from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=estimators)

In [0]:
print("training started...")
model.fit(trainX,trainY)
pickle.dump(model, open('/content/typModel.sav', 'wb'))

training started...


In [0]:
model = pickle.load(open('/content/typModel.sav', 'rb'))

In [0]:
acc = model.score(testX,testY)
print(acc)

0.9951614599768591


# **Submission** **Predictor**

In [35]:
df = pd.read_csv(test_file, index_col=False)
#df.job_no
testJd = '/content/testJd.txt'
saveJD(df,testJd)

11771


In [0]:
f = open(testJd, 'r')
nSentence = f.read().splitlines()
f.close()

In [0]:
testX = getTrainingData(model,tokenizer,nSentence)

In [0]:
print(testX.shape)

(11771, 768)


In [0]:
np.savetxt('testX.csv',testX, delimiter=',',fmt='%8.4f')

#testX = np.loadtxt('testX.csv',delimiter=',')

In [0]:
model = pickle.load(open('/content/catModel.sav', 'rb'))
le = pickle.load(open('/content/catlabelFit.sav', 'rb'))  # le = preprocessing.LabelEncoder()

In [0]:
catPredTemp = model.predict(testX).astype(int)
catPred = le.inverse_transform(catPredTemp)

In [0]:
model = pickle.load(open('/content/typModel.sav', 'rb'))
le = pickle.load(open('/content/typlabelFit.sav', 'rb'))

In [0]:
typPredTemp = model.predict(testX).astype(int)
typPred = le.inverse_transform(typPredTemp)

In [0]:
'''dict = {'job_no':df.job_no, 
        'category': catPred, 
        'job_type': typPred}'''

"dict = {'job_no':df.job_no, \n        'category': catPred, \n        'job_type': typPred}"

In [0]:
dict = {'job_no':df.job_no,  
        'job_type': typPred,
        'category': catPred}

In [0]:
dfSub = pd.DataFrame(dict)
dfSub.to_csv('/content/submission.csv',index=False) 

In [43]:
dfSub.head()

Unnamed: 0,job_no,job_type,category
0,Id-1,Permanent,Data Management and Statistics
1,Id-2,Permanent,"Pharmaceutical, Healthcare and Medical Sales"
2,Id-4,Permanent,Data Management and Statistics
3,Id-6,Permanent,Pharmaceutical Marketing
4,Id-7,Permanent,Data Management and Statistics
