In [2]:
import numpy as np
import pandas as pd 
from tensorflow.keras.layers import Dropout, Dense, GRU, Embedding
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn import metrics
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Concatenate
from sklearn.model_selection import train_test_split

In [3]:
def loadData_Tokenizer(X_train, X_valid,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=60):
    np.random.seed(1)
    text = np.concatenate((X_train, X_valid), axis=0)
    text = np.array(text)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_train = text[0:len(X_train), ]
    X_valid = text[len(X_train):, ]
    embeddings_index = {}
    f = open("C:\\Users\\yoges\\MachineLearningProject\\Glove\\glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_train, X_valid, word_index,embeddings_index)

In [4]:
def loadData_Tokenizer_test(X_test,MAX_NB_WORDS=75000,MAX_SEQUENCE_LENGTH=60):
    np.random.seed(1)
    text = np.array(X_test)
    tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    word_index = tokenizer.word_index
    text = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Found %s unique tokens.' % len(word_index))
    indices = np.arange(text.shape[0])
    # np.random.shuffle(indices)
    text = text[indices]
    print(text.shape)
    X_test = text
    embeddings_index = {}
    f = open("C:\\Users\\yoges\\MachineLearningProject\\Glove\\glove.6B.50d.txt", encoding="utf8")
    for line in f:
        values = line.split()
        word = values[0]
        try:
            coefs = np.asarray(values[1:], dtype='float32')
        except:
            pass
        embeddings_index[word] = coefs
    f.close()
    print('Total %s word vectors.' % len(embeddings_index))
    return (X_test, word_index,embeddings_index)

In [5]:
def Build_Model_RNN_Text(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=60, EMBEDDING_DIM=50, dropout=0.45):
    """
    def buildModel_RNN(word_index, embeddings_index, nclasses,  MAX_SEQUENCE_LENGTH=500, EMBEDDING_DIM=50, dropout=0.5):
    word_index in word index ,
    embeddings_index is embeddings index, look at data_helper.py
    nClasses is number of classes,
    MAX_SEQUENCE_LENGTH is maximum lenght of text sequences
    """
    model = Sequential()
    hidden_layer = 3
    gru_node = 32
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            if len(embedding_matrix[i]) != len(embedding_vector):
                print("could not broadcast input array from shape", str(len(embedding_matrix[i])),
                      "into shape", str(len(embedding_vector)), " Please make sure your"
                                                                " EMBEDDING_DIM is equal to embedding_vector file ,GloVe,")
                exit(1)
            embedding_matrix[i] = embedding_vector
    model.add(Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True))
    print(gru_node)
    for i in range(0,hidden_layer):
        model.add(GRU(gru_node,return_sequences=True, recurrent_dropout=0.2))
        model.add(Dropout(dropout))
    model.add(GRU(gru_node, recurrent_dropout=0.2))
    model.add(Dropout(dropout))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(dropout))
    model.add(Dense(nclasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    return model

In [6]:
import json
labels = pd.read_csv("categories_string.csv")
train_label= pd.read_csv('train_label.csv')
with open('train.json') as f:
   train = json.load(f)
with open('test.json') as h:
   test = json.load(h)

train_df = pd.DataFrame(train)
test_df = pd.DataFrame(test)
dic = labels.to_dict()
dic = dic["0"]
train_label['jobtitle'] = train_label['Category']
train_label = train_label.replace({"jobtitle": dic})
comb_data = pd.merge(train_df,train_label,how = "outer", on = 'Id' )
comb_data

Unnamed: 0,Id,description,gender,Category,jobtitle
0,0,She is also a Ronald D. Asmus Policy Entrepre...,F,19,professor
1,1,He is a member of the AICPA and WICPA. Brent ...,M,9,accountant
2,2,Dr. Aster has held teaching and research posi...,M,19,professor
3,3,He runs a boutique design studio attending cl...,M,24,architect
4,4,"He focuses on cloud security, identity and ac...",M,24,architect
...,...,...,...,...,...
217192,217192,A member of the UWA Cultural Collections Boar...,M,19,professor
217193,217193,Kelly has worked globally leading teams of co...,F,22,psychologist
217194,217194,He's the lead author of a recent study that f...,M,19,professor
217195,217195,She specializes in the theoretical and pedago...,F,19,professor


In [7]:
df_mismatch = comb_data.groupby('description').nunique().sort_values('jobtitle')
df_mismatch = df_mismatch[df_mismatch['jobtitle']>1]
df_mismatch = df_mismatch.index.to_list()

In [8]:
df_mismatch

[' He is an in-network provider for Blue Cross Blue Shield EPO, Blue Cross Blue Shield Bronze, and Blue Cross Blue Shield HMO, as well as other insurance carriers.',
 ' More information on property rights is available at www.mackinac.org. Permission to reprint in whole or in part is hereby granted, provided the author and his affiliations are cited.)',
 ' She was in clinical practice in UAB’s Transfusion Medicine for 7 years prior to joining the School of Health Professions in 2010.Ms. Miller received her undergraduate Bachelor of Science degree in Biology from Judson College (2002) and her Master of Science degree in Clinical Laboratory Science from the University of Alabama at Birmingham (2004). She holds a current certification from American Society for Clinical Pathology (ASCP).',
 ' He obtained his MSc (Dalhousie University) and PhD in Pharmacy (University of British Columbia) and went on to medical school in Toronto. He subsequently trained as a general thoracic and vascular surg

In [9]:
filter_condition = comb_data['description'].isin(df_mismatch)
comb_data = comb_data[~filter_condition]

In [10]:
comb_data.shape

(217175, 5)

In [11]:
shuffled_data= comb_data.reindex(np.random.permutation(comb_data.index))
shuffled_data

Unnamed: 0,Id,description,gender,Category,jobtitle
4295,4295,Her stories have appeared in The Chronicle of...,F,6,journalist
173903,173903,His research interests are in statistical asp...,M,19,professor
132585,132585,"A newcomer to politics, Prince said she was i...",F,14,nurse
29929,29929,He received his PhD in Economics from the Uni...,M,19,professor
24611,24611,His research focuses on educational policy fo...,M,19,professor
...,...,...,...,...,...
168053,168053,She is interested in shaping international kn...,F,3,teacher
143255,143255,Whilst he’s literally putting his hometown on...,M,10,dj
148201,148201,Fleischer began her career in photography as ...,F,20,photographer
81599,81599,Her hospital/clinic affiliations include Nort...,F,11,physician


In [12]:
test_df

Unnamed: 0,Id,description,gender
3,0,She currently works on CNN’s newest primetime...,F
6,1,Lavalette’s photographs have been shown widel...,M
11,2,Along with his academic and professional deve...,M
17,3,She obtained her Ph.D. in Islamic Studies at ...,F
18,4,She studies issues of women and Islam and has...,F
...,...,...,...
271476,54295,"Prior to that, she worked as a Research Staff...",F
271477,54296,The group’s antics began when they switched t...,M
271482,54297,"Formerly, she was the Coordinator for Music E...",F
271485,54298,She started her law practice at Morris Mannin...,F


In [13]:
shuffled_test_data = test_df.reindex(np.random.permutation(test_df.index))
shuffled_test_data 

Unnamed: 0,Id,description,gender
225005,45147,She established a nonprofit human rights orga...,F
19524,3873,"Three years ago, she was approached by her pr...",F
224588,45065,Dr. Sumedh Dawane practices at Dr. Hitesh Swa...,M
129782,26008,His international experience includes various...,M
64394,12756,She graduated with honors in 1990. Having mor...,F
...,...,...,...
231596,46450,He received his M.Sc. degree in Computer Scie...,M
262507,52552,Whether he is producing a conventional painti...,M
11970,2357,He has resolved disputes and crafted deals fo...,M
220585,44294,"He deals mostly in small civil cases, but Jay...",M


In [14]:
X = shuffled_data['description']
y = shuffled_data['Category']

X_test = shuffled_test_data['description']

In [15]:
X_train, X_valid, y_train ,y_valid = train_test_split(X, y,stratify=y, test_size=0.20, random_state = 1)

In [16]:
print(X_train.shape)
print(y_train.shape)
print(X_valid.shape)
print(y_valid.shape)
print(X_test.shape)

(173740,)
(173740,)
(43435,)
(43435,)
(54300,)


In [17]:
X_train_Glove,X_valid_Glove, word_index,embeddings_index = loadData_Tokenizer(X_train,X_valid)

Found 272164 unique tokens.
(217175, 60)
Total 400000 word vectors.


In [18]:
X_test_Glove = loadData_Tokenizer_test(X_test)

Found 113608 unique tokens.
(54300, 60)
Total 400000 word vectors.


In [19]:
model_RNN = Build_Model_RNN_Text(word_index,embeddings_index, 28)
print(model_RNN.summary())

32
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 60, 50)            13608250  
_________________________________________________________________
gru (GRU)                    (None, 60, 32)            8064      
_________________________________________________________________
dropout (Dropout)            (None, 60, 32)            0         
_________________________________________________________________
gru_1 (GRU)                  (None, 60, 32)            6336      
_________________________________________________________________
dropout_1 (Dropout)          (None, 60, 32)            0         
_________________________________________________________________
gru_2 (GRU)                  (None, 60, 32)            6336      
_________________________________________________________________
dropout_2 (Dropout)          (None, 60, 32)          

In [20]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
checkpoint_path='C:\\Users\\yoges\\MachineLearningProject\\rnn_weights.h5'


keras_callbacks   = [
      EarlyStopping(monitor='val_loss', patience=4),
      ModelCheckpoint(checkpoint_path,save_weights_only=True ,save_best_only=True)
]

In [21]:
import tensorflow as tf

with tf.device('/GPU:0'):
    model_RNN.fit(X_train_Glove, y_train,
                              validation_data=(X_valid_Glove, y_valid),
                              epochs=10,
                              batch_size=8,verbose=2,callbacks=keras_callbacks)


Epoch 1/10
21718/21718 - 14954s - loss: 1.3239 - accuracy: 0.6204 - val_loss: 0.9086 - val_accuracy: 0.7430
Epoch 2/10
21718/21718 - 14740s - loss: 0.9635 - accuracy: 0.7309 - val_loss: 0.8756 - val_accuracy: 0.7591
Epoch 3/10
21718/21718 - 14684s - loss: 0.8507 - accuracy: 0.7639 - val_loss: 0.8490 - val_accuracy: 0.7697
Epoch 4/10
21718/21718 - 14645s - loss: 0.7640 - accuracy: 0.7876 - val_loss: 0.8850 - val_accuracy: 0.7590
Epoch 5/10
21718/21718 - 14818s - loss: 0.6907 - accuracy: 0.8089 - val_loss: 0.9371 - val_accuracy: 0.7635
Epoch 6/10
21718/21718 - 16588s - loss: 0.6309 - accuracy: 0.8261 - val_loss: 0.9699 - val_accuracy: 0.7590
Epoch 7/10
21718/21718 - 14705s - loss: 0.5781 - accuracy: 0.8417 - val_loss: 1.0864 - val_accuracy: 0.7538


In [22]:
predicted = model_RNN.predict(X_valid_Glove)


In [23]:
predicted = np.argmax(predicted, axis=1)

In [24]:

print(metrics.classification_report(y_valid, predicted))

              precision    recall  f1-score   support

           0       0.31      0.53      0.39       299
           1       0.80      0.66      0.72       823
           2       0.63      0.59      0.61       189
           3       0.48      0.58      0.53      1829
           4       0.50      0.57      0.53       161
           5       0.66      0.75      0.70       924
           6       0.64      0.66      0.65      2459
           7       0.52      0.38      0.44       172
           8       0.75      0.70      0.73      1323
           9       0.65      0.70      0.68       624
          10       0.55      0.58      0.56       166
          11       0.69      0.68      0.68      2321
          12       0.54      0.71      0.61       328
          13       0.63      0.61      0.62       812
          14       0.88      0.77      0.82      2524
          15       0.64      0.64      0.64       858
          16       0.92      0.91      0.92      1089
          17       0.61    

In [25]:
prediction = model_RNN.predict(X_test_Glove)

ValueError: Failed to find data adapter that can handle input: (<class 'tuple'> containing values of types {"<class 'numpy.ndarray'>", '(<class \'dict\'> containing {"<class \'str\'>"} keys and {"<class \'int\'>"} values)', '(<class \'dict\'> containing {"<class \'str\'>"} keys and {"<class \'numpy.ndarray\'>"} values)'}), <class 'NoneType'>

In [None]:
prediction = np.argmax(prediction, axis = 1)