In [9]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


In [11]:
department = pd.read_csv("/home/tinkudhull/Downloads/spotmentordev-machine-learning-assessment-aeadf808e7ab/data/document_departments.csv")

In [12]:
import os, json

# this finds our json files
path_to_json = '/home/tinkudhull/Downloads/spotmentordev-machine-learning-assessment-aeadf808e7ab/data/docs/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

# here I define my pandas Dataframe with the columns I want to get from the json
jsons_data = pd.DataFrame(columns=['_id', 'jd_information'])

# we need both the json and an index number so use enumerate()
for index, js in enumerate(json_files):
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text = pd.read_json(json_file)
        #print(json_text.head())
        
        
        temp = json_text[['_id', 'jd_information']].loc[["description"]]
        jsons_data = jsons_data.append(temp)
        if (( index % 100 ) == 0):
            print(index)



0
100
200
300
400
500
600
700
800
900
1000
1100


In [13]:
jsons_data.head()

Unnamed: 0,_id,jd_information
description,8322449,TFG VACATIONS INDIA is a travel &amp; tourism ...
description,8158423,Dear Candidate&nbsp;Hope you are doing well&nb...
description,8336106,
description,8366386,
description,8008828,


In [14]:
department.columns = ['_id', 'Department']

In [15]:
jsons_data.reset_index(drop=True, inplace=True)

In [16]:
jsons_data['_id'] = jsons_data['_id'].astype(str).astype(int)

In [17]:
data = pd.merge(left = jsons_data, right=department, on = '_id', how = 'left')

In [18]:
data.replace(r'^\s*$', np.nan, regex=True, inplace = True)

In [19]:
data.dropna(inplace=True)

In [20]:
data.reset_index(drop=True, inplace=True)

In [21]:
data['ncount'] = data.groupby(['Department'])['Department'].transform('count')

In [22]:
data = data[data.ncount > 1]

In [23]:
data.drop(['ncount'], inplace=True, axis = 1)

In [25]:
train = data.copy()

In [26]:
train.shape

(740, 3)

In [27]:
train.head()

Unnamed: 0,_id,jd_information,Department
0,8322449,TFG VACATIONS INDIA is a travel &amp; tourism ...,Digital Marketing
1,8158423,Dear Candidate&nbsp;Hope you are doing well&nb...,Administration
2,7987903,Ready for travelling in Tricity Area with team...,Marketing
3,8029293,We have urgent opening for Tour Manager for a ...,Back office ticketing
4,7960945,Examine accounting records and prepare financi...,Finance


In [29]:
lbl_enc = preprocessing.LabelEncoder()
y = lbl_enc.fit_transform(train.Department.values)

In [31]:
xtrain, xvalid, ytrain, yvalid = train_test_split(train.jd_information.values, y, 
                                                  stratify=y, 
                                                  random_state=12, 
                                                  test_size=0.2, shuffle=True)

In [32]:
def multiclass_logloss(actual, predicted, eps=1e-15):
    # Convert 'actual' to a binary array if it's not already:
    if len(actual.shape) == 1:
        actual2 = np.zeros((actual.shape[0], predicted.shape[1]))
        for i, val in enumerate(actual):
            actual2[i, val] = 1
        actual = actual2

    clip = np.clip(predicted, eps, 1 - eps)
    rows = actual.shape[0]
    vsota = np.sum(actual * np.log(clip))
    return -1.0 / rows * vsota

In [33]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)


In [34]:
# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.532 


In [35]:
ctv = CountVectorizer(analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), stop_words = 'english')

# Fitting Count Vectorizer to both training and test sets (semi-supervised learning)
ctv.fit(list(xtrain) + list(xvalid))
xtrain_ctv =  ctv.transform(xtrain) 
xvalid_ctv = ctv.transform(xvalid)

In [36]:
# Fitting a simple Logistic Regression on Counts
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.322 


In [37]:
# Fitting a simple Naive Bayes on TFIDF
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

print ("logloss: %0.3f " % multiclass_logloss(yvalid, predictions))

logloss: 1.516 


In [49]:
embeddings_index = {}
f = open('glove.42B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))


1917495it [04:17, 7444.27it/s]

Found 1917495 word vectors.





In [55]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s):
    words = str(s).lower()#.decode('utf-8')
    words = word_tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [56]:
xtrain = str(xtrain)
xvalid = str(xvalid)

In [58]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x) for x in tqdm(xtrain)]
xvalid_glove = [sent2vec(x) for x in tqdm(xvalid)]




  0%|          | 0/424221 [00:00<?, ?it/s][A[A[A


  0%|          | 442/424221 [00:00<01:43, 4094.64it/s][A[A[A


  0%|          | 847/424221 [00:00<01:44, 4069.50it/s][A[A[A


  0%|          | 1176/424221 [00:00<01:50, 3816.70it/s][A[A[A


  0%|          | 1649/424221 [00:00<01:46, 3975.86it/s][A[A[A


  1%|          | 2172/424221 [00:00<01:39, 4226.95it/s][A[A[A


  1%|          | 2646/424221 [00:00<01:38, 4269.92it/s][A[A[A


  1%|          | 3137/424221 [00:00<01:37, 4337.66it/s][A[A[A


  1%|          | 3555/424221 [00:00<01:39, 4223.77it/s][A[A[A


  1%|          | 4017/424221 [00:00<01:38, 4264.71it/s][A[A[A


  1%|          | 4435/424221 [00:01<01:40, 4181.34it/s][A[A[A


  1%|          | 5021/424221 [00:01<01:36, 4325.37it/s][A[A[A


  1%|▏         | 5585/424221 [00:01<01:34, 4427.52it/s][A[A[A


  1%|▏         | 6072/424221 [00:01<01:34, 4423.93it/s][A[A[A


  2%|▏         | 6624/424221 [00:01<01:32, 4500.10it/s][A[A[A


  2%|▏ 

 16%|█▌        | 66541/424221 [00:13<01:10, 5064.92it/s][A[A[A


 16%|█▌        | 67089/424221 [00:13<01:10, 5062.21it/s][A[A[A


 16%|█▌        | 67618/424221 [00:13<01:10, 5063.02it/s][A[A[A


 16%|█▌        | 68142/424221 [00:13<01:10, 5063.69it/s][A[A[A


 16%|█▌        | 68662/424221 [00:13<01:10, 5062.82it/s][A[A[A


 16%|█▋        | 69177/424221 [00:13<01:10, 5063.61it/s][A[A[A


 16%|█▋        | 69818/424221 [00:13<01:09, 5074.70it/s][A[A[A


 17%|█▋        | 70445/424221 [00:13<01:09, 5083.30it/s][A[A[A


 17%|█▋        | 71084/424221 [00:13<01:09, 5092.67it/s][A[A[A


 17%|█▋        | 71678/424221 [00:14<01:09, 5096.97it/s][A[A[A


 17%|█▋        | 72280/424221 [00:14<01:08, 5103.56it/s][A[A[A


 17%|█▋        | 72871/424221 [00:14<01:08, 5106.52it/s][A[A[A


 17%|█▋        | 73450/424221 [00:14<01:08, 5110.00it/s][A[A[A


 17%|█▋        | 74023/424221 [00:14<01:08, 5108.59it/s][A[A[A


 18%|█▊        | 74581/424221 [00:14<01:08, 5112

 32%|███▏      | 135765/424221 [00:26<00:55, 5182.06it/s][A[A[A


 32%|███▏      | 136283/424221 [00:26<00:55, 5176.46it/s][A[A[A


 32%|███▏      | 136765/424221 [00:26<00:55, 5172.36it/s][A[A[A


 32%|███▏      | 137402/424221 [00:26<00:55, 5177.26it/s][A[A[A


 33%|███▎      | 138020/424221 [00:26<00:55, 5181.50it/s][A[A[A


 33%|███▎      | 138667/424221 [00:26<00:55, 5185.63it/s][A[A[A


 33%|███▎      | 139239/424221 [00:26<00:55, 5181.43it/s][A[A[A


 33%|███▎      | 139801/424221 [00:26<00:54, 5183.44it/s][A[A[A


 33%|███▎      | 140342/424221 [00:27<00:54, 5184.19it/s][A[A[A


 33%|███▎      | 140946/424221 [00:27<00:54, 5187.19it/s][A[A[A


 33%|███▎      | 141673/424221 [00:27<00:54, 5194.76it/s][A[A[A


 34%|███▎      | 142453/424221 [00:27<00:54, 5204.35it/s][A[A[A


 34%|███▍      | 143245/424221 [00:27<00:53, 5214.32it/s][A[A[A


 34%|███▍      | 144039/424221 [00:27<00:53, 5224.19it/s][A[A[A


 34%|███▍      | 144929/424221 [00

 49%|████▉     | 209523/424221 [00:39<00:40, 5350.43it/s][A[A[A


 50%|████▉     | 210136/424221 [00:39<00:39, 5352.33it/s][A[A[A


 50%|████▉     | 210777/424221 [00:39<00:39, 5355.10it/s][A[A[A


 50%|████▉     | 211367/424221 [00:39<00:39, 5354.88it/s][A[A[A


 50%|████▉     | 211939/424221 [00:39<00:39, 5354.17it/s][A[A[A


 50%|█████     | 212494/424221 [00:39<00:39, 5353.48it/s][A[A[A


 50%|█████     | 213035/424221 [00:39<00:39, 5353.12it/s][A[A[A


 50%|█████     | 213570/424221 [00:39<00:39, 5352.94it/s][A[A[A


 50%|█████     | 214109/424221 [00:39<00:39, 5353.01it/s][A[A[A


 51%|█████     | 214659/424221 [00:40<00:39, 5353.37it/s][A[A[A


 51%|█████     | 215198/424221 [00:40<00:39, 5352.39it/s][A[A[A


 51%|█████     | 215863/424221 [00:40<00:38, 5355.73it/s][A[A[A


 51%|█████     | 216433/424221 [00:40<00:38, 5356.51it/s][A[A[A


 51%|█████     | 217087/424221 [00:40<00:38, 5359.39it/s][A[A[A


 51%|█████▏    | 217681/424221 [00

 65%|██████▍   | 275443/424221 [00:52<00:28, 5257.76it/s][A[A[A


 65%|██████▌   | 276284/424221 [00:52<00:28, 5263.19it/s][A[A[A


 65%|██████▌   | 277007/424221 [00:52<00:27, 5267.07it/s][A[A[A


 65%|██████▌   | 277722/424221 [00:52<00:27, 5253.94it/s][A[A[A


 66%|██████▌   | 278303/424221 [00:52<00:27, 5251.06it/s][A[A[A


 66%|██████▌   | 278835/424221 [00:53<00:27, 5246.64it/s][A[A[A


 66%|██████▌   | 279338/424221 [00:53<00:27, 5246.55it/s][A[A[A


 66%|██████▌   | 279831/424221 [00:53<00:27, 5241.05it/s][A[A[A


 66%|██████▌   | 280267/424221 [00:53<00:27, 5232.76it/s][A[A[A


 66%|██████▌   | 280684/424221 [00:53<00:27, 5230.94it/s][A[A[A


 66%|██████▋   | 281076/424221 [00:53<00:27, 5226.19it/s][A[A[A


 66%|██████▋   | 281494/424221 [00:53<00:27, 5224.27it/s][A[A[A


 66%|██████▋   | 281876/424221 [00:53<00:27, 5221.14it/s][A[A[A


 67%|██████▋   | 282288/424221 [00:54<00:27, 5219.16it/s][A[A[A


 67%|██████▋   | 282675/424221 [00

 80%|███████▉  | 338258/424221 [01:05<00:16, 5142.31it/s][A[A[A


 80%|███████▉  | 338833/424221 [01:05<00:16, 5143.26it/s][A[A[A


 80%|████████  | 339420/424221 [01:05<00:16, 5144.33it/s][A[A[A


 80%|████████  | 339987/424221 [01:06<00:16, 5145.00it/s][A[A[A


 80%|████████  | 340591/424221 [01:06<00:16, 5146.32it/s][A[A[A


 80%|████████  | 341288/424221 [01:06<00:16, 5149.00it/s][A[A[A


 81%|████████  | 341900/424221 [01:06<00:15, 5149.45it/s][A[A[A


 81%|████████  | 342568/424221 [01:06<00:15, 5151.74it/s][A[A[A


 81%|████████  | 343306/424221 [01:06<00:15, 5155.08it/s][A[A[A


 81%|████████  | 344111/424221 [01:06<00:15, 5159.42it/s][A[A[A


 81%|████████▏ | 344809/424221 [01:06<00:15, 5161.99it/s][A[A[A


 81%|████████▏ | 345507/424221 [01:06<00:15, 5164.27it/s][A[A[A


 82%|████████▏ | 346265/424221 [01:07<00:15, 5167.71it/s][A[A[A


 82%|████████▏ | 347106/424221 [01:07<00:14, 5172.70it/s][A[A[A


 82%|████████▏ | 347852/424221 [01

 96%|█████████▌| 405232/424221 [01:18<00:03, 5146.22it/s][A[A[A


 96%|█████████▌| 405705/424221 [01:18<00:03, 5145.30it/s][A[A[A


 96%|█████████▌| 406257/424221 [01:18<00:03, 5145.80it/s][A[A[A


 96%|█████████▌| 407109/424221 [01:19<00:03, 5150.23it/s][A[A[A


 96%|█████████▌| 407709/424221 [01:19<00:03, 5150.07it/s][A[A[A


 96%|█████████▌| 408284/424221 [01:19<00:03, 5147.36it/s][A[A[A


 96%|█████████▋| 408800/424221 [01:19<00:02, 5145.75it/s][A[A[A


 96%|█████████▋| 409361/424221 [01:19<00:02, 5146.27it/s][A[A[A


 97%|█████████▋| 409881/424221 [01:19<00:02, 5146.20it/s][A[A[A


 97%|█████████▋| 410392/424221 [01:19<00:02, 5145.95it/s][A[A[A


 97%|█████████▋| 411232/424221 [01:19<00:02, 5150.14it/s][A[A[A


 97%|█████████▋| 411928/424221 [01:19<00:02, 5152.40it/s][A[A[A


 97%|█████████▋| 412753/424221 [01:20<00:02, 5156.29it/s][A[A[A


 97%|█████████▋| 413472/424221 [01:20<00:02, 5158.80it/s][A[A[A


 98%|█████████▊| 414264/424221 [01

 47%|████▋     | 53416/112596 [00:10<00:11, 5036.59it/s][A[A[A


 48%|████▊     | 54141/112596 [00:10<00:11, 5056.54it/s][A[A[A


 49%|████▊     | 54757/112596 [00:10<00:11, 5049.10it/s][A[A[A


 49%|████▉     | 55467/112596 [00:10<00:11, 5067.78it/s][A[A[A


 50%|████▉     | 56074/112596 [00:11<00:11, 5074.52it/s][A[A[A


 50%|█████     | 56676/112596 [00:11<00:11, 5077.73it/s][A[A[A


 51%|█████     | 57268/112596 [00:11<00:10, 5078.05it/s][A[A[A


 51%|█████▏    | 57829/112596 [00:11<00:10, 5082.12it/s][A[A[A


 52%|█████▏    | 58388/112596 [00:11<00:10, 5077.68it/s][A[A[A


 52%|█████▏    | 58920/112596 [00:11<00:10, 5075.13it/s][A[A[A


 53%|█████▎    | 59462/112596 [00:11<00:10, 5069.45it/s][A[A[A


 53%|█████▎    | 60068/112596 [00:11<00:10, 5077.65it/s][A[A[A


 54%|█████▍    | 60775/112596 [00:11<00:10, 5093.64it/s][A[A[A


 55%|█████▍    | 61435/112596 [00:12<00:10, 5105.85it/s][A[A[A


 55%|█████▌    | 62039/112596 [00:12<00:09, 5109

In [59]:
xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

In [60]:
# scale the data before any neural net:
scl = preprocessing.StandardScaler()
xtrain_glove_scl = scl.fit_transform(xtrain_glove)
xvalid_glove_scl = scl.transform(xvalid_glove)


In [61]:
# we need to binarize the labels for the neural net
ytrain_enc = np_utils.to_categorical(ytrain)
yvalid_enc = np_utils.to_categorical(yvalid)

In [62]:
# create a simple 3 layer sequential neural net
model = Sequential()

model.add(Dense(300, input_dim=300, activation='relu'))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(300, activation='relu'))
model.add(Dropout(0.3))
model.add(BatchNormalization())

model.add(Dense(3))
model.add(Activation('softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
model.fit(xtrain_glove_scl, y=ytrain_enc, batch_size=64, 
          epochs=5, verbose=1, 
          validation_data=(xvalid_glove_scl, yvalid_enc))