In [1]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.feature_extraction import text
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import KFold
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF (TfidfVectorizer) -> mean vectors for each class -> cos distance (cosine_similarity) like similarity metric


In [2]:
df = pd.read_csv('/Users/vivla/Downloads/N.Rich_Testing_Assignment/occup_df.csv')
test_df = pd.read_csv('/Users/vivla/Downloads/N.Rich_Testing_Assignment/test_df.csv')
train_df = pd.read_csv('/Users/vivla/Downloads/N.Rich_Testing_Assignment/train_df.csv')

codes = sorted(train_df['Code'].unique())

Look at distribution and see disbalace of classes

In [3]:
train_df['Code'].value_counts()

29-1141.00    1186
15-1142.00    1082
15-1132.00    1080
15-1122.00    1051
15-1133.00     985
11-2021.00     778
13-1111.00     690
33-3021.06     590
11-1021.00     589
11-2022.00     477
15-1121.00     438
41-2031.00     434
11-3031.02     432
43-4051.00     371
15-1151.00     355
49-9071.00     314
13-2051.00     287
31-1014.00     283
53-3032.00     248
49-3023.02     232
15-1134.00     196
Name: Code, dtype: int64

In [4]:
# delete some word from ENGLISH_STOP_WORDS like 'own','it','top','co'
my_stop_words = list(text.ENGLISH_STOP_WORDS)
for i in ['own','it','top','co']:
    my_stop_words.remove(i)

# vectorize strings
dictForCodes = {}
vectorizer = TfidfVectorizer(min_df=2,
                             norm ='l2',
                             use_idf=True,
                             sublinear_tf = False,
                             stop_words = my_stop_words,)

vector = vectorizer.fit_transform(train_df.Title)

# count mean vectors for each class
for code in codes:
    size = len(test_df[train_df['Code'] == code].index)
    group_vectors = []
    for i in test_df[train_df['Code'] == code].index:
        group_vectors.append(vector.toarray()[i])
    dictForCodes[code] = sum(group_vectors) / size
    


  size = len(test_df[train_df['Code'] == code].index)
  for i in test_df[train_df['Code'] == code].index:


In [5]:
# function for vectorize input string, count cos_distanse with each class and take class with maximum cos_distanse
def checker_class (string):
    max_cos_metric = 0
    max_code = ''
    for code, vector in dictForCodes.items():
        cos_metric = cosine_similarity(vectorizer.transform([string]).toarray(), [dictForCodes[code]])
        if cos_metric > max_cos_metric:
            max_cos_metric = cos_metric
            max_code = code

    return max_code

# add result to dataframe and compare with target
test_df['result']=''
test_df['diff'] = ''
for i in test_df.index:
    test_df.loc[i,'result'] = checker_class(test_df['Title'][i])
    if test_df['result'][i] == test_df['Code'][i]:
        test_df.loc[i,'diff'] = 1
    else:
        test_df.loc[i,'diff'] = 0

In [6]:
# count results
res = len(test_df[test_df['diff'] == 1])/len(test_df)
print(f'Result: {res}')

test_df.groupby('diff').count()['result']



Result: 0.40628099173553717


diff
0    1796
1    1229
Name: result, dtype: int64

In [7]:
# top 5 words from each class
dict_for_top_words = {}
for code in dictForCodes.keys():
    top_five = np.argsort(dictForCodes[code])[::-1]            
    top_word_list = []
    for index in top_five[0:5]:
        top_word_list.append(vectorizer.get_feature_names()[index])
    
    dict_for_top_words[code] = top_word_list

dict_for_top_words

{'11-1021.00': ['manager', 'operations', 'supervisor', 'store', 'retail'],
 '11-2021.00': ['manager', 'marketing', 'senior', 'sales', 'specialist'],
 '11-2022.00': ['sales', 'manager', 'account', 'director', 'representative'],
 '11-3031.02': ['senior', 'manager', 'tax', 'accountant', 'financial'],
 '13-1111.00': ['analyst', 'consultant', 'business', 'manager', 'senior'],
 '13-2051.00': ['analyst', 'financial', 'finance', 'manager', 'budget'],
 '15-1121.00': ['analyst', 'senior', 'program', 'specialist', 'business'],
 '15-1122.00': ['security', 'engineer', 'analyst', 'information', 'cyber'],
 '15-1132.00': ['engineer', 'software', 'senior', 'developer', 'systems'],
 '15-1133.00': ['engineer', 'systems', 'software', 'senior', 'analyst'],
 '15-1134.00': ['developer', 'senior', 'sharepoint', 'software', 'engineer'],
 '15-1142.00': ['administrator', 'engineer', 'network', 'systems', 'sr'],
 '15-1151.00': ['support', 'specialist', 'technician', 'desk', 'engineer'],
 '29-1141.00': ['nurse', '

# TF-IDF (TfidfVectorizer) ->  Gradient boosting for classification (Catboost by Yandex)
https://catboost.ai/

In [8]:
!pip install CatBoost



In [9]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

test_df = pd.read_csv('/Users/vivla/Downloads/N.Rich_Testing_Assignment/test_df.csv')
train_df = pd.read_csv('/Users/vivla/Downloads/N.Rich_Testing_Assignment/train_df.csv')


In [10]:
codes = train_df.Code.unique()
dictCodes = {}
number = 0
for code in codes:
    dictCodes[code] = number
    number += 1

def label(x):
    return dictCodes[x]
    
    

train_df['label'] = train_df['Code'].apply(lambda x: label(x))
test_df['label'] = test_df['Code'].apply(lambda x: label(x))
train_df.head(3)

Unnamed: 0,Title,Code,label
0,senior program analyst navy strike fighter sen...,15-1121.00,0
1,senior intelligence analyst iii job,33-3021.06,1
2,retail wireless sales consultant part,41-2031.00,2


In [11]:
# use cross validation with 5 Folds. Each iteration has own model, predict by each model and vote for predict class

X = vector.toarray() 
y = train_df['label']

kf = KFold(n_splits=5, shuffle=True)
models = []

for i, (train_index, valid_index) in enumerate(kf.split(X)):
    X_train, y_train = X[train_index], y[train_index]
    X_valid, y_valid = X[valid_index], y[valid_index]

    model = CatBoostClassifier(l2_leaf_reg = 4, 
                               iterations=250, 
                               learning_rate=0.6,
                               eval_metric='Accuracy', 
                               early_stopping_rounds=50)
    
    model.fit(X_train, y_train, eval_set=(X_valid, y_valid))
    models.append(model)




0:	learn: 0.1992147	test: 0.2020661	best: 0.2020661 (0)	total: 216ms	remaining: 53.7s
1:	learn: 0.2461252	test: 0.2500000	best: 0.2500000 (1)	total: 346ms	remaining: 42.9s
2:	learn: 0.2716470	test: 0.2764463	best: 0.2764463 (2)	total: 476ms	remaining: 39.2s
3:	learn: 0.2852862	test: 0.2834711	best: 0.2834711 (3)	total: 607ms	remaining: 37.3s
4:	learn: 0.3007853	test: 0.3012397	best: 0.3012397 (4)	total: 739ms	remaining: 36.2s
5:	learn: 0.3115313	test: 0.3128099	best: 0.3128099 (5)	total: 870ms	remaining: 35.4s
6:	learn: 0.3157677	test: 0.3161157	best: 0.3161157 (6)	total: 1s	remaining: 34.8s
7:	learn: 0.3331267	test: 0.3305785	best: 0.3305785 (7)	total: 1.14s	remaining: 34.6s
8:	learn: 0.3429428	test: 0.3413223	best: 0.3413223 (8)	total: 1.29s	remaining: 34.7s
9:	learn: 0.3482124	test: 0.3433884	best: 0.3433884 (9)	total: 1.46s	remaining: 35s
10:	learn: 0.3538954	test: 0.3462810	best: 0.3462810 (10)	total: 1.61s	remaining: 35s
11:	learn: 0.3609217	test: 0.3574380	best: 0.3574380 (11)	t

94:	learn: 0.4586691	test: 0.4272727	best: 0.4276860 (90)	total: 12.7s	remaining: 20.8s
95:	learn: 0.4586691	test: 0.4272727	best: 0.4276860 (90)	total: 12.9s	remaining: 20.6s
96:	learn: 0.4588758	test: 0.4272727	best: 0.4276860 (90)	total: 13s	remaining: 20.5s
97:	learn: 0.4597024	test: 0.4272727	best: 0.4276860 (90)	total: 13.1s	remaining: 20.4s
98:	learn: 0.4595991	test: 0.4272727	best: 0.4276860 (90)	total: 13.3s	remaining: 20.2s
99:	learn: 0.4601157	test: 0.4285124	best: 0.4285124 (99)	total: 13.4s	remaining: 20.1s
100:	learn: 0.4602191	test: 0.4276860	best: 0.4285124 (99)	total: 13.5s	remaining: 20s
101:	learn: 0.4603224	test: 0.4276860	best: 0.4285124 (99)	total: 13.7s	remaining: 19.8s
102:	learn: 0.4608390	test: 0.4293388	best: 0.4293388 (102)	total: 13.8s	remaining: 19.7s
103:	learn: 0.4613557	test: 0.4289256	best: 0.4293388 (102)	total: 13.9s	remaining: 19.6s
104:	learn: 0.4612523	test: 0.4280992	best: 0.4293388 (102)	total: 14.1s	remaining: 19.4s
105:	learn: 0.4616656	test: 

32:	learn: 0.4150651	test: 0.3830579	best: 0.3830579 (32)	total: 4.38s	remaining: 28.8s
33:	learn: 0.4166150	test: 0.3822314	best: 0.3830579 (32)	total: 4.51s	remaining: 28.7s
34:	learn: 0.4159950	test: 0.3814050	best: 0.3830579 (32)	total: 4.64s	remaining: 28.5s
35:	learn: 0.4185782	test: 0.3805785	best: 0.3830579 (32)	total: 4.78s	remaining: 28.4s
36:	learn: 0.4195082	test: 0.3822314	best: 0.3830579 (32)	total: 4.91s	remaining: 28.3s
37:	learn: 0.4200248	test: 0.3822314	best: 0.3830579 (32)	total: 5.04s	remaining: 28.1s
38:	learn: 0.4219880	test: 0.3838843	best: 0.3838843 (38)	total: 5.17s	remaining: 28s
39:	learn: 0.4240546	test: 0.3851240	best: 0.3851240 (39)	total: 5.3s	remaining: 27.8s
40:	learn: 0.4261211	test: 0.3838843	best: 0.3851240 (39)	total: 5.43s	remaining: 27.7s
41:	learn: 0.4260178	test: 0.3826446	best: 0.3851240 (39)	total: 5.57s	remaining: 27.6s
42:	learn: 0.4270510	test: 0.3826446	best: 0.3851240 (39)	total: 5.7s	remaining: 27.4s
43:	learn: 0.4277743	test: 0.3838843

126:	learn: 0.4726183	test: 0.4074380	best: 0.4090909 (122)	total: 17.1s	remaining: 16.6s
127:	learn: 0.4738582	test: 0.4111570	best: 0.4111570 (127)	total: 17.3s	remaining: 16.4s
128:	learn: 0.4740649	test: 0.4086777	best: 0.4111570 (127)	total: 17.4s	remaining: 16.3s
129:	learn: 0.4734449	test: 0.4090909	best: 0.4111570 (127)	total: 17.5s	remaining: 16.2s
130:	learn: 0.4739616	test: 0.4099174	best: 0.4111570 (127)	total: 17.7s	remaining: 16s
131:	learn: 0.4743749	test: 0.4115702	best: 0.4115702 (131)	total: 17.8s	remaining: 15.9s
132:	learn: 0.4750982	test: 0.4099174	best: 0.4115702 (131)	total: 17.9s	remaining: 15.8s
133:	learn: 0.4747882	test: 0.4111570	best: 0.4115702 (131)	total: 18.1s	remaining: 15.6s
134:	learn: 0.4739616	test: 0.4103306	best: 0.4115702 (131)	total: 18.2s	remaining: 15.5s
135:	learn: 0.4736516	test: 0.4103306	best: 0.4115702 (131)	total: 18.3s	remaining: 15.4s
136:	learn: 0.4746849	test: 0.4111570	best: 0.4115702 (131)	total: 18.5s	remaining: 15.2s
137:	learn: 

36:	learn: 0.4205414	test: 0.4024793	best: 0.4028926 (34)	total: 5.09s	remaining: 29.3s
37:	learn: 0.4231246	test: 0.4057851	best: 0.4057851 (37)	total: 5.22s	remaining: 29.2s
38:	learn: 0.4227113	test: 0.4061983	best: 0.4061983 (38)	total: 5.36s	remaining: 29s
39:	learn: 0.4235379	test: 0.4057851	best: 0.4061983 (38)	total: 5.5s	remaining: 28.9s
40:	learn: 0.4242612	test: 0.4061983	best: 0.4061983 (38)	total: 5.64s	remaining: 28.7s
41:	learn: 0.4250878	test: 0.4061983	best: 0.4061983 (38)	total: 5.78s	remaining: 28.6s
42:	learn: 0.4238479	test: 0.4107438	best: 0.4107438 (42)	total: 5.92s	remaining: 28.5s
43:	learn: 0.4242612	test: 0.4078512	best: 0.4107438 (42)	total: 6.06s	remaining: 28.4s
44:	learn: 0.4248812	test: 0.4074380	best: 0.4107438 (42)	total: 6.2s	remaining: 28.3s
45:	learn: 0.4252945	test: 0.4078512	best: 0.4107438 (42)	total: 6.34s	remaining: 28.1s
46:	learn: 0.4260178	test: 0.4070248	best: 0.4107438 (42)	total: 6.48s	remaining: 28s
47:	learn: 0.4265344	test: 0.4078512	b

130:	learn: 0.4633189	test: 0.4227273	best: 0.4227273 (130)	total: 18.5s	remaining: 16.8s
131:	learn: 0.4638355	test: 0.4223140	best: 0.4227273 (130)	total: 18.6s	remaining: 16.7s
132:	learn: 0.4644555	test: 0.4223140	best: 0.4227273 (130)	total: 18.8s	remaining: 16.5s
133:	learn: 0.4642488	test: 0.4223140	best: 0.4227273 (130)	total: 18.9s	remaining: 16.4s
134:	learn: 0.4644555	test: 0.4214876	best: 0.4227273 (130)	total: 19.1s	remaining: 16.3s
135:	learn: 0.4648688	test: 0.4219008	best: 0.4227273 (130)	total: 19.2s	remaining: 16.1s
136:	learn: 0.4651788	test: 0.4219008	best: 0.4227273 (130)	total: 19.4s	remaining: 16s
137:	learn: 0.4654887	test: 0.4214876	best: 0.4227273 (130)	total: 19.5s	remaining: 15.8s
138:	learn: 0.4652821	test: 0.4219008	best: 0.4227273 (130)	total: 19.7s	remaining: 15.7s
139:	learn: 0.4661087	test: 0.4227273	best: 0.4227273 (130)	total: 19.8s	remaining: 15.6s
140:	learn: 0.4662120	test: 0.4223140	best: 0.4227273 (130)	total: 19.9s	remaining: 15.4s
141:	learn: 

222:	learn: 0.4825377	test: 0.4280992	best: 0.4309917 (180)	total: 32s	remaining: 3.87s
223:	learn: 0.4830544	test: 0.4280992	best: 0.4309917 (180)	total: 32.1s	remaining: 3.73s
224:	learn: 0.4838810	test: 0.4285124	best: 0.4309917 (180)	total: 32.3s	remaining: 3.59s
225:	learn: 0.4840876	test: 0.4297521	best: 0.4309917 (180)	total: 32.4s	remaining: 3.44s
226:	learn: 0.4843976	test: 0.4285124	best: 0.4309917 (180)	total: 32.6s	remaining: 3.3s
227:	learn: 0.4845009	test: 0.4289256	best: 0.4309917 (180)	total: 32.8s	remaining: 3.16s
228:	learn: 0.4843976	test: 0.4285124	best: 0.4309917 (180)	total: 32.9s	remaining: 3.02s
229:	learn: 0.4841909	test: 0.4289256	best: 0.4309917 (180)	total: 33.1s	remaining: 2.87s
230:	learn: 0.4851209	test: 0.4276860	best: 0.4309917 (180)	total: 33.2s	remaining: 2.73s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4309917355
bestIteration = 180

Shrink model to first 181 iterations.
0:	learn: 0.1978510	test: 0.1934684	best: 0.1934684 (0)

84:	learn: 0.4531460	test: 0.4109136	best: 0.4109136 (83)	total: 12.6s	remaining: 24.4s
85:	learn: 0.4532493	test: 0.4092600	best: 0.4109136 (83)	total: 12.7s	remaining: 24.2s
86:	learn: 0.4533526	test: 0.4105002	best: 0.4109136 (83)	total: 12.9s	remaining: 24.1s
87:	learn: 0.4536626	test: 0.4109136	best: 0.4109136 (83)	total: 13s	remaining: 23.9s
88:	learn: 0.4542825	test: 0.4109136	best: 0.4109136 (83)	total: 13.2s	remaining: 23.8s
89:	learn: 0.4542825	test: 0.4096734	best: 0.4109136 (83)	total: 13.3s	remaining: 23.6s
90:	learn: 0.4550057	test: 0.4105002	best: 0.4109136 (83)	total: 13.4s	remaining: 23.5s
91:	learn: 0.4554189	test: 0.4100868	best: 0.4109136 (83)	total: 13.6s	remaining: 23.3s
92:	learn: 0.4553156	test: 0.4113270	best: 0.4113270 (92)	total: 13.7s	remaining: 23.2s
93:	learn: 0.4554189	test: 0.4117404	best: 0.4117404 (93)	total: 13.9s	remaining: 23s
94:	learn: 0.4553156	test: 0.4121538	best: 0.4121538 (94)	total: 14s	remaining: 22.9s
95:	learn: 0.4564521	test: 0.4117404	b

176:	learn: 0.4827978	test: 0.4241422	best: 0.4241422 (176)	total: 26.2s	remaining: 10.8s
177:	learn: 0.4823845	test: 0.4249690	best: 0.4249690 (177)	total: 26.3s	remaining: 10.6s
178:	learn: 0.4827978	test: 0.4245556	best: 0.4249690 (177)	total: 26.5s	remaining: 10.5s
179:	learn: 0.4827978	test: 0.4241422	best: 0.4249690 (177)	total: 26.6s	remaining: 10.4s
180:	learn: 0.4833144	test: 0.4253824	best: 0.4253824 (180)	total: 26.8s	remaining: 10.2s
181:	learn: 0.4833144	test: 0.4245556	best: 0.4253824 (180)	total: 26.9s	remaining: 10.1s
182:	learn: 0.4839343	test: 0.4241422	best: 0.4253824 (180)	total: 27.1s	remaining: 9.91s
183:	learn: 0.4835210	test: 0.4245556	best: 0.4253824 (180)	total: 27.2s	remaining: 9.76s
184:	learn: 0.4839343	test: 0.4249690	best: 0.4253824 (180)	total: 27.4s	remaining: 9.61s
185:	learn: 0.4840376	test: 0.4245556	best: 0.4253824 (180)	total: 27.5s	remaining: 9.46s
186:	learn: 0.4841409	test: 0.4249690	best: 0.4253824 (180)	total: 27.7s	remaining: 9.32s
187:	learn

18:	learn: 0.3877467	test: 0.3546920	best: 0.3546920 (18)	total: 2.74s	remaining: 33.3s
19:	learn: 0.3912594	test: 0.3579992	best: 0.3579992 (19)	total: 2.88s	remaining: 33.1s
20:	learn: 0.3936357	test: 0.3596527	best: 0.3596527 (20)	total: 3.03s	remaining: 33s
21:	learn: 0.3947722	test: 0.3592394	best: 0.3596527 (20)	total: 3.17s	remaining: 32.9s
22:	learn: 0.3969418	test: 0.3625465	best: 0.3625465 (22)	total: 3.32s	remaining: 32.8s
23:	learn: 0.4012811	test: 0.3629599	best: 0.3629599 (23)	total: 3.46s	remaining: 32.6s
24:	learn: 0.4016944	test: 0.3629599	best: 0.3629599 (23)	total: 3.61s	remaining: 32.5s
25:	learn: 0.4043806	test: 0.3633733	best: 0.3633733 (25)	total: 3.75s	remaining: 32.4s
26:	learn: 0.4071702	test: 0.3650269	best: 0.3650269 (26)	total: 3.9s	remaining: 32.2s
27:	learn: 0.4097531	test: 0.3670938	best: 0.3670938 (27)	total: 4.05s	remaining: 32.1s
28:	learn: 0.4100630	test: 0.3695742	best: 0.3695742 (28)	total: 4.21s	remaining: 32.1s
29:	learn: 0.4108896	test: 0.370814

112:	learn: 0.4608947	test: 0.3898305	best: 0.3906573 (75)	total: 16.7s	remaining: 20.3s
113:	learn: 0.4613080	test: 0.3898305	best: 0.3906573 (75)	total: 16.9s	remaining: 20.1s
114:	learn: 0.4613080	test: 0.3898305	best: 0.3906573 (75)	total: 17.1s	remaining: 20s
115:	learn: 0.4608947	test: 0.3914841	best: 0.3914841 (115)	total: 17.2s	remaining: 19.9s
116:	learn: 0.4609980	test: 0.3910707	best: 0.3914841 (115)	total: 17.3s	remaining: 19.7s
117:	learn: 0.4615146	test: 0.3910707	best: 0.3914841 (115)	total: 17.5s	remaining: 19.6s
118:	learn: 0.4617213	test: 0.3910707	best: 0.3914841 (115)	total: 17.6s	remaining: 19.4s
119:	learn: 0.4619279	test: 0.3910707	best: 0.3914841 (115)	total: 17.8s	remaining: 19.3s
120:	learn: 0.4620312	test: 0.3910707	best: 0.3914841 (115)	total: 17.9s	remaining: 19.1s
121:	learn: 0.4623412	test: 0.3910707	best: 0.3914841 (115)	total: 18.1s	remaining: 19s
122:	learn: 0.4623412	test: 0.3910707	best: 0.3914841 (115)	total: 18.2s	remaining: 18.8s
123:	learn: 0.462

In [12]:
test_df['result']=''
test_df['diff'] = ''

# predict by each model
for i in test_df.index:
    preds= []
    for model in models:
        pred = model.predict(vectorizer.transform([test_df.Title[i]]).toarray())
        preds.append(pred[0][0])
    
    # vote for predicted classes 
    max_count = 0
    result = ''
    for y in set(preds):
        if max_count < preds.count(y):
            max_count = preds.count(y)
            result = y 
    
    test_df.loc[i,'result'] = result       
        
    if test_df['result'][i] == test_df['label'][i]:
        test_df.loc[i,'diff'] = 1
    else:
        test_df.loc[i,'diff'] = 0


In [13]:
# count results
res = len(test_df[test_df['diff'] == 1])/len(test_df)
print(f'Result: {res}')

test_df.groupby('diff').count()['result']

Result: 0.4585123966942149


diff
0    1638
1    1387
Name: result, dtype: int64