In [3]:
import pandas as pd
import random
import numpy as np
import json
import ast

import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

In [4]:
def literal_converter(val):
    """Convert one raw CSV cell into a Python literal.

    An empty string is returned unchanged so that it can be turned into a
    missing-value marker afterwards; any other value is parsed with
    ``ast.literal_eval``.
    """
    if val == '':
        return val
    return ast.literal_eval(val)

# Every skill_1 ... skill_5 column is parsed with literal_converter while reading.
converters = {'skill_' + str(i): literal_converter for i in range(1, 6)}

# Cells the converter left as '' are mapped to NaN after loading.
dataset = (
    pd.read_csv('label_data2.csv', converters=converters)
    .replace('', np.nan)
)

In [5]:
# Preview the first nine rows of the loaded table.
df = dataset.head(9)
df

Unnamed: 0,skill_1,skill_2,skill_3,skill_4,skill_5,label
0,"(BackEnd, 5)","(Python, 5)","(FrontEnd, 1)",,,
1,"(Networks, 3)","(NLP, 4)","(Security, 5)","(Machine Learning, 5)","(Computer Vision, 4)",1.0
2,,,,,,
3,"(Security, 3)","(BackEnd, 3)","(Java, 3)","(Networks, 4)","(JavaScript, 3)",
4,"(Java, 5)","(Latex, 3)","(BackEnd, 5)","(FrontEnd, 2)",,1.0
5,,,,,,
6,"(Machine Learning, 5)","(FrontEnd, 3)","(BackEnd, 3)",,,
7,"(FrontEnd, 3)","(Database, 1)","(Networks, 5)",,,1.0
8,,,,,,


In [6]:
# Skill vocabulary: the language names first, followed by the package/topic names.
language_skills = ['Java', 'C++', 'Python', 'JavaScript', 'Latex']
package_skills = [
    'FrontEnd', 'BackEnd', 'Database', 'Networks',
    'Security', 'Machine Learning', 'NLP', 'Computer Vision',
]
skills = [*language_skills, *package_skills]

# Map every skill name to its position in `skills`.
skills2idx = {name: position for position, name in enumerate(skills)}
skills2idx

{'Java': 0,
 'C++': 1,
 'Python': 2,
 'JavaScript': 3,
 'Latex': 4,
 'FrontEnd': 5,
 'BackEnd': 6,
 'Database': 7,
 'Networks': 8,
 'Security': 9,
 'Machine Learning': 10,
 'NLP': 11,
 'Computer Vision': 12}

In [7]:
cols = [*skills, *skills, 'label']


def transform_rows(baserow_num, dataset):
    """Flatten two consecutive rows of `dataset` into one numeric vector.

    The vector has ``len(cols)`` entries. Row ``baserow_num`` fills the first
    block of ``len(skills)`` positions and row ``baserow_num + 1`` fills the
    second block. For each non-missing cell (every column except the last),
    element 0 is looked up in ``skills2idx`` to pick the position and
    element 1 is stored as the value. The final entry is the last column of
    row ``baserow_num + 1``.
    """
    second_block = len(skills)
    vector = np.zeros(len(cols))

    for block_start, row_num in ((0, baserow_num), (second_block, baserow_num + 1)):
        entries = dataset.iloc[row_num, :-1].dropna()
        for entry in entries:
            vector[block_start + skills2idx[entry[0]]] = entry[1]

    # The label is read from the second row of the pair.
    vector[-1] = dataset.iloc[baserow_num + 1, -1]
    return vector

In [8]:
def transform_data(dataset, func):
    """Apply `func` at every third row index and stack the results.

    ``func(i, dataset)`` is called for i = 0, 3, 6, ... below ``len(dataset)``.
    The returned rows are assembled into a table, rows containing any missing
    value are dropped, and the remainder is returned as a fresh NumPy array.
    """
    vectors = [func(start, dataset) for start in range(0, len(dataset), 3)]
    table = pd.DataFrame(vectors).dropna()
    return table.to_numpy().copy()

In [9]:
# Vectorise the table (one vector per every third row index; vectors that
# contain NaN are dropped inside transform_data), then hold out 25% as a test
# split. The last column is used as the label, all other columns as features.
data = transform_data(dataset, transform_rows)
X_train, X_test, y_train, y_test = train_test_split(data[:, :-1], data[:, -1], test_size=0.25, random_state=100)

In [35]:
# Show the column layout: the skill names twice (one block per row of a pair), then 'label'.
cols

['Java',
 'C++',
 'Python',
 'JavaScript',
 'Latex',
 'FrontEnd',
 'BackEnd',
 'Database',
 'Networks',
 'Security',
 'Machine Learning',
 'NLP',
 'Computer Vision',
 'Java',
 'C++',
 'Python',
 'JavaScript',
 'Latex',
 'FrontEnd',
 'BackEnd',
 'Database',
 'Networks',
 'Security',
 'Machine Learning',
 'NLP',
 'Computer Vision',
 'label']

In [10]:
# Baseline XGBoost binary classifier. n_estimators=1000 is only a starting
# value: a later cell replaces it with the round count chosen by xgb.cv, and
# the tuning cells overwrite max_depth, min_child_weight, gamma, subsample,
# colsample_bytree and reg_lambda through set_params.
# NOTE(review): no random_state is passed although subsample and
# colsample_bytree are below 1 — results presumably vary between runs;
# confirm whether a seed should be fixed.
xgb_clf = xgb.XGBClassifier(objective='binary:logistic',
                            n_estimators=1000,
                            eval_metric='logloss', 
                            use_label_encoder=False,
                            max_depth=5,
                            min_child_weight=1,
                            gamma=0,
                            subsample=0.8,  
                            colsample_bytree=0.8,
                            learning_rate=0.1)

In [11]:
# Choose the number of boosting rounds with 5-fold xgb.cv on the training
# split: at most the classifier's current n_estimators rounds are run, with
# early_stopping_rounds=50 and logloss as the metric.
# The number of rows in the returned result is then stored as n_estimators.
train_matrix = xgb.DMatrix(X_train, label=y_train)
cv_res = xgb.cv(xgb_clf.get_xgb_params(), 
                  train_matrix, 
                  num_boost_round=xgb_clf.get_params()['n_estimators'], 
                  nfold=5,
                  metrics='logloss', 
                  early_stopping_rounds=50)
xgb_clf.set_params(n_estimators=cv_res.shape[0])

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.8,
              enable_categorical=False, eval_metric='logloss', gamma=0,
              gpu_id=None, importance_type=None, interaction_constraints=None,
              learning_rate=0.1, max_delta_step=None, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=41, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=0.8,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)

In [12]:
# Tuning max_depth & min_child_weight: exhaustive 5-fold grid search over
# max_depth 3..9 and min_child_weight 1..5, scored by accuracy on the
# training split.
param_grid = {'max_depth': range(3, 10),
              'min_child_weight': range(1, 6)}
grid_search = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'max_depth': 3, 'min_child_weight': 2}
best_score: 0.676083916083916


In [13]:
# Fix the values the previous search printed as best (hard-coded from that
# output, not read from grid_search.best_params_).
xgb_clf.set_params(max_depth=3, min_child_weight=2) 

# Tuning gamma: 5-fold grid search over 0.0, 0.1, ..., 1.9, scored by accuracy.
param_grid = {'gamma': [i/10.0 for i in range(20)]}
grid_search = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'gamma': 1.5}
best_score: 0.6853146853146852


In [14]:
# Fix the gamma value the previous search printed as best (hard-coded).
xgb_clf.set_params(gamma=1.5) 

# Tuning subsample and colsample_bytree: 5-fold grid search over every
# combination of 0.2, 0.3, ..., 1.0 for both, scored by accuracy.
param_grid = {'subsample': [i/10.0 for i in range(2, 11)],
              'colsample_bytree': [i/10.0 for i in range(2, 11)]}
grid_search = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'colsample_bytree': 0.9, 'subsample': 1.0}
best_score: 0.7003263403263403


In [15]:
# Fix the sampling ratios the previous search printed as best (hard-coded).
xgb_clf.set_params(subsample=1.0, colsample_bytree=0.9)

# Tuning regularization lambda: 5-fold grid search over 0.1, 0.2, ..., 1.9,
# scored by accuracy.
param_grid = {'reg_lambda': [i/10.0 for i in range(1, 20)]}
grid_search = GridSearchCV(xgb_clf, param_grid, scoring='accuracy', cv=5)
grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)

best_params: {'reg_lambda': 1.0}
best_score: 0.7003263403263403


In [16]:
# Fix reg_lambda to the value the previous search printed as best (hard-coded),
# train the tuned classifier on the training split and predict the held-out split.
xgb_clf.set_params(reg_lambda=1.0)
xgb_clf.fit(X_train, y_train)
pred_test = xgb_clf.predict(X_test)

In [17]:
# Per-class precision / recall / F1 of the XGBoost predictions on the held-out split.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.75      0.82      0.78        61
         1.0       0.74      0.65      0.70        49

    accuracy                           0.75       110
   macro avg       0.75      0.74      0.74       110
weighted avg       0.75      0.75      0.74       110



In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline: 5-fold grid search over the number of trees and the
# maximum depth, scored by accuracy on the training split.
# NOTE(review): RandomForestClassifier() is created without a random_state —
# presumably results differ between runs; confirm whether a seed is wanted.
rf = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 20, 50, 80, 100],
    'max_depth': [2, 10, 20, None]
}

rf_cv = GridSearchCV(rf, param_grid, scoring='accuracy', cv=5)
rf_cv.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 10, 20, None],
                         'n_estimators': [10, 20, 50, 80, 100]},
             scoring='accuracy')

In [19]:
# Report the winning random-forest setting and its mean cross-validated accuracy.
print('best_params:', rf_cv.best_params_)
print('best_score:', rf_cv.best_score_)

best_params: {'max_depth': 10, 'n_estimators': 50}
best_score: 0.6453146853146853


In [20]:
# Take the best forest found by the grid search and predict the held-out split.
# NOTE(review): with GridSearchCV's default refit setting, best_estimator_ is
# presumably already fitted on X_train, so this extra fit retrains it —
# confirm whether that is intended.
rf = rf_cv.best_estimator_
rf.fit(X_train, y_train)
pred_test = rf.predict(X_test)

In [21]:
# Per-class precision / recall / F1 of the random-forest predictions on the held-out split.
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.72      0.69      0.71        61
         1.0       0.63      0.67      0.65        49

    accuracy                           0.68       110
   macro avg       0.68      0.68      0.68       110
weighted avg       0.68      0.68      0.68       110



In [22]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes baseline with default settings, evaluated on the held-out split.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred_test = gnb.predict(X_test)
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.73      0.74      0.73        61
         1.0       0.67      0.65      0.66        49

    accuracy                           0.70       110
   macro avg       0.70      0.70      0.70       110
weighted avg       0.70      0.70      0.70       110



In [23]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours baseline with the neighbour count fixed at 8,
# evaluated on the held-out split.
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
pred_test = knn.predict(X_test)
print(classification_report(y_test, pred_test))

              precision    recall  f1-score   support

         0.0       0.58      0.87      0.70        61
         1.0       0.58      0.22      0.32        49

    accuracy                           0.58       110
   macro avg       0.58      0.55      0.51       110
weighted avg       0.58      0.58      0.53       110



In [28]:
# Search grid for the k-NN neighbour count (1..30), scored by accuracy.
# The key must match the estimator's parameter name exactly ('n_neighbors');
# the previous spelling 'n_neighbots' is not a KNeighborsClassifier parameter,
# so fitting the search would have been rejected.
param_grid = {'n_neighbors': range(1, 31)}
knn_cv = GridSearchCV(knn, param_grid, scoring='accuracy')

In [29]:
# Load the table to be labelled, using the same skill-column converters and
# the same ''-to-NaN replacement as the training table.
# NOTE(review): the saved output of this cell is a FileNotFoundError for
# 'new_label_data.csv' — confirm the file's location before re-running.
new_dataset = pd.read_csv('new_label_data.csv', converters=converters).replace('', np.nan)

def new_transform_rows(baserow_num, dataset):
    """Flatten two consecutive rows into a feature vector without a label slot.

    The vector has ``len(cols) - 1`` entries: row ``baserow_num`` fills the
    first block of ``len(skills)`` positions and row ``baserow_num + 1`` the
    second. For each non-missing cell (every column except the last),
    element 0 selects the position through ``skills2idx`` and element 1 is
    stored as the value.
    """
    width = len(skills)
    vector = np.zeros(len(cols) - 1)
    for block, row_num in enumerate((baserow_num, baserow_num + 1)):
        for entry in dataset.iloc[row_num, :-1].dropna():
            vector[skills2idx[entry[0]] + block * width] = entry[1]
    return vector

# Vectorise the new table by calling new_transform_rows at every third row index.
new_data = transform_data(new_dataset, new_transform_rows)

FileNotFoundError: [Errno 2] No such file or directory: 'new_label_data.csv'

In [30]:
new_pred = xgb_clf.predict(new_data)

# Rows 1, 4, 7, ... (the second row of each three-row group) receive the
# prediction for their group, written into the last column.
for group_idx, row_idx in enumerate(range(1, len(new_dataset), 3)):
    new_dataset.iloc[row_idx, -1] = new_pred[group_idx]

NameError: name 'new_data' is not defined

In [27]:
# Write the table to 'new_data.csv' without the index column.
new_dataset.to_csv('new_data.csv', index=False)

NameError: name 'new_dataset' is not defined