In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import layers, Model, Sequential

import numpy as np

from tqdm import tqdm

In [None]:
# Load the raw table from a CSV path relative to the current working directory.
# At this point the frame still contains the "Suggested Job Role" target column.
df = pd.read_csv("./dataset/career_pred.csv")

In [None]:
# Show every column name except the last one (row selection has no effect on
# the column index, so only the column slice is needed).
df.columns[:-1]

In [None]:
# One-hot encode the target column. A role's class id is its position in
# labels_unique, i.e. the order in which roles first appear in the column.
job_roles = df["Suggested Job Role"]
labels_unique = job_roles.unique().tolist()
class_ids = job_roles.map(labels_unique.index)
labels = to_categorical(class_ids)

In [None]:
# Keep every column except the last one.
# NOTE: this rebinds df, so re-running the cell removes one more column each time.
df = df.iloc[:, :-1]

In [None]:
# List the column names that remain in df. The earlier version of this cell
# looked up an empty-string column name, which raises KeyError (read_csv does
# not produce an empty column label) and halts a top-to-bottom run.
df.columns.tolist()

In [None]:
# Distinct values of selected categorical columns, keyed by a display name.
# Each pair is (key stored in k, column read from df); note that two keys
# differ slightly from their column names.
k = {
    key: df[column].unique().tolist()
    for key, column in (
        ("certification", "certifications"),
        ("workshops", "workshops"),
        ("Interested subjects", "Interested subjects"),
        ("interested career area", "interested career area "),
        ("Type of company want to settle in?", "Type of company want to settle in?"),
    )
}

In [None]:
# Display the class names; a name's position in this list is its class id.
labels_unique

In [None]:
# Feature column groups. convert_to_x encodes each group differently, and
# make_model splits its input using the lengths of these four lists.

# Columns copied into the feature vector unchanged.
exclusion = [
    'Hours working per day',
    'Logical quotient rating',
    'hackathons',
    'coding skills rating',
    'public speaking points']

# Percentage columns; convert_to_x divides these by 100.
items_1 = [
    'Acedamic percentage in Operating Systems', 
    'percentage in Algorithms',
    'Percentage in Programming Concepts',
    'Percentage in Software Engineering', 
    'Percentage in Computer Networks',
    'Percentage in Electronics Subjects',
    'Percentage in Computer Architecture', 
    'Percentage in Mathematics',
    'Percentage in Communication skills']

# Columns encoded as 0 when the value is "no" and 1 otherwise.
items_2 = [
    'can work long time before system?',
    'self-learning capability?', 
    'Extra-courses did',
    'Management or Technical', 
    'Salary/work', 
    'hard/smart worker', 
    'worked in teams ever?', 
    'Introvert', 
    'Job/Higher Studies?']

# Categorical columns encoded as the position of the value among the column's
# distinct values. 'interested career area ' really has a trailing space.
items_3 = [
    'certifications',
    'workshops',
    'reading and writing skills',
    'memory capability score',
    'Interested subjects',
    'interested career area ',
    'Type of company want to settle in?']

# Print all selected column names, group after group.
print(items_1+items_2+items_3+exclusion)

def convert_to_x(row):
    """Encode one row of ``df`` as a flat list of numeric model features.

    The output is laid out group by group -- ``items_1``, ``items_2``,
    ``items_3``, then ``exclusion`` -- each in list order. ``make_model``
    slices its input with ``tf.split`` using exactly those group sizes, so the
    layout must not depend on the column order of the CSV. The previous
    version walked ``row.index`` and therefore followed the CSV order.

    Reads the module-level ``df`` (for categorical codes) and the four
    module-level feature lists.

    Args:
        row: one row of ``df`` (e.g. ``df.iloc[i]``), indexed by column name.

    Returns:
        list: one value per column named in the four feature lists.

    Raises:
        KeyError: if ``row`` lacks one of the listed columns.
        ValueError: if a categorical value does not occur in ``df[col]``.
    """
    features = []

    # Percentage columns are rescaled by 1/100.
    for col in items_1:
        features.append(row[col] / 100)

    # The literal "no" becomes 0; every other value becomes 1.
    # NOTE(review): some of these column names (e.g. 'Management or Technical',
    # 'Salary/work') suggest answers other than yes/no; if so, those columns
    # always encode to 1 -- confirm against the data.
    for col in items_2:
        features.append(0 if row[col] == "no" else 1)

    # Categorical columns: position of the value within df[col].unique().
    # NOTE: the distinct values are recomputed for every row and column.
    for col in items_3:
        uniques = df[col].unique().tolist()
        features.append(uniques.index(row[col]))

    # Remaining columns are passed through unchanged.
    for col in exclusion:
        features.append(row[col])

    return features

In [None]:
# Display the index of the first row, i.e. the column names that
# convert_to_x sees as row.index.
df.iloc[0].index

In [None]:
# Encode every row of df into the feature matrix and cache it on disk.
# Rows are addressed by position because df.iloc is positional; iterating
# df.index (labels) is only equivalent while the index is the default 0..n-1.
xs = np.array([convert_to_x(df.iloc[i]) for i in tqdm(range(len(df)))])
np.save("./dataset/xs.npy", xs)

In [None]:
# Reload the feature matrix from the file written by the np.save cell.
# NOTE: the file is not refreshed automatically; re-run the encoding cell
# after changing the feature lists or convert_to_x.
xs = np.load("./dataset/xs.npy")

In [None]:
def group_list(l, group_size):
    """Lazily yield consecutive slices of ``l``, each at most ``group_size`` long.

    The final slice is shorter when ``len(l)`` is not a multiple of
    ``group_size``.
    """
    starts = range(0, len(l), group_size)
    yield from (l[start:start + group_size] for start in starts)

BATCH_SIZE = 100

# Number of batches given to training: 80% of the count of full batches.
split_at = int(len(xs)//BATCH_SIZE * 0.8)

# reshuffle_each_iteration=False pins the shuffled order. By default the
# dataset is reshuffled on every pass, so take()/skip() would select different
# examples each epoch and validation examples would leak into training.
dataset = (
    tf.data.Dataset.from_tensor_slices((xs, labels))
    .shuffle(1000, reshuffle_each_iteration=False)
    .batch(BATCH_SIZE)
)

# First split_at batches train the model; the remaining batches validate it.
train_dataset = dataset.take(split_at)
test_dataset = dataset.skip(split_at)

In [None]:
# Width of the model input: one feature per column across the four lists.
sum_items = sum(len(group) for group in (items_1, items_2, items_3, exclusion))

In [None]:
def make_model():
    """Build the uncompiled classifier network.

    The flat input of width ``sum_items`` is split into four slices sized by
    ``items_1``, ``items_2``, ``items_3`` and ``exclusion`` (in that order).
    Each slice gets its own 32-unit ReLU layer; the four results are
    concatenated and passed through a stack of dense layers ending in a
    softmax with ``labels.shape[-1]`` outputs.

    A MultiHeadAttention layer used to be instantiated here, but its output was
    never connected to the model output, so it has been removed.

    Returns:
        A Keras ``Model`` from the single input tensor to the softmax output.
    """
    _input = layers.Input(shape=(sum_items,))

    # Slice order must match the feature order produced for the inputs.
    group_sizes = [len(items_1), len(items_2), len(items_3), len(exclusion)]
    a, b, c, d = tf.split(_input, group_sizes, axis=1)

    # One small embedding layer per feature group.
    e1 = layers.Dense(32, activation="relu")(a)
    e2 = layers.Dense(32, activation="relu")(b)
    e3 = layers.Dense(32, activation="relu")(c)
    e4 = layers.Dense(32, activation="relu")(d)

    e = layers.concatenate([e1, e2, e3, e4])

    # Shared trunk; dropout follows each of the first three dense layers.
    e = layers.Dense(1024, activation="relu")(e)
    e = layers.Dropout(0.2)(e)
    e = layers.Dense(1024, activation="relu")(e)
    e = layers.Dropout(0.2)(e)
    e = layers.Dense(512, activation="relu")(e)
    e = layers.Dropout(0.2)(e)
    e = layers.Dense(256, activation="relu")(e)
    e = layers.Dense(labels.shape[-1], activation="softmax")(e)

    return Model(inputs=[_input], outputs=e)

# Build the network and compile it for one-hot targets.
# Optimizer alternatives previously listed here as comments:
#   keras.optimizers.Adam(learning_rate=0.0001, epsilon=1e-07)
#   keras.optimizers.Nadam(learning_rate=0.0001)
model = make_model()
optimizer = keras.optimizers.Adagrad(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy", metrics=["categorical_accuracy"])

In [223]:
# Print the layer table of the current model.
# NOTE(review): the captured output stored with this cell shows a 31-wide input
# split into 9/9/8/5, whereas the feature lists in this file have 9/9/7/5
# entries (30 in total) -- the stored output looks stale; re-run to refresh.
model.summary()

Model: "model_31"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_41 (InputLayer)          [(None, 31)]         0           []                               
                                                                                                  
 tf.split_29 (TFOpLambda)       [(None, 9),          0           ['input_41[0][0]']               
                                 (None, 9),                                                       
                                 (None, 8),                                                       
                                 (None, 5)]                                                       
                                                                                                  
 dense_256 (Dense)              (None, 32)           320         ['tf.split_29[0][0]']     

In [None]:
# Train for 1000 epochs on the training batches, evaluating on the held-out
# batches after each epoch; the returned object is kept in `history`.
history = model.fit(x=train_dataset, validation_data=test_dataset, epochs=1000)

In [None]:
# Write the trained model to a single file at a path relative to the current
# working directory.
model.save("./models/main.h5")

In [None]:
# Hand-written sample feature vector for a manual prediction.
# NOTE(review): this list holds 38 values, while the model input width is
# sum_items (30 with the feature lists in this file). It looks like it was
# written for a different feature layout -- confirm before feeding it to model.
x = [
    0.82,0.81,0.85,0.78,0.80,0.70,0.80,0.91,0.81,
    10,8,3,8,6,
    1,1,1,
    1,3,0,1,
    2,2,7,2,
    0,8,0,1,29,0,
    1,1,1,1,1,1,1
]

In [None]:
# Run the hand-written sample through the model as a one-row batch and show
# the class scores of that row scaled by 100.
sample_batch = np.array(x)[np.newaxis, ...]
sample_scores = model(sample_batch).numpy()
(sample_scores * 100)[0].tolist()

In [None]:
# Display the class name whose class id is 0.
labels_unique[0]

In [None]:
# Encode the row at position 1000 as a one-row batch, then show its true
# class id (the position of the 1 in its one-hot label).
row_features = np.array(convert_to_x(df.iloc[1000]))
test = row_features[np.newaxis, ...]
labels[1000].tolist().index(1)

In [None]:
# Predicted class id for the one-row batch `test`: the index of the largest
# model output (argmax over the flattened output).
np.argmax(model(test))

In [None]:
# Show the distinct values of the 'interested career area ' column
# (the column name ends with a space).
df["interested career area "].unique()

In [None]:
# Display the class names again, to translate predicted class ids into roles.
labels_unique