<a href="https://colab.research.google.com/github/zidanseno/6pm/blob/main/goji.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Resources**
1. [Multi-label Classification (Image)](https://github.com/ashrefm/multi-label-soft-f1/blob/master/Multi-Label%20Image%20Classification%20in%20TensorFlow%202.0.ipynb)
2. [Preprocessing Layers (TensorFlow)](https://www.tensorflow.org/tutorials/structured_data/preprocessing_layers)
3. [Preprocessing Layers (Keras)](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/)




In [None]:
import ast

import gdown
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras import layers

In [None]:
# Download the grouped dummy dataset from Google Drive and load it into pandas.

# `file_id` instead of `id`, so the built-in `id()` is not shadowed for the rest
# of the notebook.
file_id = "1ekxGO1sbSWooxw0wvRv5sL-2P5f82hUU"
csv_path = "datadummy_new_grouped.csv"
gdown.download(id=file_id, output=csv_path, quiet=False)

# Read from the same relative path gdown wrote to, rather than a hard-coded
# "/content/..." prefix that only exists on Colab.
df = pd.read_csv(csv_path)
df.head()

Downloading...
From: https://drive.google.com/uc?id=1ekxGO1sbSWooxw0wvRv5sL-2P5f82hUU
To: /content/datadummy_new_grouped.csv
100%|██████████| 135k/135k [00:00<00:00, 85.6MB/s]


Unnamed: 0.1,Unnamed: 0,Project Type,Topics,Sub Topic,Difficulty,Workers
0,0,Front End,Front End,Ember.js,9,"['Iga Narendra Pramawijaya', 'Muhammad Raden S..."
1,1,ML,Speech / Audio,Speech Recognition,8,"['I Putu Ranantha Nugraha Suparta', 'Putu Gede..."
2,2,ML,NLP,Sentiment Analysis,23,"['Alvin Tan', 'Bagja Kurniadi', 'Sarah Sema Kh..."
3,3,ML,NLP,Sentiment Analysis,19,"['Bagja Kurniadi', 'Alvin Tan', 'Sarah Sema Kh..."
4,4,Front End,Front End,Angular,7,"['Andi Rezal Oktavianto', 'Imam']"


In [None]:
# Drop the leftover "Unnamed: ..." index column written by an earlier export.
# Selecting by name (not by position) keeps the cell idempotent: re-running it
# can no longer drop a real feature column.
df = df.drop(columns=[col for col in df.columns if str(col).startswith("Unnamed")])


def _parse_workers(value):
    """Convert one stringified Python list from the CSV back into a list of names.

    `ast.literal_eval` respects the quoting of each element, so names containing
    an apostrophe or a ", " sequence survive intact and "[]" becomes an empty
    list instead of [''].
    """
    if isinstance(value, list):
        # Already parsed (the cell was re-run) - nothing to do.
        return value
    if pd.isna(value):
        return []
    parsed = ast.literal_eval(value)
    if not isinstance(parsed, (list, tuple)):
        raise ValueError("Expected a list of worker names, got: {!r}".format(value))
    return [str(name) for name in parsed]


# Transform 'Workers' from strings into lists
df['Workers'] = df['Workers'].apply(_parse_workers)
df.head()

Unnamed: 0,Project Type,Topics,Sub Topic,Difficulty,Workers
0,Front End,Front End,Ember.js,9,"[Iga Narendra Pramawijaya, Muhammad Raden Syaw..."
1,ML,Speech / Audio,Speech Recognition,8,"[I Putu Ranantha Nugraha Suparta, Putu Gede Ag..."
2,ML,NLP,Sentiment Analysis,23,"[Alvin Tan, Bagja Kurniadi, Sarah Sema Khairun..."
3,ML,NLP,Sentiment Analysis,19,"[Bagja Kurniadi, Alvin Tan, Sarah Sema Khairun..."
4,Front End,Front End,Angular,7,"[Andi Rezal Oktavianto, Imam]"


In [None]:
# Cast the text feature columns to pandas' dedicated "string" dtype
string_col = ['Topics', 'Sub Topic', 'Project Type']

df = df.astype({column: "string" for column in string_col})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Project Type  1000 non-null   string
 1   Topics        1000 non-null   string
 2   Sub Topic     1000 non-null   string
 3   Difficulty    1000 non-null   int64 
 4   Workers       1000 non-null   object
dtypes: int64(1), object(1), string(3)
memory usage: 39.2+ KB


In [None]:
# Build the label vocabulary from every worker list in the dataframe
mlb = MultiLabelBinarizer()
labels_list = df['Workers'].tolist()
mlb.fit(labels_list)

# Number of distinct classes the binarizer found; used to size the output layer
N_LABELS = len(mlb.classes_)
for i, label in enumerate(mlb.classes_):
    print(f"{i}. {label}")

0. Abdullah Nur Hudi
1. Abiyyu Diora Haqi
2. Alvin Tan
3. Andhika Zulfikri
4. Andi Rezal Oktavianto
5. Azis Sofyanto
6. Bagja Kurniadi
7. Chairul Rizqi
8. Christopher Kristianto
9. Farel Eden
10. Gabriel Kheisa
11. I Putu Ranantha Nugraha Suparta
12. Iga Narendra Pramawijaya
13. Imam
14. Muhammad Raden Syawali Akbar
15. Nyoman Satiya Najwa Sadha
16. Putu Gede Agung Karna Sampalan
17. Rikip Ginanjar
18. Sandrian Yulianto
19. Sarah Sema Khairunisa
20. Suci Rahmadani
21. Vania Kylie
22. Wahyu Fauzan


In [None]:
# Split the dataframe into train / validation / test partitions.

# Fixed seed so that a Restart & Run All reproduces the same partitions
# (without it every run trains and evaluates on different rows).
SEED = 42

train, test = train_test_split(df, test_size=0.05, random_state=SEED)
train, val = train_test_split(train, test_size=0.05, random_state=SEED)

# Sanity check: the three partitions together cover every row exactly once.
assert len(train) + len(val) + len(test) == len(df)

print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

902 train examples
48 validation examples
50 test examples


In [None]:
# Feed a pandas dataframe into a tf.data pipeline

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Convert a dataframe with a 'Workers' label column into a batched tf.data.Dataset.

  Args:
    dataframe: pandas DataFrame holding the feature columns plus 'Workers'
      (a list of label names per row). It is copied, never modified.
    shuffle: when True, shuffle with a buffer covering the whole dataframe.
    batch_size: number of rows per batch.

  Returns:
    A tf.data.Dataset yielding `(features, labels)` where `features` is a dict
    mapping column name to a `(batch, 1)` tensor and `labels` is the binarized
    label matrix produced by the module-level `mlb`.
  """
  features = dataframe.copy()        # keep the caller's frame intact
  labels = features.pop('Workers')   # separate the labels from the features

  # `mlb` is the MultiLabelBinarizer fitted once in the label-listing cell;
  # here it is only applied, so every split shares the same class order.
  labels_bin = mlb.transform(list(labels))

  # Go through NumPy before adding the trailing axis: indexing a pandas Series
  # with `[:, tf.newaxis]` is multi-dimensional Series indexing, which pandas
  # deprecated and later removed.
  feature_arrays = {key: value.to_numpy()[:, tf.newaxis]
                    for key, value in features.items()}
  ds = tf.data.Dataset.from_tensor_slices((feature_arrays, labels_bin))

  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  # The prefetch argument is a buffer size, unrelated to the batch size; let
  # tf.data choose it.
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [None]:
# Numerical Columns

def get_normalization_layer(name, dataset):
  """Return a `layers.Normalization` adapted to one feature of `dataset`.

  Args:
    name: key of the feature inside each element's feature dict.
    dataset: tf.data.Dataset yielding `(features_dict, labels)` pairs.

  Returns:
    The Normalization layer after `adapt` has been run on that feature.
  """
  # Narrow the dataset down to the single feature of interest.
  column_only = dataset.map(lambda features, _labels: features[name])

  # Let the layer learn its statistics from that feature.
  scaler = layers.Normalization()
  scaler.adapt(column_only)
  return scaler

In [None]:
# Categorical Columns

def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that encodes one categorical feature of `dataset`.

  Args:
    name: key of the feature inside each element's feature dict.
    dataset: tf.data.Dataset yielding `(features_dict, labels)` pairs.
    dtype: 'string' selects a StringLookup; anything else an IntegerLookup.
    max_tokens: forwarded to the lookup layer.

  Returns:
    A function mapping a feature tensor to `CategoryEncoding(lookup(feature))`.
  """
  # Strings and integers need different lookup layers to obtain integer indices.
  lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
  index = lookup_cls(max_tokens=max_tokens)

  # Learn the vocabulary from just this feature.
  index.adapt(dataset.map(lambda features, _labels: features[name]))

  # One output slot per vocabulary entry known to the lookup layer.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  def encode(feature):
    # The closure keeps both layers alive so they can be wired into a
    # Keras Functional model later.
    return encoder(index(feature))

  return encode

In [None]:
# Turn each dataframe partition into a batched tf.data pipeline.
# Only the training partition is shuffled.

batch_size = 10
train_ds, val_ds, test_ds = (
    df_to_dataset(partition, shuffle=do_shuffle, batch_size=batch_size)
    for partition, do_shuffle in ((train, True), (val, False), (test, False))
)

  df = {key: value[:,tf.newaxis] for key, value in df.items()} # x


In [None]:
# One Keras Input per feature column, paired with its preprocessing output.
all_inputs = []
encoded_features = []

# Numerical features: normalised by a layer adapted on the training dataset.
numeric_cols = ['Difficulty']

for header in numeric_cols:
  numeric_col = tf.keras.Input(shape=(1,), name=header)
  all_inputs.append(numeric_col)
  normalization_layer = get_normalization_layer(header, train_ds)
  encoded_features.append(normalization_layer(numeric_col))

# Categorical features: string lookup followed by category encoding,
# with the vocabulary learned from the training dataset.
categorical_cols = ['Topics', 'Sub Topic', 'Project Type']

for header in categorical_cols:
  categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
  all_inputs.append(categorical_col)
  encoding_layer = get_category_encoding_layer(
      name=header, dataset=train_ds, dtype='string')
  encoded_features.append(encoding_layer(categorical_col))


[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Difficulty')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Topics')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Sub Topic')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Project Type')>]
KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.string, name='Project Type'), name='Project Type', description="created by layer 'Project Type'")
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Difficulty')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Topics')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Sub Topic')>, <KerasTensor: shape=(None, 1) dtype=string (created by layer 'Project Type')>]
KerasTensor(type_spec=TensorSpec(shape=(None, 1), dtype=tf.string, name='Project Type'), name='Project Type', description="created by layer 'Project Type'")
[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'Difficul

In [None]:

# Number of numeric feature columns
numbers = numeric_cols

# `len` is the direct, constant-time way to get a list's size
size = len(numbers)

print(size)

1


In [None]:
# Creating the model: concatenated encoded features -> one hidden layer ->
# one sigmoid unit per label (multi-label classification).

all_features = tf.keras.layers.concatenate(encoded_features)
# NOTE(review): 1028 units looks like a typo for 1024 - confirm before changing.
x = tf.keras.layers.Dense(1028, activation="relu")(all_features)
output = tf.keras.layers.Dense(N_LABELS, activation='sigmoid')(x)

model = tf.keras.Model(all_inputs, output)

opt = tf.keras.optimizers.Adam(learning_rate=0.01)

# Keras picks the concrete metric behind the string "accuracy" from the
# target/output shapes. With an N_LABELS-wide output that is categorical
# (argmax) accuracy, which is meaningless for multi-hot targets. Request
# per-label binary accuracy explicitly; the name "accuracy" is kept so the
# history keys and `model.evaluate` still report (loss, accuracy).
model.compile(optimizer=opt,
              loss='binary_crossentropy',
              metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy")])

In [None]:
# Print the architecture of the compiled model.
# This cell previously held an abandoned Sequential-based draft wrapped in a
# string literal. It executed nothing and only echoed its own text, and its
# final `model.summary` was missing the call parentheses.
model.summary()

"all_features = tf.keras.layers.concatenate(encoded_features)\n\nmodel = tf.keras.Sequential([\n    all_features,\n    layers.Dense(100, activation='relu'),\n    layers.Dense(50, activation='relu'),\n    layers.Dense(N_LABELS, activation='linear')\n    \n])\nmodel.summary"

In [None]:
# Training the model

# Keep the returned History so the per-epoch loss/metric curves can be
# inspected afterwards (`history.history`). Assigning it also stops the bare
# `<keras.callbacks.History ...>` repr from being emitted as the cell output.
history = model.fit(train_ds, epochs=100, validation_data=val_ds)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7fa964433610>

In [None]:
# Score the trained model on the held-out test dataset

test_scores = model.evaluate(test_ds)
# The model is compiled with a single metric, so evaluate yields [loss, metric]
loss, accuracy = test_scores
print("Accuracy: ", accuracy)

Accuracy:  0.07999999821186066
