# Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
import tensorflow as tf

In [None]:
!pip install -q sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Production

In [None]:
def normalize_text(text):
  """Normalize raw article text before TF-IDF vectorization.

  Steps, in order: lower-case, delete digit runs, replace punctuation with a
  space, collapse every whitespace run (including newlines) to one space and
  trim, then remove Indonesian stop words with Sastrawi.

  Args:
    text: raw article text (str).

  Returns:
    The normalized text as a single-line string.
  """
  # case folding
  text = text.lower()
  # number removal
  text = re.sub(r'\d+', '', text)
  # punctuation removal (replaced by a space so neighbouring words stay separated)
  text = re.sub(r'[^\w\s]', ' ', text)
  # extra whitespace removal; \s already matches '\n', so no separate replace is needed
  text = re.sub(r'\s+', ' ', text).strip()
  # stopword removal: this function is mapped over every article, so the remover
  # is created on the first call only and cached on the function object
  stopword_remover = getattr(normalize_text, '_stopword_remover', None)
  if stopword_remover is None:
    stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
    normalize_text._stopword_remover = stopword_remover
  return stopword_remover.remove(text)

In [None]:
def build_model(input_dim, num_classes=29):
  """Build and compile the dense topic-classification network.

  Architecture: input -> Dense(32, relu) -> Dropout(0.5) -> Dense(num_classes, softmax),
  compiled with categorical cross-entropy, Adam (learning rate 0.001) and accuracy.

  Args:
    input_dim: number of input features (width of the TF-IDF matrix).
    num_classes: number of output classes. Defaults to 29, the value that was
      previously hard-coded, so existing callers are unaffected.

  Returns:
    A compiled tf.keras.Sequential model.
  """
  model = tf.keras.Sequential([
      # explicit Input layer instead of the deprecated `input_dim=` layer argument
      tf.keras.Input(shape=(input_dim,)),
      tf.keras.layers.Dense(32, activation='relu'),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(num_classes, activation='softmax'),
  ])
  model.compile(loss='categorical_crossentropy',
                optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                metrics=['accuracy'])
  return model

In [None]:
# read the raw article table from the CSV file
df = pd.read_csv("/content/data.csv")
# keep the cleaned rows in a separate frame so the raw frame stays available
df_clean = (
    df
    .dropna()           # discard rows that have any missing value
    .drop_duplicates()  # then discard rows that exactly repeat an earlier row
)

In [None]:
# model inputs for training & validation:
#   X - the article body after normalize_text() preprocessing
#   y - the article's topic label
raw_article_content = df_clean['article_content']
X = pd.Series(raw_article_content.map(normalize_text))
y = df_clean['article_topic']

In [None]:
# hold out 20% of the rows for validation; stratifying on the topic label keeps
# the topic distribution approximately equal in both splits
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# turn both splits into numeric arrays the network can learn from

# targets: one-hot encode the topic labels, learning the label set from the training split
target_vectorizer = LabelBinarizer().fit(y_train)
vectorized_y_train = target_vectorizer.transform(y_train)
vectorized_y_val = target_vectorizer.transform(y_val)

# features: learn the TF-IDF vocabulary from the training split only
# (the text was already lower-cased by normalize_text, hence lowercase=False)
feature_vectorizer = TfidfVectorizer(lowercase=False)
vectorizer = feature_vectorizer.fit(X_train)
# TfidfVectorizer returns a sparse matrix, which is not good for keras, hence toarray()
vectorized_X_train = vectorizer.transform(X_train).toarray()
vectorized_X_val = vectorizer.transform(X_val).toarray()

In [None]:
# build the network with one input per TF-IDF feature column
n_features = vectorized_X_train.shape[1]
model = build_model(n_features)

In [None]:
# stop training once validation loss has not improved for 5 consecutive epochs
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode="min",
    patience=5,
    verbose=1,
)

# train for at most 100 epochs, evaluating on the held-out split after each epoch
training_history = model.fit(
    vectorized_X_train,
    vectorized_y_train,
    validation_data=(vectorized_X_val, vectorized_y_val),
    epochs=100,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 24: early stopping


In [None]:
# sample Indonesian news article used to smoke-test the trained model
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept
# because the prediction cell that follows reads this variable by that name
input = '''
Menteri Koordinator Bidang Perekonomian, Airlangga Hartarto membeberkan strategi ASEAN untuk menjadi mesin pertumbuhan ekonomi global.
Airlangga menilai, kondisi ekonomi global saat ini sangat dinamis. Meski begitu, ASEAN berhasil pulih bahkan melampaui situasi pra-pandemi dengan total PDB USD 3,6 triliun di tahun 2022.
Di sisi lain, Airlangga memproyeksi perekonomian global ke depan mengindikasikan pelemahan dan ketidakpastian pertumbuhan. Hal tersebut memberikan tantangan terhadap pertumbuhan ekonomi kawasan.
Untuk itu, Airlangga bersama dengan Menteri Ekonomi se-ASEAN menyiapkan strategi jitu untuk menjadikan ASEAN sebagai mesin pertumbuhan ekonomi global
'''

In [None]:
# apply the same normalization and fitted TF-IDF vocabulary that were used for training
processed_input = normalize_text(input)
vectorized_input = vectorizer.transform([processed_input]).toarray()
# one row of class scores per document; take the highest-scoring class of the only row
class_scores = model.predict(vectorized_input)
predicted_label = np.argmax(class_scores, axis=-1)[0]

# map the class index back to its topic name, following the binarizer's class order
label_dict = {position: topic for position, topic in enumerate(target_vectorizer.classes_)}
label_dict[predicted_label]



'Ekonomi'

# model.py

In [None]:
class Model:
    """Topic classifier for Indonesian news articles (TF-IDF features + dense network).

    Typical use is ``Model().train()`` followed by ``predict(text)``. ``train``
    populates ``vectorizer``, ``label_dict`` and ``trained_model``; ``predict``
    needs all three to be set.
    """

    # Stop-word remover shared by all instances. It is created on first use and
    # stored on the class (not on ``self``) so it is built once instead of once
    # per normalized document.
    _stopword_remover = None

    def __init__(self):
        # fitted TfidfVectorizer; set by prepare_dataset()
        self.vectorizer = None
        # {class index: topic label}; set by prepare_dataset()
        self.label_dict = None
        # fitted Keras model; set by train()
        self.trained_model = None

    def normalize_text(self, text):
        """Normalize raw article text before TF-IDF vectorization.

        Steps, in order: lower-case, delete digit runs, replace punctuation
        with a space, collapse whitespace runs (including newlines) to a single
        space and trim, then remove Indonesian stop words with Sastrawi.

        Args:
            text: raw article text (str).

        Returns:
            The normalized text as a single-line string.
        """
        # case folding
        text = text.lower()
        # number removal
        text = re.sub(r'\d+', '', text)
        # punctuation removal (replaced by a space so neighbouring words stay separated)
        text = re.sub(r'[^\w\s]', ' ', text)
        # extra whitespace removal; \s already matches '\n'
        text = re.sub(r'\s+', ' ', text).strip()
        # stopword removal, reusing one lazily created remover for every document
        if Model._stopword_remover is None:
            Model._stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
        return Model._stopword_remover.remove(text)

    def prepare_dataset(self, data_path):
        """Load the CSV at ``data_path`` and build train/validation arrays.

        The CSV is read with pandas and must provide the columns
        ``article_content`` (text) and ``article_topic`` (label). Rows with
        missing values and duplicate rows are dropped before splitting.

        Side effects: sets ``self.vectorizer`` (TF-IDF fitted on the training
        split) and ``self.label_dict`` (class index -> topic label).

        Args:
            data_path: path of the CSV file to load.

        Returns:
            Tuple ``(X_train, X_val, y_train, y_val)``: dense TF-IDF matrices
            and binarized label arrays for an 80/20 stratified split.
        """
        # load the dataset into pandas dataframe
        df = pd.read_csv(data_path)
        # drop rows with missing and duplicate value
        df_clean = df.dropna().drop_duplicates()

        # the feature is the preprocessed form of the raw article content
        X = df_clean['article_content'].map(self.normalize_text)
        y = df_clean['article_topic']

        # stratified sampling keeps the topic distribution similar in both splits
        X_train, X_val, y_train, y_val = train_test_split(
            X, y, test_size=0.2, random_state=42, stratify=y)

        # fit the TF-IDF vocabulary on the training split only
        feature_vectorizer = TfidfVectorizer(lowercase=False)
        self.vectorizer = feature_vectorizer.fit(X_train)
        # tfidfvectorizer produces a sparse array and this is not good for keras, hence we use toarray()
        vectorized_X_train = self.vectorizer.transform(X_train).toarray()
        vectorized_X_val = self.vectorizer.transform(X_val).toarray()

        # binarize the labels; classes_ gives the column order used by predict()
        target_vectorizer = LabelBinarizer()
        target_vectorizer.fit(y_train)
        self.label_dict = dict(enumerate(target_vectorizer.classes_))
        vectorized_y_train = target_vectorizer.transform(y_train)
        vectorized_y_val = target_vectorizer.transform(y_val)

        return vectorized_X_train, vectorized_X_val, vectorized_y_train, vectorized_y_val

    def build_model(self, input_dim, num_classes=29):
        """Build and compile the dense classification network.

        Architecture: input -> Dense(32, relu) -> Dense(32, relu) -> Dropout(0.5)
        -> Dense(num_classes, softmax), compiled with categorical cross-entropy,
        Adam (learning rate 0.001) and accuracy.

        Args:
            input_dim: number of input features (width of the TF-IDF matrix).
            num_classes: number of output classes. Defaults to 29, the value
                that was previously hard-coded.

        Returns:
            A compiled tf.keras.Sequential model.
        """
        model = tf.keras.Sequential([
            # explicit Input layer instead of the deprecated `input_dim=` layer argument
            tf.keras.Input(shape=(input_dim,)),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dense(32, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(num_classes, activation='softmax'),
        ])
        model.compile(loss='categorical_crossentropy',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      metrics=['accuracy'])
        return model

    def train(self, data_path="/content/data.csv"):
        """Prepare the dataset and fit the network, storing it in ``self.trained_model``.

        Training runs silently for at most 100 epochs with batch size 32 and
        stops early once validation loss has not improved for 5 epochs.

        Args:
            data_path: CSV file to train on. Defaults to the path that was
                previously hard-coded, so ``train()`` behaves as before.
        """
        X_train, X_val, y_train, y_val = self.prepare_dataset(data_path)
        # size the output layer from the encoded labels instead of assuming 29 topics
        self.trained_model = self.build_model(X_train.shape[1], num_classes=y_train.shape[1])
        early_stopping = tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', mode="min", patience=5, verbose=0)
        self.trained_model.fit(X_train, y_train,
                               epochs=100,
                               verbose=0,
                               validation_data=(X_val, y_val),
                               batch_size=32,
                               callbacks=[early_stopping])

    def predict(self, input):
        """Return the predicted topic label for one raw article text.

        Requires ``train()`` to have been called (or the attributes to have
        been restored, e.g. by unpickling a trained instance).

        Args:
            input: raw article text (str).

        Returns:
            The topic label whose class has the highest predicted score.
        """
        processed_input = self.vectorizer.transform([self.normalize_text(input)]).toarray()
        predicted_label = np.argmax(self.trained_model.predict(processed_input, verbose=0), axis=-1)[0]
        return self.label_dict[predicted_label]

    def save(self):
        """
        Save trained model to model.pickle file.
        """
        # NOTE(review): `ds` is not imported or defined anywhere in this notebook, so
        # calling save() here raises NameError; presumably the project template that
        # hosts model.py provides it - confirm before relying on this method.
        ds.model.save(self, "model.pickle")


# Script entry point: builds an untrained Model only; the training and saving
# steps are left commented out below.
# NOTE(review): when this cell runs inside the notebook, `model` is rebound from
# the Keras network created earlier to this Model instance - confirm no later
# cell relies on the previous value.
if __name__ == '__main__':
    # NOTE: edit this call if you add more initialization parameters
    model = Model()

    # # Train your model
    # model.train()

    # # Save your trained model to model.pickle
    # model.save()

In [None]:
# fresh, untrained classifier; train() must run before predict() can be used
new_model = Model()

In [None]:
# loads the dataset, fits the TF-IDF vectorizer and trains the network (silent: verbose=0)
new_model.train()

In [None]:
# sample article 1: economy news, same text as the `input` sample used earlier in the notebook
temp_1 = '''
Menteri Koordinator Bidang Perekonomian, Airlangga Hartarto membeberkan strategi ASEAN untuk menjadi mesin pertumbuhan ekonomi global.
Airlangga menilai, kondisi ekonomi global saat ini sangat dinamis. Meski begitu, ASEAN berhasil pulih bahkan melampaui situasi pra-pandemi dengan total PDB USD 3,6 triliun di tahun 2022.
Di sisi lain, Airlangga memproyeksi perekonomian global ke depan mengindikasikan pelemahan dan ketidakpastian pertumbuhan. Hal tersebut memberikan tantangan terhadap pertumbuhan ekonomi kawasan.
Untuk itu, Airlangga bersama dengan Menteri Ekonomi se-ASEAN menyiapkan strategi jitu untuk menjadikan ASEAN sebagai mesin pertumbuhan ekonomi global
'''

In [None]:
# classify sample 1; the cell result is the predicted topic label
new_model.predict(temp_1)

'Ekonomi'

In [None]:
# sample article 2: technology piece explaining what a phone's IMEI number is
temp_2 = '''
IMEI, atau International Mobile Equipment Identity, adalah nomor unik yang diberikan kepada setiap perangkat ponsel. Nomor ini memberikan identitas yang unik bagi setiap ponsel yang ada di dunia. Pastinya, ada fungsi IMEI yang perlu diketahui oleh para pengguna.
Menurut buku Tips Ampuh Android, Tri Amperianto (2014:158), pada umumnya, IMEI berjumlah 15 digit atau lebih. Setiap perangkat akan mempunyai nomor IMEI tidak sama.
Pengguna dapat memeriksa IMEI pada bagian belakang perangkat, atau biasanya tertempel pada sticker yang berada di bagian belakang boks tersebut.
'''

In [None]:
# classify sample 2; the cell result is the predicted topic label
new_model.predict(temp_2)

'Teknologi'

In [None]:
# sample article 3: football match report (Manchester City vs Fulham)
temp_3 = '''
Manchester City memetik kemenangan saat bertemu Fulham di laga keempat Liga Inggris. Bermain di Etihad Stadium, Sabtu (2/9) malam WIB, Man City menang dengan skor 4-1.
Kemenangan Man City dinodai dengan gol yang dibuat Nathan Ake di injury time babak pertama. Sundulan Ake mengarah ke Manuel Akanji yang berada dalam posisi offside. Namun, Akanji meloloskan bola tanpa menyentuhnya hingga bola masuk ke gawang.
Striker Man City, Erling Haaland, mengatakan wasit harusnya menganulir gol tersebut. Haaland bahkan amat merasakan kekesalan yang dialami pemain Fulham.
'''

In [None]:
# classify sample 3; the cell result is the predicted topic label
new_model.predict(temp_3)

'Sepak Bola'

## Test the model

In [None]:
import pickle

In [None]:
# file the trained Model instance is pickled to (relative to the working directory)
filename_to_be_saved = 'model.pickle'

In [None]:
# serialize the whole trained Model instance; the context manager closes (and
# therefore flushes) the file even if pickling raises
with open(filename_to_be_saved, 'wb') as model_file:
    pickle.dump(new_model, model_file)

In [None]:
# reload from the same path that was written above and close the file afterwards
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself
with open(filename_to_be_saved, 'rb') as model_file:
    pickled_model = pickle.load(model_file)

In [None]:
# round-trip check: the reloaded model should predict the same topic as new_model did for sample 3
pickled_model.predict(temp_3)

'Sepak Bola'