In [20]:
import pandas as pd
import numpy as np
from src.text_preprocessing import generate_preprocessor
import src.utils as utils
from src.model import train_model, get_best_model, get_best_threshold


# Read Data
def read_data(return_file=True):
    """Combine the stored dataset with the unseen rows of the new Excel file.

    Both frames are de-duplicated on the configured text column (first
    occurrence wins). Rows of the new file whose text already appears in the
    stored dataset are dropped, and the remainder is appended to the stored
    rows with a fresh index. Shapes are printed along the way.

    Args:
        return_file: When truthy, return the result; otherwise return None.

    Returns:
        dict with "new_data" (the unseen new rows) and "data" (stored rows
        followed by the unseen rows), or None when ``return_file`` is falsy.
    """
    stored = utils.load_json(CONFIG_DATA["data_set_path"])
    text_col = CONFIG_DATA["text_column"]
    stored = stored.drop_duplicates(subset=text_col, keep="first")
    print("Existing data inputted, data shape  :", stored.shape)

    incoming = pd.read_excel(CONFIG_DATA["raw_new_dataset_path"])
    print("new data inputted, data shape  :", incoming.shape)

    # Drop repeats inside the new file, then anything the stored set already has.
    incoming = incoming.drop_duplicates(subset=text_col, keep="first")
    already_stored = incoming[text_col].isin(stored[text_col])
    unseen = incoming[~already_stored]
    combined = pd.concat([stored, unseen], axis=0, ignore_index=True)

    print("ready read file, data shape   :", unseen.shape)

    if not return_file:
        return None
    return {"new_data": unseen, "data": combined}

In [31]:
import os
from fastapi import FastAPI, HTTPException
from fastapi.responses import HTMLResponse
from sklearn.feature_extraction.text import TfidfVectorizer
from fastapi.middleware.cors import CORSMiddleware
import uvicorn
import src.utils as utils
import pandas as pd
import re
import nltk
import numpy as np

# Fetch the NLTK resources this notebook relies on: the stopword corpus
# (read through `stopwords.words`) and the "punkt" tokenizer data
# (presumably what `word_tokenize` needs — confirm for the installed NLTK version).
nltk.download("stopwords")
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter
from sklearn.preprocessing import LabelEncoder

# Project configuration loaded through the project helper; later code reads
# column names and artifact paths from it.
CONFIG_DATA = utils.config_load()

# FastAPI application object.
app = FastAPI()

# Add CORS middleware
# NOTE(review): "*" for origins, methods and headers accepts cross-origin
# requests from any site — confirm this is intended outside local development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)


# Load your pre-trained ML model
class Model:
    """Inference pipeline: clean text, tokenize/stem it, and score it.

    The instance holds no state. Column names and artifact paths come from
    ``CONFIG_DATA``; the TF-IDF vectorizer, classifier and label encoder are
    loaded from their pickle paths on every ``predict`` call.
    """

    # Character class matching every ASCII punctuation character.
    _PUNCTUATION_RE = re.compile(r"[{}]".format(re.escape(string.punctuation)))

    # Informal / noise tokens filtered out in addition to NLTK's Indonesian list.
    _EXTRA_STOPWORDS = frozenset(
        {
            "yg",
            "dg",
            "rt",
            "dgn",
            "ny",
            "d",
            "klo",
            "kalo",
            "amp",
            "biar",
            "bikin",
            "bilang",
            "gak",
            "ga",
            "krn",
            "nya",
            "nih",
            "sih",
            "si",
            "tau",
            "tdk",
            "tuh",
            "utk",
            "ya",
            "jd",
            "jgn",
            "sdh",
            "aja",
            "n",
            "t",
            "nyg",
            "hehe",
            "pen",
            "u",
            "nan",
            "loh",
            "&amp",
            "yah",
        }
    )

    @staticmethod
    def _strip_noise(text):
        """Apply the character-level clean-up passes to one text value.

        Each pass works on the result of the previous one. Previously every
        pass restarted from the raw input, so only the digit removal survived.
        """
        cleaned = re.sub(r"[\n\r\t]", " ", str(text))  # newline / CR / tab -> space
        cleaned = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned)  # keep alphanumerics + whitespace
        cleaned = re.sub(r"\b[a-zA-Z]\b", "", cleaned)  # drop stand-alone single letters
        cleaned = re.sub(r"\d+", "", cleaned)  # drop digit runs
        return cleaned

    def clean_data(self, data, CONFIG_DATA=CONFIG_DATA, return_file=True):
        """Normalise the text column of ``data`` in place.

        Steps, in order: lower-case, drop non-ASCII characters, strip the
        ends, remove ASCII punctuation, run the character-level passes of
        ``_strip_noise``, remove literal URL schemes, collapse whitespace runs.

        Args:
            data: DataFrame holding the column ``CONFIG_DATA["text_column"]``.
                The column is overwritten on this same object.
            CONFIG_DATA: Configuration mapping; only "text_column" is read here.
            return_file: When truthy, return ``data``; otherwise return None
                (the in-place update is then the only effect).

        Returns:
            The same DataFrame object, or None when ``return_file`` is falsy.
        """
        text_col = CONFIG_DATA["text_column"]

        # Lower-case, drop non-ASCII characters, trim both ends.
        text = data[text_col].str.lower()
        text = text.str.encode("ascii", "ignore").str.decode("ascii")
        text = text.str.strip()

        # Punctuation removal; str() turns missing values into the text "nan".
        text = text.apply(lambda value: Model._PUNCTUATION_RE.sub("", str(value)))

        # Newlines, leftover symbols, single letters and numbers.
        # NOTE(review): these passes are now chained. If the training pipeline
        # (src.text_preprocessing) still behaves like the previous code here,
        # confirm that training and inference features stay aligned.
        text = text.apply(Model._strip_noise)

        # Remove incomplete URL. `.str.replace` performs a substring
        # replacement; `Series.replace` without regex only swaps cells that
        # are exactly equal to the pattern.
        # NOTE(review): ":" and "/" are already removed by the punctuation
        # step above, so these literals cannot occur at this point. Moving the
        # step earlier would change the produced tokens — decide together
        # with the training pipeline.
        text = text.str.replace("http://", " ", regex=False)
        text = text.str.replace("https://", " ", regex=False)

        # Collapse every whitespace run into one space.
        text = text.replace(r"\s+", " ", regex=True)

        data[text_col] = text

        if return_file:
            return data

    # Generate Preprocessor
    def generate_preprocessor(self, data, CONFIG_DATA=CONFIG_DATA, return_file=True):
        """Clean ``data`` and add a column of stemmed, stopword-free tokens.

        Args:
            data: DataFrame holding ``CONFIG_DATA["text_column"]``. It is
                modified in place: the text column is cleaned and
                ``CONFIG_DATA["token_column"]`` is written.
            CONFIG_DATA: Configuration mapping; "text_column" and
                "token_column" are read.
            return_file: When truthy, return the frame; otherwise return None.

        Returns:
            The processed DataFrame, or None when ``return_file`` is falsy.
        """
        data = self.clean_data(data, CONFIG_DATA)

        text_col = CONFIG_DATA["text_column"]
        token_col = CONFIG_DATA["token_column"]

        # Tokenization
        data[token_col] = data[text_col].apply(word_tokenize)

        # Stopwords removal (filtering)
        stopword_set = set(stopwords.words("indonesian")).union(
            Model._EXTRA_STOPWORDS
        )
        data[token_col] = data[token_col].apply(
            lambda words: [word for word in words if word not in stopword_set]
        )

        # Stemming: stem each distinct term once, then map every document
        # through the lookup table.
        stemmer = StemmerFactory().create_stemmer()
        distinct_terms = {term for document in data[token_col] for term in document}
        stem_lookup = {term: stemmer.stem(term) for term in distinct_terms}

        data[token_col] = data[token_col].swifter.apply(
            lambda document: [stem_lookup[term] for term in document]
        )

        if return_file:
            return data

    def predict(self, X):
        """Return per-class probabilities for every row of ``X``.

        Args:
            X: DataFrame holding the configured text column. It is modified
                in place by the preprocessing step.

        Returns:
            One list per input row; each list holds one
            ``{"label": ..., "probability": ...}`` dict per class, using
            native Python values so the result can be JSON-encoded.
        """
        # Preprocess, then join the tokens back into one string per document.
        processed = self.generate_preprocessor(X)
        documents = [" ".join(tokens) for tokens in processed[CONFIG_DATA["token_column"]]]

        # NOTE: the artifacts are unpickled on every call; pickle files must
        # come from a trusted location because loading them can run code.
        tfidf_vectorizer = utils.pickle_load(CONFIG_DATA["tfidf_vectorizer_path"])
        features = tfidf_vectorizer.transform(documents)

        model = utils.pickle_load(CONFIG_DATA["best_model_path"])
        y_pred = model.predict_proba(features)

        label_encoder = utils.pickle_load(CONFIG_DATA["label_encoder_path"])

        # Map encoded class indices 0..k-1 back to the original labels.
        # assumes the probability columns follow that same order — TODO confirm
        class_labels = label_encoder.inverse_transform(
            np.arange(len(label_encoder.classes_))
        ).tolist()

        # float() converts numpy scalars (e.g. float32), which the standard
        # JSON encoder rejects, into plain Python floats.
        return [
            [
                {"label": label, "probability": float(prob)}
                for label, prob in zip(class_labels, row)
            ]
            for row in y_pred
        ]


# Shared pipeline instance; `predict_text` below calls `model.predict` on it.
model = Model()


def predict_text(data):
    """Classify one piece of text and return its class probabilities.

    Args:
        data: The raw text to classify.

    Returns:
        ``{"class_probabilities": predictions}`` where ``predictions`` is the
        value returned by ``model.predict`` for a one-row frame.

    Raises:
        HTTPException: status 400 when ``data`` is empty, or when
            preprocessing / prediction fails (the detail carries the
            underlying error message).
    """
    # Validate outside the try block so this error reaches the caller as-is
    # instead of being caught below and re-wrapped with a mangled detail.
    if not data:
        raise HTTPException(
            status_code=400, detail="Missing 'text' field in request payload"
        )

    try:
        # Build the one-row frame under the configured text column, the same
        # key the pipeline reads, rather than a hard-coded column name.
        input_data = pd.DataFrame({CONFIG_DATA["text_column"]: [data]})
        predictions = model.predict(input_data)  # Use the model to make predictions
    except HTTPException:
        # Already an HTTP error: pass it through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    # Return predictions for each class
    return {"class_probabilities": predictions}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\taqiy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\taqiy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [34]:
# Sample Indonesian news paragraph used to smoke-test the prediction pipeline.
text = "Warga yang tinggal di jalan Taucit, Kelurahan Labuhan Deli, Kecamatan Medan Marelan kini tidak lagi mengeluhkan kondisi akses jalan yang rusak, sebab Pemko Medan melalui Dinas SDABMBK telah selesai memperbaiki jalan dengan pembetonan di jalan tersebut, Kamis (7/9/2023) Dengan selesainya perbaikan jalan ini tentu pengguna jalan terutama masyarakat yang tinggal di jalan tersebut juga akan lebih merasakan kenyamanan. Karena selama ini kondisi jalan berbatu dan berlubang, sehingga jika hujan turun jalan menjadi becek dan berlumpur yang sedikit menyulitkan pengendara kendaraan. Perbaikan dengan pembetonan di jalan Taucit ini dilakukan Dinas SDABMBK dengan panjang 307 meter dengan lebar 5- 6 meter dan ketebalan beton 25 cm serta Beton FC 30 Mpa. Pembetonan jalan ini dilakukan selain adanya permintaan dari warga yang mengeluhkan kondisi jalan, juga sebagai upaya dalam mewujudkan program prioritas Wali Kota Medan Bobby Nasution yakni Medan Tanpa Lubang. Kepala Dinas SDABMBK Kota Medan Topan Obaja Putra Ginting menjelaskan pihaknya setiap harinya terus menggenjot memperbaiki sejumlah ruas jalan. Seperti yang telah dilakukan di jalan Taucit yang kondisi jalan sebelumnya tidak nyaman dilalui oleh pengendara, kini jalan sudah dibeton. Dulunya jalan Taucit ini berlubang dan bebatuan, kini sebagai upaya mewujudkan program prioritas pak Bobby Nasution, kami lakukan perbaikan dengan pembetonan agar lebih kuat dan tidak menggunakan kenyamanan masyarakat, jelasnya."

# Run the sample through `predict_text` and display the returned dict.
ak = predict_text(text)
ak

Pandas Apply:   0%|          | 0/1 [00:00<?, ?it/s]

{'class_probabilities': [[{'label': 'ekonomi', 'probability': 0.18364586},
   {'label': 'hukum', 'probability': 0.14363588},
   {'label': 'ideologi', 'probability': 0.06176976},
   {'label': 'pertahanan militer', 'probability': 0.1306013},
   {'label': 'politik', 'probability': 0.29653746},
   {'label': 'sosial budaya', 'probability': 0.18380976}]]}

In [36]:
import pandas as pd
import numpy as np
from src.text_preprocessing import generate_preprocessor
import src.utils as utils
from src.model import train_model, get_best_model, get_best_threshold

# Reload the project configuration for this cell.
CONFIG_DATA = utils.config_load()

# Load the dataset stored at the configured 'data_clean_path' and show how
# many rows fall into each value of the 'category' column.
sasa = utils.load_json(CONFIG_DATA['data_clean_path'])
sasa['category'].value_counts()

category
politik               2533
ekonomi               1946
ideologi              1640
pertahanan militer    1616
hukum                 1610
sosial budaya         1469
Name: count, dtype: int64

In [23]:
    # One-row frame with the model-record columns. Every value except the
    # timestamp is a literal placeholder (the strings "model_name",
    # "model_params", "best_threshold" and the number 2), not a real result.
    model_record = pd.DataFrame(
        {
            "timestamp": [utils.time_stamp()],
            "model_name": ["model_name"],
            "model_params": ["model_params"],
            "threshold": ["best_threshold"],
            "metric_score": [2],
        }
    )

In [24]:
# Display the placeholder record frame.
model_record

Unnamed: 0,timestamp,model_name,model_params,threshold,metric_score
0,2023-08-27 18:40:34.786738,model_name,model_params,best_threshold,2


In [22]:
# Load the stored model-record table from the configured 'model_record' path
# and display it.
dada = utils.load_json(CONFIG_DATA['model_record'])
dada

Unnamed: 0,timestamp,model_name,model_params,threshold,metric_score
0,2023-08-27 05:30:00,RandomForestClassifier,"{'criterion': 'gini', 'n_estimators': 100, 'ra...",0.1,0.2


In [27]:
# Append the placeholder record to the loaded table and renumber the index.
# NOTE(review): this rebinds `dada` to its own extended copy, so re-running
# the cell appends the same record again; reload `dada` first for a clean result.
dada = pd.concat([dada, model_record], axis=0, ignore_index=True)
dada

Unnamed: 0,timestamp,model_name,model_params,threshold,metric_score
0,2023-08-27 05:30:00.000000,RandomForestClassifier,"{'criterion': 'gini', 'n_estimators': 100, 'ra...",0.1,0.2
1,2023-08-27 18:40:34.786738,model_name,model_params,best_threshold,2.0


In [75]:
import pandas as pd

# Read Data
def read_data(return_file=False):
    """Append unseen rows of the new Excel file to the stored dataset and save both.

    Rows of the new file are first de-duplicated on the configured text
    column (first occurrence wins), so a text repeated inside the new file is
    appended and persisted only once. Rows whose text already exists in the
    stored dataset are then dropped. The merged dataset is written back to
    'data_set_path' and the unseen rows to 'new_data_set_path'.

    Args:
        return_file: When truthy, return the new file's frame as read.

    Returns:
        The frame read from 'raw_new_dataset_path' (unfiltered), or None when
        ``return_file`` is falsy.
    """
    text_col = CONFIG_DATA['text_column']

    existing_data = utils.load_json(CONFIG_DATA['data_set_path'])
    new_data = pd.read_excel(CONFIG_DATA['raw_new_dataset_path'])

    # Drop repeats inside the new file before comparing with the stored rows;
    # otherwise duplicated texts would be written into the dataset.
    new_unique = new_data.drop_duplicates(subset=text_col, keep='first')
    new_data_exc = new_unique[~new_unique[text_col].isin(existing_data[text_col])]
    data = pd.concat([existing_data, new_data_exc], axis=0, ignore_index=True)

    # Print data
    print('ready read file, data shape   :', new_data_exc.shape)

    # Dump data
    utils.dump_json(data, CONFIG_DATA['data_set_path'])
    utils.dump_json(new_data_exc, CONFIG_DATA['new_data_set_path'])

    # Return data
    # NOTE(review): this returns the raw frame as read, not the filtered
    # `new_data_exc` that was saved — confirm which one callers expect.
    if return_file:
        return new_data