# LSTM for Text Classification

Di notebook ini, kita akan berkenalan dengan salah satu arsitektur *neural networks* yang sering digunakan untuk data sekuensial, yaitu **Long Short-Term Memory (LSTM)**.

## Agenda

Agenda kita hari ini:
* LSTM dengan PyTorch
* Bagaimana cara kerja LSTM

In [None]:
import re
from collections import Counter
from pathlib import Path
from string import punctuation

import numpy as np
import pandas as pd
import torch

# Configure how wide a column pandas renders when displaying DataFrames.
# NOTE(review): presumably meant to show the long `full_text` articles without
# truncation; pandas documents `None` as the "unlimited" value, so confirm that
# 0 behaves as intended on the pandas version in use.
pd.set_option("display.max_colwidth", 0)

## Datasets

In [None]:
# Relative location of the news-article dataset and of its CSV file.
DATA_DIR = Path("data/news-article")
DATA_FILEPATH = DATA_DIR.joinpath("news.csv")

In [None]:
# Load the raw articles, then drop every row labelled "international_film_tv".
df_news = pd.read_csv(DATA_FILEPATH)
_is_film_tv = df_news["class"] == "international_film_tv"
df_news = df_news[~_is_film_tv]

In [None]:
# Peek at five randomly chosen rows of the filtered dataset.
df_news.sample(n=5)

## Data Preprocessing

In [None]:
def lowerize(df):
    """Return a copy of ``df`` with the ``full_text`` column lower-cased.

    The input frame is not modified. Writing the column back into the
    caller's frame would mutate shared state and, when that frame is a
    filtered slice of another one, makes pandas emit
    ``SettingWithCopyWarning``.

    Args:
        df: DataFrame holding a string column named ``full_text``.

    Returns:
        A new DataFrame with the same columns and index; only ``full_text``
        differs from the input.
    """
    return df.assign(full_text=df["full_text"].str.lower())


def remove_punctuation(df):
    """Return a copy of ``df`` with punctuation stripped from ``full_text``.

    Every character listed in ``string.punctuation`` (ASCII punctuation) is
    deleted; all other characters, including whitespace, are kept as they
    are. The input frame is not modified.

    Args:
        df: DataFrame holding a string column named ``full_text``.

    Returns:
        A new DataFrame with the same columns and index; only ``full_text``
        differs from the input.
    """
    # Build the deletion table once; ``str.translate`` then strips each
    # article in a single pass instead of testing every character in Python.
    strip_table = str.maketrans("", "", punctuation)
    return df.assign(
        full_text=df["full_text"].apply(
            lambda excerpt: excerpt.translate(strip_table)
        )
    )


def remove_digits(df):
    """Return a copy of ``df`` with standalone numbers removed from ``full_text``.

    Only runs of digits delimited by word boundaries are deleted, so a digit
    embedded in a word (e.g. ``"a1"``) is kept. The whitespace around a
    removed number is left in place. The input frame is not modified.

    Args:
        df: DataFrame holding a string column named ``full_text``.

    Returns:
        A new DataFrame with the same columns and index; only ``full_text``
        differs from the input.
    """
    # Compile once so the pattern lookup is not repeated for every article.
    standalone_number = re.compile(r"\b\d+\b")
    return df.assign(
        full_text=df["full_text"].apply(
            lambda excerpt: standalone_number.sub("", excerpt)
        )
    )


# Apply the cleaning steps in order: lower-case, strip punctuation, drop numbers.
df_news = remove_digits(remove_punctuation(lowerize(df_news)))

In [None]:
# Inspect five random rows after preprocessing.
df_news.sample(n=5)

In [None]:
# create dict vocab
count = Counter(" ".join(df_news["full_text"].tolist()).split())
vocab = sorted(count, key=count.get, reverse=False)
vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}
int_to_vocab = {i: word for word, i in vocab_to_int.items()}
print("Number of vocab:", len(vocab))

# tokenize
news_tokens = []
for news in df_news["full_text"]:
    news_tokens.append([vocab_to_int[word] for word in news.split()])

In [None]:
# Show the cleaned text of the article at position 1. Use a positional lookup:
# rows were filtered out of df_news without resetting the index, so the label 1
# is not guaranteed to exist or to line up with news_tokens[1], which is positional.
df_news["full_text"].iloc[1]

In [None]:
# Print the integer-encoded form of the article at position 1 in news_tokens.
print(news_tokens[1])

In [None]:
# Map the ids of the article at position 1 back to words to check the encoding.
" ".join(int_to_vocab[token_id] for token_id in news_tokens[1])