In [None]:
import os
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import Dataset
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## Set Seed

In [None]:
# One seed for the whole notebook; it is also passed as random_state to the
# train/test split further down.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)  # seeds NumPy's global random state
torch.manual_seed(RANDOM_SEED)  # seeds PyTorch's random number generator

## Read Data

In [None]:
# Load the sentence-level dataset from S3; the first CSV column becomes the index.
# Replace {YOUR_BUCKET} with the bucket that holds the file.
#
# The AWS credentials are read from the environment rather than written into the
# notebook, so they cannot leak through version control or a shared copy of this
# file. Export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the
# kernel; a missing variable fails immediately with a KeyError.
df = pd.read_csv(
    "s3://{YOUR_BUCKET}/sentence_level_data.csv",
    index_col=[0],
    storage_options={
        "key": os.environ["AWS_ACCESS_KEY_ID"],
        "secret": os.environ["AWS_SECRET_ACCESS_KEY"],
    },
)

### Prep Data

In [None]:
def clean_text(s: str) -> str:
    """Normalise a sentence before vectorisation.

    The text is lower-cased and every character listed in
    ``string.punctuation`` is deleted (not replaced by a space).

    :param s: (str) raw sentence
    :return: (str) lower-cased sentence without punctuation
    """
    # Translation table that maps each punctuation character to "delete".
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = s.lower()
    return lowered.translate(strip_punctuation)

# Clean every raw sentence. The column name (spelling included) is read again by
# the lemmatisation step, so it is kept as-is.
df["cleaned_setence"] = df["sentence"].apply(clean_text)

In [None]:
# Fetch the "wordnet" resource through NLTK's downloader before creating the
# WordNet-based lemmatizer.
nltk.download("wordnet")

# One lemmatizer instance, reused for every sentence in the lemmatisation step.
lemmer = WordNetLemmatizer()

In [None]:
def lemmatize_text(s: str, lemmer: WordNetLemmatizer) -> str:
    """Lemmatize a sentence word by word.

    The sentence is split on whitespace, each token is passed to
    ``lemmer.lemmatize`` and the results are re-joined with single spaces.

    :param s: (str) sentence to lemmatize
    :param lemmer: (WordNetLemmatizer) lemmatizer applied to every token
    :return: (str) space-separated lemmatized tokens
    """
    tokens = s.split()
    return " ".join(map(lemmer.lemmatize, tokens))

In [None]:
# Lemmatize each cleaned sentence; the shared lemmatizer is forwarded to
# lemmatize_text as its second positional argument.
df["lemmatized_text"] = df["cleaned_setence"].apply(lemmatize_text, args=(lemmer,))

In [None]:
# TF-IDF vectorizer constructed with no arguments, i.e. library defaults.
tfidf = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the lemmatized sentences and transform them in one call.
# NOTE(review): this is fitted on every row before the train/test split further
# down, so the learned vocabulary/weights also reflect the validation sentences.
x_tfidf = tfidf.fit_transform(df["lemmatized_text"])

In [None]:
class ChatGPTDataset(Dataset):
    """Map-style dataset pairing TF-IDF feature rows with their class labels."""

    def __init__(self, x_tfidf: np.ndarray, y: list) -> None:
        """Store the features and labels.

        :param x_tfidf: feature matrix; row ``i`` holds the features of sample ``i``
        :param y: labels; ``y[i]`` is the class of sample ``i``
        :raises ValueError: if the number of rows and the number of labels differ
        """
        # Fail at construction time rather than with an IndexError in the
        # middle of a training epoch.
        if len(x_tfidf) != len(y):
            raise ValueError(
                f"x_tfidf has {len(x_tfidf)} rows but y has {len(y)} labels"
            )
        self.x_tfidf = x_tfidf
        self.y = y

    def __len__(self) -> int:
        """Return the number of samples (rows of the feature matrix)."""
        return len(self.x_tfidf)

    def __getitem__(self, index: int) -> tuple:
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_tfidf[index], self.y[index]

In [None]:
# Wrap the densified TF-IDF matrix and the "class" column in a single dataset.
labels = df["class"].tolist()
chatgpt_dataset = ChatGPTDataset(x_tfidf.toarray(), y=labels)

# Split sample positions (not the data itself) 80/20; the samplers built from
# these lists select rows of the one shared dataset.
all_indices = list(range(len(chatgpt_dataset)))
train_indices, test_indices = train_test_split(
    all_indices,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [None]:
# Each sampler yields only the indices of its own split, in random order, so the
# two loaders can share one dataset object.
train_sampler = SubsetRandomSampler(train_indices)
test_sampler = SubsetRandomSampler(test_indices)

In [None]:
batch_size = 32

# Both loaders read from the same dataset and differ only in the sampler that
# decides which rows they may draw (train first, then validation).
train_loader, validation_loader = (
    torch.utils.data.DataLoader(
        chatgpt_dataset,
        batch_size=batch_size,
        sampler=split_sampler,
    )
    for split_sampler in (train_sampler, test_sampler)
)

### Model

In [None]:
class DenseNetwork(nn.Module):
    """Feed-forward classifier.

    Three fully connected ReLU layers (1024, 256, 128 units), each followed by
    dropout (0.8, 0.6, 0.4), and a 2-way output layer that returns
    log-probabilities.
    """

    def __init__(self, input_dim=None):
        """Build the layers.

        :param input_dim: number of input features. When omitted, the width of
            the notebook-level ``chatgpt_dataset.x_tfidf`` matrix is used, so
            ``DenseNetwork()`` keeps working as before.
        """
        super().__init__()
        if input_dim is None:
            # Backward-compatible default: size the first layer from the dataset.
            input_dim = chatgpt_dataset.x_tfidf.shape[1]
        self.fc1 = nn.Linear(input_dim, 1024)
        self.drop1 = nn.Dropout(0.8)
        self.fc2 = nn.Linear(1024, 256)
        self.drop2 = nn.Dropout(0.6)
        self.fc3 = nn.Linear(256, 128)
        self.drop3 = nn.Dropout(0.4)
        self.prediction = nn.Linear(128, 2)

    def forward(self, x):
        """Return per-class log-probabilities.

        :param x: feature tensor whose last dimension equals the input width;
            it is cast to ``torch.float`` before the first layer
        :return: tensor with the same leading dimensions as ``x`` and a last
            dimension of 2, holding log-probabilities
        """
        x = F.relu(self.fc1(x.to(torch.float)))
        x = self.drop1(x)
        x = F.relu(self.fc2(x))
        x = self.drop2(x)
        x = F.relu(self.fc3(x))
        x = self.drop3(x)
        # No squeeze(): it turned a batch holding a single sample from (1, 2)
        # into (2,), which breaks torch.max(..., dim=1) and the loss in the
        # training/validation loops. The softmax axis is given explicitly
        # because relying on the implicit choice is deprecated.
        return F.log_softmax(self.prediction(x), dim=-1)

In [None]:
model = DenseNetwork()
# DenseNetwork.forward already ends in F.log_softmax, i.e. it returns
# log-probabilities. NLLLoss is the criterion defined on log-probabilities;
# CrossEntropyLoss expects raw logits and would apply log-softmax a second time.
criterion = nn.NLLLoss()
# Adam with its default hyper-parameters over all model parameters.
optimizer = optim.Adam(model.parameters())

In [None]:
epochs = 7
losses = []      # per-epoch loss, summed over batches
accuracies = []  # per-epoch training accuracy in percent

for epoch in range(1, epochs + 1):
    # Put the model in training mode explicitly so dropout is active even when
    # this cell is re-run after the model was switched to evaluation mode.
    model.train()

    epoch_loss = 0.0
    epoch_true = 0
    epoch_total = 0
    for data, target in train_loader:
        optimizer.zero_grad()
        outputs = model(data)

        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

        # Predicted class = index of the largest log-probability per row.
        pred = outputs.argmax(dim=1)
        epoch_true += (pred == target).sum().item()
        epoch_total += target.size(0)

    epoch_accuracy = 100 * (epoch_true / epoch_total)
    losses.append(epoch_loss)
    accuracies.append(epoch_accuracy)

    print(f"Epoch {epoch}/{epochs} finished: train_loss = {epoch_loss}, train_accuracy = {epoch_accuracy}")

In [None]:
test_true = 0
test_total = len(test_sampler)  # number of validation samples
test_loss = 0.0                 # summed over batches

# Evaluate with dropout disabled: in training mode the Dropout layers randomly
# zero activations, which makes the reported accuracy/loss noisy and pessimistic.
# The previous mode is restored afterwards so this cell leaves the model as it
# found it.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        for data, target in validation_loader:
            outputs = model(data)

            test_loss += criterion(outputs, target).item()

            # Predicted class = index of the largest output per row.
            pred = outputs.argmax(dim=1)
            test_true += (pred == target).sum().item()
finally:
    model.train(was_training)

print(f"Validation finished: Accuracy = {round(100 * (test_true / test_total), 2)}%, Loss = {test_loss}")