In [None]:
#LSTM-based spam/ham text classifier using PyTorch

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder



#load data
# Read the CSV, discard the three 'Unnamed' columns, and rename the two
# remaining columns to 'label' and 'text'.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'text'})
)
# Encode the class label numerically: ham -> 0, spam -> 1.
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# Notebook preview of the first rows.
df.head()

#data preprocessing
def clean_text(text):
    """Normalize a raw string for vectorization.

    The text is lowercased, every character that is not an ASCII letter,
    a digit, or whitespace is replaced by a space, surrounding whitespace
    is trimmed, and each remaining run of whitespace is collapsed into a
    single space.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    lowered = text.lower()
    # Punctuation and symbols become spaces so adjacent words stay separated.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', lowered)
    trimmed = alnum_only.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Normalize every message in place with clean_text.
df['text'] = df['text'].apply(clean_text)
df.head()

#split data
# 80/20 train/test split with a fixed seed so the split is repeatable.
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#vectorize data
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that same fitted vocabulary.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

#convert to torch tensors
# Densify the sparse TF-IDF matrices and cast features and labels to float32.
X_train = torch.from_numpy(X_train.toarray()).to(torch.float32)
X_test = torch.from_numpy(X_test.toarray()).to(torch.float32)
y_train = torch.from_numpy(y_train.to_numpy()).to(torch.float32)
y_test = torch.from_numpy(y_test.to_numpy()).to(torch.float32)

#build model
class LSTM(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        num_classes: Output width of the final linear layer.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(LSTM, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Compute output scores for a batch.

        Args:
            x: Tensor of shape (batch, seq_len, input_size). A 2-D tensor of
                shape (batch, input_size) is also accepted and is treated as
                a batch of length-1 sequences.

        Returns:
            Tensor of shape (batch, num_classes).
        """
        # Flat feature vectors (e.g. one vector per sample) have no time axis;
        # add one so the batch-first LSTM and the last-step indexing below work.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        # Zero initial states are created on the input's own device and dtype
        # instead of relying on a module-level `device` variable.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        # Only the output at the final time step feeds the linear layer.
        out = self.fc(out[:, -1, :])
        return out

#hyperparameters
# The feature width is read from the vectorized training matrix so it always
# matches the fitted vocabulary instead of a hard-coded size.
input_size = X_train.shape[1]
hidden_size = 128
num_layers = 2
num_classes = 1  # a single logit for the binary ham/spam label
learning_rate = 0.001
num_epochs = 10
batch_size = 64

#device
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#load data
# Pair features with labels and batch them. Only the training loader is
# shuffled; the evaluation loader keeps a fixed order so that evaluation
# batches are reproducible from run to run.
train = torch.utils.data.TensorDataset(X_train, y_train)
test = torch.utils.data.TensorDataset(X_test, y_test)
train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

#initialize model
model = LSTM(input_size, hidden_size, num_layers, num_classes).to(device)

#loss and optimizer
# BCEWithLogitsLoss takes raw scores, so the model applies no sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#train model
total_step = len(train_loader)
for epoch in range(num_epochs):

    for i, (data, targets) in enumerate(train_loader):
        # Move the batch to the training device; targets are reshaped into a
        # column so they line up with the model's (batch, 1) scores.
        data = data.to(device)
        targets = targets.to(device).view(-1, 1)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        #forward
        scores = model(data)
        loss = criterion(scores, targets)

        #backward
        loss.backward()

        #gradient descent or adam step
        optimizer.step()

#calculate perplexity
def perplexity(model, data_loader):
    """Return the exponential of the average loss over a data loader.

    The model is switched to eval mode and run without gradient tracking.
    Each batch loss from the module-level `criterion` is weighted by the
    batch size, so the average is taken per sample rather than per batch.

    Args:
        model: The network to evaluate.
        data_loader: Iterable yielding (data, targets) batches.

    Returns:
        np.exp of the sample-weighted mean loss.
    """
    model.eval()
    loss_sum = 0
    n_seen = 0

    with torch.no_grad():
        for batch, labels in data_loader:
            batch = batch.to(device)
            # Column-shaped targets to match the model's (batch, 1) scores.
            labels = labels.to(device).view(-1, 1)

            batch_loss = criterion(model(batch), labels)

            n_batch = batch.size(0)
            loss_sum += batch_loss.item() * n_batch
            n_seen += n_batch

    return np.exp(loss_sum / n_seen)

#calculate accuracy
def accuracy(model, data_loader):
    """Return the fraction of samples whose thresholded score matches its target.

    The model is switched to eval mode and run without gradient tracking.
    A score greater than 0 counts as a positive prediction and is compared
    element-wise with the targets.

    Args:
        model: The network to evaluate.
        data_loader: Iterable yielding (data, targets) batches.

    Returns:
        The number of matching predictions divided by the number of targets.
        Because the match count is accumulated from tensor sums, the result
        is a tensor whenever the loader yields at least one batch.
    """
    model.eval()
    n_correct = 0
    n_seen = 0

    with torch.no_grad():
        for batch, labels in data_loader:
            batch = batch.to(device)
            # Column-shaped targets to match the model's (batch, 1) scores.
            labels = labels.to(device).view(-1, 1)

            logits = model(batch)
            hits = (logits > 0) == labels
            n_correct += hits.sum()
            n_seen += labels.size(0)

    return n_correct / n_seen
    