In [25]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Load CSV data
csv_file = '/content/Cleaned_Cnn.csv'  # Update with the correct file path
df = pd.read_csv(csv_file)

# Verify the column names
print(df.columns)

class TextDataset(Dataset):
    def __init__(self, dataframe, vocab, tokenizer, transform=None):
        self.titles = dataframe['title'].tolist()  # Update 'title' if needed
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.transform = transform

    def __len__(self):
        return len(self.titles)

    def __getitem__(self, idx):
        title = self.titles[idx]

        if self.transform:
            title = self.transform(title)

        return title

tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Create a vocabulary
vocab = build_vocab_from_iterator(yield_tokens(df['title']), specials=["<unk>"])  # Update 'title' if needed
vocab.set_default_index(vocab["<unk>"])

# Load dataset
dataset = TextDataset(dataframe=df, vocab=vocab, tokenizer=tokenizer)

# Define transformations (no transform function required now, handled in the dataset class)


Index(['title', 'views_on_vd', 'time_in_minutes'], dtype='object')


In [26]:
import torch.nn.functional as F

class TextCNN(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_size), padding=(2, 0))
        self.conv2 = nn.Conv2d(1, 100, (4, embed_size), padding=(3, 0))
        self.conv3 = nn.Conv2d(1, 100, (5, embed_size), padding=(4, 0))
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(300, 1)  # For binary classification or regression

    def forward(self, x):
        x = self.embedding(x)
        x = x.unsqueeze(1)  # Add channel dimension
        x1 = F.relu(self.conv1(x)).squeeze(3)
        x2 = F.relu(self.conv2(x)).squeeze(3)
        x3 = F.relu(self.conv3(x)).squeeze(3)
        x1 = F.max_pool1d(x1, x1.size(2)).squeeze(2)
        x2 = F.max_pool1d(x2, x2.size(2)).squeeze(2)
        x3 = F.max_pool1d(x3, x3.size(2)).squeeze(2)
        x = torch.cat((x1, x2, x3), 1)
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Get vocabulary size
vocab_size = len(vocab)

# Define the model
model = TextCNN(vocab_size=vocab_size, embed_size=128)


In [27]:
# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=lambda x: torch.nn.utils.rnn.pad_sequence([torch.tensor(vocab(tokenizer(text)), dtype=torch.long) for text in x], batch_first=True))

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(dataloader, 0):
        inputs = data
        labels = torch.zeros(inputs.size(0), 1)  # Dummy labels, for demonstration

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
        if i % 10 == 9:  # Print every 10 mini-batches
            print(f"[{epoch + 1}, {i + 1}] loss: {running_loss / 10:.3f}")
            running_loss = 0.0

print('Finished Training')


[1, 10] loss: 0.436
[2, 10] loss: 0.500
[3, 10] loss: 0.355
[4, 10] loss: 0.273
[5, 10] loss: 0.199
[6, 10] loss: 0.166
[7, 10] loss: 0.116
[8, 10] loss: 0.114
[9, 10] loss: 0.072
[10, 10] loss: 0.062
Finished Training


In [28]:
# Evaluate the model
dataloader = DataLoader(dataset, batch_size=32, shuffle=False, collate_fn=lambda x: torch.nn.utils.rnn.pad_sequence([torch.tensor(vocab(tokenizer(text)), dtype=torch.long) for text in x], batch_first=True))

with torch.no_grad():
    for data in dataloader:
        inputs = data
        outputs = model(inputs)
        print(outputs)


tensor([[ 0.1183],
        [ 0.2525],
        [-0.1694],
        [ 0.0646],
        [ 0.0949],
        [ 0.2613],
        [ 0.1182],
        [-0.0815],
        [-0.4343],
        [-0.1104],
        [ 0.3966],
        [-0.1901],
        [ 0.0364],
        [-0.3817],
        [ 0.2146],
        [ 0.2757],
        [-0.0337],
        [-0.1370],
        [ 0.0744],
        [ 0.1884],
        [ 0.0013],
        [ 0.0009],
        [-0.0039],
        [-0.1689],
        [ 0.1530],
        [ 0.0200],
        [-0.1087],
        [ 0.0641],
        [ 0.0759],
        [-0.1160],
        [ 0.2003],
        [-0.1941]])
tensor([[-0.2077],
        [-0.1812],
        [ 0.4817],
        [-0.5730],
        [-0.2587],
        [ 0.1109],
        [-0.0080],
        [ 0.0435],
        [ 0.2156],
        [ 0.1000],
        [ 0.2923],
        [ 0.0740],
        [ 0.0332],
        [-0.0257],
        [ 0.1118],
        [-0.0939],
        [-0.0637],
        [ 0.1817],
        [ 0.1264],
        [ 0.0158],
        [-0