In [4]:
import os
import numpy as np
import pandas as pd
import torch
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

dataset_path = '../resources/dataset.csv'

data = pd.read_csv(dataset_path)

data.head()

Unnamed: 0,text,category
0,نظم عهد شرق لفن عرض فنا تحت عنو بقة الف وذل سع...,Culture
1,تقم فنن ليت كابيلو عرض طلع عام دبي يضم عرض لوح...,Culture
2,وصل يلة سير تحد تعة ءثر نفس يرق لقب شعر ملي نس...,Culture
3,عقد ظهر ءمس ءول قصر ثقف شرق جلس ءخر جلس لقى ءو...,Culture
4,خار صحف يمز جورج ءورويل يحل رتب قءم تضم ءعظم خ...,Culture


In [5]:
# Separate texts and labels
texts = data['text'].values
labels = data['category'].values

# Split data into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Tokenize texts for Word2Vec training
train_tokens = [text.split() for text in train_texts]

# Train Word2Vec model on training tokens
w2v_params = {"vector_size": 300, "window": 5, "min_count": 5, "workers": 7}
w2v_model = Word2Vec(sentences=train_tokens, **w2v_params)

# Create a tokenizer by mapping words to integer IDs using Word2Vec vocabulary
tokenizer = {word: idx + 1 for idx, word in enumerate(w2v_model.wv.index_to_key)}

# Set maximum sequence length based on the longest training sequence
max_sequence_length = max(len(tokens) for tokens in train_tokens)

# Convert texts to sequences of integers and apply padding
train_sequences = [[tokenizer.get(word, 0) for word in text.split()] for text in train_texts]
test_sequences = [[tokenizer.get(word, 0) for word in text.split()] for text in test_texts]

# Pad sequences to ensure consistent input length and convert to PyTorch tensors
train_data = torch.tensor([seq + [0] * (max_sequence_length - len(seq)) for seq in train_sequences], dtype=torch.long)
test_data = torch.tensor([seq + [0] * (max_sequence_length - len(seq)) for seq in test_sequences], dtype=torch.long)

# Encode labels as integers
label_encoder = LabelEncoder()
train_labels = torch.tensor(label_encoder.fit_transform(train_labels), dtype=torch.long)
test_labels = torch.tensor(label_encoder.transform(test_labels), dtype=torch.long)

# Define the number of classes
num_classes = len(label_encoder.classes_)


In [6]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create Dataset and DataLoader
train_dataset = TextDataset(train_data, train_labels)
test_dataset = TextDataset(test_data, test_labels)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

  self.texts = torch.tensor(texts, dtype=torch.long)
  self.labels = torch.tensor(labels, dtype=torch.long)


In [8]:
# Create embedding matrix
embedding_dim = w2v_params["vector_size"]
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Convert embedding matrix to PyTorch tensor
embedding_matrix = torch.tensor(embedding_matrix, dtype=torch.float32)

AttributeError: 'dict' object has no attribute 'word_index'

In [None]:
model_save_path = 'Backend/Models/CNN'

# Create the directory if it doesn't exist
os.makedirs(model_save_path, exist_ok=True)


# Define the file path for saving the Word2Vec model
w2v_model_file_path = os.path.join(model_save_path, 'word2vec.model')

# Save the Word2Vec model
w2v_model.save(w2v_model_file_path)

print(f"Word2Vec model saved to {w2v_model_file_path}")

In [None]:
class TextCNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, max_len, num_classes, embedding_matrix=None):
        super(TextCNN, self).__init__()

        # Embedding layer
        if embedding_matrix is not None:
            self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Convolutional layer and pooling
        self.conv = nn.Conv1d(embedding_dim, 128, kernel_size=3)
        self.pool = nn.AdaptiveMaxPool1d(1)

        # Fully connected layers
        self.fc1 = nn.Linear(128, 64)
        self.fc2 = nn.Linear(64, num_classes)

        # Dropout layer
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x).permute(0, 2, 1)  # Reshape for Conv1d
        x = self.conv(x)
        x = self.pool(x).squeeze(2)
        x = self.dropout(torch.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

# Initialize model
model = TextCNN(vocab_size=len(tokenizer.word_index) + 1,
                embedding_dim=embedding_dim,
                max_len=max_sequence_length,
                num_classes=7,
                embedding_matrix=embedding_matrix)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for texts, labels in train_loader:
        texts = texts.to(device, dtype=torch.long)
        labels = labels.to(device, dtype=torch.long)  

        # Forward pass
        outputs = model(texts)  
        # Compute loss
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(train_loader):.4f}")


In [None]:
model.eval()  
all_labels = []
all_predictions = []
with torch.no_grad():
    for texts, labels in test_loader:
        texts = texts.to(device)
        labels = labels.to(device)

        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        all_labels.extend(labels.cpu().numpy())
        all_predictions.extend(predicted.cpu().numpy())

# Calculate metrics
test_accuracy = accuracy_score(all_labels, all_predictions)
test_precision = precision_score(all_labels, all_predictions, average="macro")
test_recall = recall_score(all_labels, all_predictions, average="macro")
test_f1 = f1_score(all_labels, all_predictions, average="macro")

print(f"Test Accuracy: {test_accuracy}")
print(f"Test Precision: {test_precision}")
print(f"Test Recall: {test_recall}")
print(f"Test F1 Score: {test_f1}")


In [None]:
# Define path
model_save_path = '/Backend/Models/CNN'

# Create the directory if it doesn't exist
os.makedirs(model_save_path, exist_ok=True)

# Define file path for model
model_file_path = os.path.join(model_save_path, 'cnn_model.pth')

# Save model
torch.save(model, model_file_path)

print(f"Model saved to {model_file_path}")
