In [4]:
# Install Required Libraries (if necessary)
!pip install tensorflow

# Import Libraries
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Mount Google Drive so the project CSV files are reachable from the Colab runtime
from google.colab import drive
drive.mount('/content/drive')

# Read the three input tables in one pass: the product rows, the category
# definitions and the sub-category definitions (in that order).
processed_data, categories_data, sub_categories_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Verizon 1 2024-2025/Model Process/FE/processed_data.csv',
        '/content/drive/MyDrive/Verizon 1 2024-2025/categories.csv',
        '/content/drive/MyDrive/Verizon 1 2024-2025/sub-categories.csv',
    )
)

# Assign each product the category / subcategory whose definition text is most
# similar to the product description (TF-IDF vectors + cosine similarity).
product_descriptions = processed_data['Cleaned Description'].fillna('')
category_definitions = categories_data['definition'].fillna('')
subcategory_definitions = sub_categories_data['definition'].fillna('')

# Fit ONE vectorizer on all three corpora so they share a single vocabulary and
# IDF weighting; the stacked rows are split back apart by position below.
all_text = pd.concat([product_descriptions, category_definitions, subcategory_definitions], axis=0)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_text)

# Row layout of tfidf_matrix: [products | categories | subcategories]
n_products = len(product_descriptions)
n_categories = len(category_definitions)
product_matrix = tfidf_matrix[:n_products]
category_matrix = tfidf_matrix[n_products:n_products + n_categories]
subcategory_matrix = tfidf_matrix[n_products + n_categories:]

# Position of the most similar definition for every product.
# NOTE(review): a product whose similarity is 0 against every definition
# (e.g. an empty description) still receives index 0 from argmax, i.e. the
# first category/subcategory row — confirm whether that fallback is intended.
best_category_indices = cosine_similarity(product_matrix, category_matrix).argmax(axis=1)
best_subcategory_indices = cosine_similarity(product_matrix, subcategory_matrix).argmax(axis=1)

# Map the winning positions to names with a single positional take on the
# 'name' column, instead of materialising a full row Series per product
# through `.iloc[i]['name']`.
processed_data['category'] = categories_data['name'].to_numpy()[best_category_indices]
processed_data['subcategory'] = sub_categories_data['name'].to_numpy()[best_subcategory_indices]

# Text tokenization and padding
max_vocab_size = 10000       # passed as the Tokenizer's `num_words` cap
max_sequence_length = 100    # every sequence is padded/truncated to this length

# A missing description is read by pandas as NaN (a float), which the
# Tokenizer cannot lower-case or split. Normalise the column to plain strings
# once and use the same cleaned text for both fitting and encoding.
description_texts = processed_data['Cleaned Description'].fillna('').astype(str)

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(description_texts)

# Convert text to integer sequences and pad them to a fixed width
X = tokenizer.texts_to_sequences(description_texts)
X = pad_sequences(X, maxlen=max_sequence_length)

# Encode the assigned category names as integer class ids. The fitted encoder
# is kept so ids can be mapped back to names through `label_encoder.classes_`.
label_encoder = LabelEncoder()
category_names = processed_data['category']
y = label_encoder.fit_transform(category_names)

# Hold out 20% of the samples for testing, with a fixed seed for a repeatable split.
# NOTE(review): the split is not stratified, so a rare category may be absent
# from the test portion — confirm whether that is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable between runs
# (individual GPU ops may still be nondeterministic).
tf.keras.utils.set_random_seed(42)

# Build the Deep Learning Model:
# token ids -> embeddings -> BiLSTM over the sequence -> mean over time steps
# -> dense classifier with one softmax output per category.
model = Sequential([
    # `input_length` is intentionally omitted: it is deprecated in Keras 3
    # (ignored with a warning) and is not needed here, because the pooling
    # layer collapses the time axis whatever the sequence length is.
    Embedding(max_vocab_size, 128),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# Compile the model; the sparse loss matches the integer class ids in `y`
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit the network for 10 epochs in mini-batches of 32; the returned History
# object keeps the per-epoch loss/accuracy curves.
# NOTE(review): the held-out test split doubles as the validation set here, so
# the val_* numbers are not an independent check if they are used for tuning.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_test, y_test),
)

# Evaluate the Model on the held-out split
y_pred = model.predict(X_test)             # per-class probabilities from the softmax layer
y_pred_classes = y_pred.argmax(axis=1)     # most probable class id per sample

accuracy = accuracy_score(y_test, y_pred_classes)

# `labels` lists every encoded class so the report rows line up with
# `target_names` even when a class is missing from y_test or the predictions.
# zero_division=0: an undefined metric (e.g. precision for a class that is
# never predicted) is reported as 0 rather than 1, which would otherwise
# inflate the per-class and averaged scores.
classification_rep = classification_report(
    y_test,
    y_pred_classes,
    labels=range(len(label_encoder.classes_)),
    target_names=label_encoder.classes_,
    zero_division=0,
)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/10




[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 169ms/step - accuracy: 0.0871 - loss: 2.5390 - val_accuracy: 0.1271 - val_loss: 2.5319
Epoch 2/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 134ms/step - accuracy: 0.1391 - loss: 2.4874 - val_accuracy: 0.1271 - val_loss: 2.5070
Epoch 3/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 198ms/step - accuracy: 0.1664 - loss: 2.4493 - val_accuracy: 0.1472 - val_loss: 2.4839
Epoch 4/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 132ms/step - accuracy: 0.1551 - loss: 2.4734 - val_accuracy: 0.1973 - val_loss: 2.4177
Epoch 5/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 157ms/step - accuracy: 0.2229 - loss: 2.3502 - val_accuracy: 0.1906 - val_loss: 2.4045
Epoch 6/10
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 134ms/step - accuracy: 0.2349 - loss: 2.2479 - val_accuracy: 0.2107 - val_loss: 2.3684
Epoch 7/10
[1m38/38[0m [32m━━━━━━━━

In [None]:
# Show the column labels of processed_data; once the modelling cell has run,
# these include the derived 'category' and 'subcategory' columns.
print(processed_data.columns)
