In [19]:
import pickle
import re, string

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [20]:
# 1. Load Data
# The path is relative to the notebook's working directory; point it at the
# real ticket export before running. Later cells read the 'Document' and
# 'Topic_group' columns of this frame.
tickets_csv = "it_service_tickets.csv"   # change to actual file
df = pd.read_csv(tickets_csv)


In [21]:
# Preview the first ten tickets to sanity-check the columns that were loaded.
df.head(n=10)

Unnamed: 0,Document,Topic_group
0,connection with icon icon dear please setup ic...,Hardware
1,work experience user work experience user hi w...,Access
2,requesting for meeting requesting meeting hi p...,Hardware
3,reset passwords for external accounts re expir...,Access
4,mail verification warning hi has got attached ...,Miscellaneous
5,mail please dear looks blacklisted receiving m...,Miscellaneous
6,prod servers tunneling prod tunneling va la tu...,Hardware
7,access request dear modules report report cost...,HR Support
8,reset passwords for our client and passwords c...,Access
9,direct reports missing time please action repo...,HR Support


In [22]:
# Encode labels (category mapping)
# ---------------------------
# Casting to 'category' gives the classes a fixed order; the one-hot columns
# built below follow labels.cat.categories, so column i <-> category i.
labels = df['Topic_group'].astype('category')
# Ask for float32 explicitly: pandas >= 2.0 returns bool dummy columns by
# default, and the softmax / categorical_crossentropy head should receive
# numeric targets regardless of the installed pandas version.
y = pd.get_dummies(labels, dtype="float32").values  # one-hot encoding

In [23]:
# Save the category mapping for later use in Streamlit
# Entry i of this list is the class name for column i of the one-hot matrix
# `y` (and therefore for output unit i of the classifier).
# NOTE: this cell needs `pickle` from the imports cell; without that import it
# fails with NameError on a fresh kernel.
category_mapping = list(labels.cat.categories)
with open("category_mapping.pkl", "wb") as mapping_file:
    pickle.dump(category_mapping, mapping_file)

In [24]:
# Clean text
# ---------------------------
# Fill missing documents with "" before casting: astype(str) on its own turns
# NaN into the literal word "nan", which the tokenizer would then learn as a
# real token.
texts = df['Document'].fillna("").astype(str)
def clean_text(t):
    """Return ``t`` lower-cased with every ASCII punctuation character removed.

    Only the characters in ``string.punctuation`` are deleted; whitespace,
    digits and letters are kept as they are after lower-casing.
    """
    punctuation_table = str.maketrans("", "", string.punctuation)
    return t.lower().translate(punctuation_table)
# Normalise every document element-wise; clean_text is idempotent, so
# re-running this cell leaves `texts` unchanged.
texts = texts.map(clean_text)

In [25]:
# Tokenize and pad
# num_words caps the vocabulary used when converting text to ids: only the
# most frequent words (indices below 20000) are kept, and no OOV token is
# configured, so rarer or unseen words are simply dropped.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
token_sequences = tokenizer.texts_to_sequences(texts)
# Every ticket becomes exactly 50 ids. The Keras defaults are spelled out:
# short tickets are zero-padded on the left, and long tickets keep only their
# LAST 50 tokens.
X = pad_sequences(token_sequences, maxlen=50, padding="pre", truncating="pre")


In [26]:
# Train-test split
# Hold out 20% of the rows for validation; the fixed random_state makes the
# shuffle repeatable between runs.
# NOTE(review): the split is not stratified — consider stratify=labels if the
# ticket categories turn out to be imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
# Build model (training happens in the next cell)
# Embedding -> LSTM -> Dense head with one softmax unit per ticket category.
model = Sequential([
    # Vocabulary size comes from the tokenizer so the embedding table and the
    # tokenizer's num_words cannot drift apart.
    Embedding(input_dim=tokenizer.num_words, output_dim=128),
    LSTM(64, dropout=0.3, recurrent_dropout=0.3),
    Dense(64, activation='relu'),
    Dropout(0.4),
    # y is one-hot encoded, so its column count is the number of classes.
    Dense(y.shape[1], activation='softmax'),
])

In [28]:
# Configure training: the one-hot targets pair with categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Kept as the cell's final expression so the returned History object is
# displayed, exactly as before.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=5,
    batch_size=64,
)

Epoch 1/5
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 48ms/step - accuracy: 0.4760 - loss: 1.4408 - val_accuracy: 0.8009 - val_loss: 0.6120
Epoch 2/5
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 46ms/step - accuracy: 0.8082 - loss: 0.6267 - val_accuracy: 0.8239 - val_loss: 0.5510
Epoch 3/5
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 47ms/step - accuracy: 0.8507 - loss: 0.4954 - val_accuracy: 0.8337 - val_loss: 0.5190
Epoch 4/5
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 47ms/step - accuracy: 0.8673 - loss: 0.4270 - val_accuracy: 0.8370 - val_loss: 0.5174
Epoch 5/5
[1m598/598[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 47ms/step - accuracy: 0.8878 - loss: 0.3707 - val_accuracy: 0.8377 - val_loss: 0.5101


<keras.src.callbacks.history.History at 0x27c278eb6b0>

In [29]:
# Save model and tokenizer
# ---------------------------
# NOTE: this cell needs `pickle` from the imports cell; without that import it
# fails with NameError on a fresh kernel.
# NOTE(review): ".h5" is Keras' legacy HDF5 format. The file name is kept
# because downstream code presumably loads the model by this name — confirm
# before switching to the native ".keras" format.
model.save("ticket_classifier.h5")
# The fitted tokenizer is stored as well: inference has to reproduce the same
# word -> id mapping that the model was trained on.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

