In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Concatenate, Input
from tensorflow.keras.models import Model
import tensorflow as tf

# Loading and preprocessing the dataset
def _prepare_dataset(file_path):
    """Read the ticket CSV and build the model inputs plus every fitted preprocessor.

    Unlike ``load_and_preprocess_data`` this also hands back the fitted
    ``StandardScaler`` and the ticket-type encoder, so that new samples can be
    transformed exactly the way the training data was.

    Args:
        file_path: Path to a CSV containing the columns 'Ticket Description',
            'Customer Age', 'Customer Gender', 'Ticket Priority' and
            'Ticket Type'.

    Returns:
        dict with the keys 'text_data' (padded token-id matrix),
        'numerical_features' (standardized age / encoded ticket type),
        'labels' (encoded priority), 'tokenizer', 'max_sequence_length',
        'le_priority', 'le_ticket_type' and 'scaler'.
    """
    df = pd.read_csv(file_path)

    # Handling missing values
    df['Ticket Description'] = df['Ticket Description'].fillna('')
    df['Customer Age'] = df['Customer Age'].fillna(df['Customer Age'].median())
    df['Customer Gender'] = df['Customer Gender'].fillna('Other')

    # Encoding categorical variables
    le_priority = LabelEncoder()
    df['Ticket Priority'] = le_priority.fit_transform(df['Ticket Priority'])

    le_ticket_type = LabelEncoder()
    df['Ticket Type'] = le_ticket_type.fit_transform(df['Ticket Type'])

    # Gender is encoded for completeness but is not part of the model inputs below.
    le_gender = LabelEncoder()
    df['Customer Gender'] = le_gender.fit_transform(df['Customer Gender'])

    # Text preprocessing: map descriptions to fixed-length sequences of token ids.
    texts = df['Ticket Description'].values
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    max_sequence_length = 100
    text_data = pad_sequences(sequences, maxlen=max_sequence_length)

    # Numerical features. The scaler is kept (not discarded) because inference
    # must reuse the mean/std learned here.
    # NOTE(review): tokenizer and scaler are fitted on the full dataset, i.e.
    # before the train/test split performed in main().
    scaler = StandardScaler()
    numerical_features = scaler.fit_transform(
        df[['Customer Age', 'Ticket Type']].values
    )

    # Labels
    labels = df['Ticket Priority'].values

    return {
        'text_data': text_data,
        'numerical_features': numerical_features,
        'labels': labels,
        'tokenizer': tokenizer,
        'max_sequence_length': max_sequence_length,
        'le_priority': le_priority,
        'le_ticket_type': le_ticket_type,
        'scaler': scaler,
    }


def load_and_preprocess_data(file_path):
    """Load the ticket CSV and return the model inputs and text/label preprocessors.

    Args:
        file_path: Path to the customer-support-ticket CSV.

    Returns:
        Tuple ``(text_data, numerical_features, labels, tokenizer,
        max_sequence_length, le_priority)``. The fitted scaler is not part of
        this tuple; use ``_prepare_dataset`` when it is needed.
    """
    dataset = _prepare_dataset(file_path)
    return (
        dataset['text_data'],
        dataset['numerical_features'],
        dataset['labels'],
        dataset['tokenizer'],
        dataset['max_sequence_length'],
        dataset['le_priority'],
    )

# Building the deep learning model
def build_model(vocab_size, max_sequence_length, num_numerical_features, num_classes=4):
    """Build and compile the two-branch (text + numerical) priority classifier.

    Args:
        vocab_size: Input dimension of the embedding layer (number of token ids).
        max_sequence_length: Length of each padded token sequence.
        num_numerical_features: Width of the numerical feature vector.
        num_classes: Number of output classes of the softmax layer.
            Defaults to 4, the value that used to be hard-coded.

    Returns:
        A compiled Keras ``Model`` taking ``[text_input, numerical_input]`` and
        producing class probabilities; it is compiled with Adam and sparse
        categorical cross-entropy, so labels must be integer class ids.
    """
    # Text input branch
    text_input = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(vocab_size, 128)(text_input)
    lstm_layer = LSTM(64)(embedding_layer)

    # Numerical input branch
    numerical_input = Input(shape=(num_numerical_features,))
    dense_numerical = Dense(32, activation='relu')(numerical_input)

    # Combine branches
    concatenated = Concatenate()([lstm_layer, dense_numerical])
    dense1 = Dense(64, activation='relu')(concatenated)
    dense2 = Dense(32, activation='relu')(dense1)
    output = Dense(num_classes, activation='softmax')(dense2)

    model = Model(inputs=[text_input, numerical_input], outputs=output)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Main execution
def main():
    """Train the classifier on the ticket CSV, report test accuracy and predict one sample."""
    # Load and preprocess data
    file_path = 'customer_support_tickets.csv'
    dataset = _prepare_dataset(file_path)
    text_data = dataset['text_data']
    numerical_features = dataset['numerical_features']
    labels = dataset['labels']
    tokenizer = dataset['tokenizer']
    max_sequence_length = dataset['max_sequence_length']
    le_priority = dataset['le_priority']
    scaler = dataset['scaler']

    # Split data
    X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
        text_data, numerical_features, labels, test_size=0.2, random_state=42
    )

    # Build and train model. Sizes are derived from the fitted preprocessors so
    # they cannot drift from what preprocessing actually produced.
    model = build_model(
        vocab_size=tokenizer.num_words,
        max_sequence_length=max_sequence_length,
        num_numerical_features=numerical_features.shape[1],
        num_classes=len(le_priority.classes_),
    )
    model.fit(
        [X_text_train, X_num_train], y_train,
        validation_data=([X_text_test, X_num_test], y_test),
        epochs=10, batch_size=32, verbose=1
    )

    # Evaluate model
    loss, accuracy = model.evaluate([X_text_test, X_num_test], y_test, verbose=0)
    print(f'Test Accuracy: {accuracy:.4f}')

    # Example prediction
    sample_text = ["I'm having an issue with the product. It keeps crashing unexpectedly."]
    sample_sequence = tokenizer.texts_to_sequences(sample_text)
    sample_padded = pad_sequences(sample_sequence, maxlen=max_sequence_length)
    # Reuse the scaler fitted on the dataset. Fitting a new scaler on this
    # single row would standardize it against itself and always give [[0, 0]].
    sample_numerical = scaler.transform([[30, 0]])  # Example: age 30, ticket type 0
    prediction = model.predict([sample_padded, sample_numerical])
    predicted_priority = le_priority.inverse_transform(np.argmax(prediction, axis=1))[0]
    print(f'Predicted Priority for sample: {predicted_priority}')

# Entry point: run the pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Epoch 1/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 7s 30ms/step - accuracy: 0.2465 - loss: 1.3890 - val_accuracy: 0.2538 - val_loss: 1.3880
Epoch 2/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 6s 30ms/step - accuracy: 0.2631 - loss: 1.3842 - val_accuracy: 0.2444 - val_loss: 1.3878
Epoch 3/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 6s 30ms/step - accuracy: 0.2743 - loss: 1.3836 - val_accuracy: 0.2409 - val_loss: 1.3938
Epoch 4/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 6s 30ms/step - accuracy: 0.2934 - loss: 1.3785 - val_accuracy: 0.2556 - val_loss: 1.3942
Epoch 5/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 6s 30ms/step - accuracy: 0.3174 - loss: 1.3656 - val_accuracy: 0.2645 - val_loss: 1.4034
Epoch 6/10
212/212 ━━━━━━━━━━━━━━━━━━━━ 6s 30ms/step - accuracy: 0.3736 - loss: 1.3223 - val_accuracy: 0.2503 - val_loss: 1.4467
Epoch 7/10
[1m212/212