In [1]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
def preprocessed_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove special characters and digits
    text = re.sub(r"[^a-zA-Z]", " ", text)
    
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Join tokens back into a single string
    preprocessed_text = " ".join(tokens)
    
    return preprocessed_text

In [3]:
# Read data 
data = pd.read_csv("Survey Data_shipping.csv", encoding='latin-1')

In [4]:
data.columns

Index(['ï»¿OCF Journey',
       'Response Ticker - Why you gave this number in the survey',
       'Why - issue Category', 'Why - Keywords/Words related to issue',
       'Why - # of Words', 'Response Ticker - What we could do better?',
       'What - Issue', 'What - Words related to issue', 'What - # of Words',
       'Application area', 'Comment_Metadata1', 'Comment_Metadata2',
       'Q1 - How likely are you to recommend UPS to a friend or colleague?',
       'Survey Date (+00:00 GMT)', 'Unnamed: 14', 'Unnamed: 15'],
      dtype='object')

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Preprocess the text column
data["preprocessed_text"] = data["Response Ticker - Why you gave this number in the survey"].apply(preprocessed_text)

In [7]:
data['Why - issue Category'].value_counts()

Operational                   2147
Layout/Navigation/Usablity    1973
Others                         595
Pricing/Cost                   525
Support/Sales                  430
Technical/Performance          224
others                          52
Operational                      5
Name: Why - issue Category, dtype: int64

In [8]:
data['Why - issue Category'] = data['Why - issue Category'].replace({'Operational ':'Operational','others': 'Others'})

In [9]:
data['Why - issue Category'].value_counts()

Operational                   2152
Layout/Navigation/Usablity    1973
Others                         647
Pricing/Cost                   525
Support/Sales                  430
Technical/Performance          224
Name: Why - issue Category, dtype: int64

In [49]:
# Split the data into features (X) and labels (y)
X = data["preprocessed_text"]
y = data["Why - issue Category"]  

In [44]:
import numpy as np

In [48]:
len(np.unique(y))

6

In [50]:

# Perform label encoding on the class labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Vectorize the text using Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad sequences to have the same length
max_length = max([len(x) for x in X_train])
X_train = pad_sequences(X_train, maxlen=max_length)
X_test = pad_sequences(X_test, maxlen=max_length)

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(np.unique(y)), activation='softmax'))

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=16)


# Predict classes using the trained model
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Calculate the accuracy of predicted classes
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the prediction accuracy for each class
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("Classification Report:")
print(report)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.7422334172963896
Classification Report:
                            precision    recall  f1-score   support

Layout/Navigation/Usablity       0.77      0.86      0.81       395
               Operational       0.79      0.78      0.78       431
                    Others       0.55      0.36      0.43       129
              Pricing/Cost       0.77      0.82      0.79       105
             Support/Sales       0.67      0.65      0.66        86
     Technical/Performance       0.50      0.47      0.48        45

                  accuracy                           0.74      1191
                 macro avg       0.67      0.66      0.66      1191
              weighted avg       0.73      0.74      0.73      1191



# Testing model accuracy

In [105]:
test_data = pd.read_csv('CXShieldDigitalSurveys.csv')

In [106]:
test_data.head()

Unnamed: 0,Start Date (+00:00 GMT),Survey Date (+00:00 GMT),Response Type,IP Address,Progress,Duration (in seconds),Finished,Recorded Date (+00:00 GMT),Recipient Last Name,Recipient First Name,...,External Reference,Location Latitude,Location Longitude,Distribution Channel,User Language,NPS Group,Q1 - How likely are you to recommend UPS to a friend or colleague?,Q2 - To what extent did your most recent [journey experience] experience influen...,Q3 - Why?,Why - Issue
0,4/19/2023 13:17,4/19/2023 13:18,IP Address,208.127.65.83,100,60,True,4/19/2023 13:18,,,...,,38.7911,-77.5264,email,EN,Detractor,6,Increased a little,The criteria needed.,
1,4/19/2023 12:39,4/19/2023 12:40,IP Address,50.195.184.141,100,45,True,4/19/2023 12:40,,,...,,40.2116,-85.4273,email,EN,Detractor,0,Decreased a lot,"UPS never followed up, never contacted us in a...",
2,4/19/2023 12:33,4/19/2023 12:34,IP Address,166.70.168.63,100,60,True,4/19/2023 12:34,,,...,,40.0072,-111.6031,email,EN,Detractor,0,Decreased a lot,Horrible experience. Still not resolved at all...,
3,4/19/2023 12:03,4/19/2023 12:07,IP Address,173.187.41.221,100,234,True,4/19/2023 12:07,,,...,,31.1823,-83.7641,email,EN,Detractor,5,Decreased a lot,Network Server crashed and UPS will not totall...,
4,2/23/2021 14:20,2/23/2021 14:22,IP Address,71.206.130.189,100,103,True,2/23/2021 14:22,,,...,,37.463806,-77.398003,email,EN,Promoter,10,Increased a lot,"Easy process, and the UPS associate was very h...",


In [107]:
test_series = test_data[['Q3 - Why?']]

In [108]:
test_series= test_series.dropna()

In [109]:
test_series['preprocessed_text']= test_series['Q3 - Why?'].apply(preprocessed_text )

In [97]:
test_series.head()

Unnamed: 0,Q3 - Why?,preprocessed_text
0,The criteria needed.,criteria need
1,"UPS never followed up, never contacted us in a...",up never follow never contact us way never anyth
2,Horrible experience. Still not resolved at all...,horribl experi still resolv up lost item still...
3,Network Server crashed and UPS will not totall...,network server crash up total uninstal let us ...
4,"Easy process, and the UPS associate was very h...",easi process up associ help


In [98]:
test_s = test_series['preprocessed_text']

In [101]:
X_test.shape, X_train.shape, test_s.shape

((1191, 200), (4760, 200), (27905,))

In [112]:
#preprocesing
test_s = tokenizer.texts_to_sequences(test_s)

# Pad sequences to have the same length
max_length = max([len(x) for x in X_train])
test_s = pad_sequences(test_s, maxlen=max_length)

In [113]:
test_s.shape, X_train.shape

((27905, 200), (4760, 200))

In [115]:
y_pred_test = model.predict(test_s)



In [118]:
y_pred_deploy = y_pred_test.argmax(axis=1)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [122]:
test_series['predicted_class'] = y_pred_deploy

In [130]:
y_pred_deploy_labeled = label_encoder.inverse_transform(y_pred_deploy)

In [131]:
test_series['y_pred_labelled'] = y_pred_deploy_labeled

In [132]:
test_series

Unnamed: 0,Q3 - Why?,preprocessed_text,predicted_class,y_pred_labelled
0,The criteria needed.,criteria need,0,Layout/Navigation/Usablity
1,"UPS never followed up, never contacted us in a...",up never follow never contact us way never anyth,0,Layout/Navigation/Usablity
2,Horrible experience. Still not resolved at all...,horribl experi still resolv up lost item still...,0,Layout/Navigation/Usablity
3,Network Server crashed and UPS will not totall...,network server crash up total uninstal let us ...,0,Layout/Navigation/Usablity
4,"Easy process, and the UPS associate was very h...",easi process up associ help,0,Layout/Navigation/Usablity
...,...,...,...,...
34585,reliable service with the nicest UPS driver in...,reliabl servic nicest up driver world,0,Layout/Navigation/Usablity
34586,"When I have my regular UPS man Tom, everything...",regular up man tom everyth fine fantast job va...,0,Layout/Navigation/Usablity
34587,Uploading/merging a mail list into WorldShip i...,upload merg mail list worldship easi watch vid...,0,Layout/Navigation/Usablity
34588,Worldship is far too complex to use. I have b...,worldship far complex use use websit year cont...,0,Layout/Navigation/Usablity
