In [1]:
# Read in the data and clean up column names
import gensim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
pd.set_option('display.max_colwidth', 100)
messages = pd.read_csv('supportTicketData.csv')
# messages = messages.drop(labels = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis = 1)
messages.columns = ["text", "label"]
messages.head()

Unnamed: 0,text,label
0,connection issues with assigned address hi facing connection issues number en tried changing cab...,P1
1,cannot access hi cannot access fallowing link get blank cannot proceed can you please help with ...,P2
2,re address shown valid dear colleagues remarked name written wrong could you please be change th...,P1
3,sent tuesday critical alert following alert occurred status active data source type data source ...,P2
4,code spelling mistake hello should discover for code please can you change thanks head,P2


In [2]:
messages = messages[messages['label']!='P3']

In [3]:
# Clean data using the built in cleaner in gensim
messages['text_clean'] = messages['text'].apply(lambda x: gensim.utils.simple_preprocess(x))
messages.head()

Unnamed: 0,text,label,text_clean
0,connection issues with assigned address hi facing connection issues number en tried changing cab...,P1,"[connection, issues, with, assigned, address, hi, facing, connection, issues, number, en, tried,..."
1,cannot access hi cannot access fallowing link get blank cannot proceed can you please help with ...,P2,"[cannot, access, hi, cannot, access, fallowing, link, get, blank, cannot, proceed, can, you, ple..."
2,re address shown valid dear colleagues remarked name written wrong could you please be change th...,P1,"[re, address, shown, valid, dear, colleagues, remarked, name, written, wrong, could, you, please..."
3,sent tuesday critical alert following alert occurred status active data source type data source ...,P2,"[sent, tuesday, critical, alert, following, alert, occurred, status, active, data, source, type,..."
4,code spelling mistake hello should discover for code please can you change thanks head,P2,"[code, spelling, mistake, hello, should, discover, for, code, please, can, you, change, thanks, ..."


In [4]:
np.unique(messages['label'])

array(['P1', 'P2'], dtype=object)

In [5]:
# Encoding the label column
messages['label']=messages['label'].map({'P1':1,'P2':2})
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split (messages['text_clean'], messages['label'] , test_size=0.2)

In [6]:
# Train the word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [7]:
w2v_model.wv.index_to_key

['you',
 'for',
 'please',
 'sent',
 'have',
 'with',
 'can',
 'hi',
 'regards',
 'be',
 'hello',
 'thank',
 'error',
 'we',
 'thanks',
 'your',
 'access',
 're',
 'issue',
 'could',
 'if',
 'manager',
 'but',
 'or',
 'help',
 'when',
 'below',
 'en',
 'any',
 'kind',
 'was',
 'monday',
 'tuesday',
 'there',
 'best',
 'need',
 'issues',
 'wednesday',
 'cannot',
 'friday',
 'one',
 'by',
 'thursday',
 'problem',
 'ticket',
 'log',
 'message',
 'dear',
 'should',
 'approval',
 'request',
 'submit',
 'some',
 'engineer',
 'know',
 'leave',
 'card',
 'working',
 'following',
 'details',
 'th',
 'ext',
 'october',
 'check',
 'get',
 'senior',
 'also',
 'attached',
 'our',
 'days',
 'laptop',
 'report',
 'november',
 'let',
 'date',
 'information',
 'password',
 'sa',
 'able',
 'july',
 'change',
 'update',
 'analyst',
 'open',
 'work',
 'action',
 'trying',
 'week',
 'after',
 'high',
 'more',
 'still',
 'up',
 'server',
 'only',
 'tried',
 'contact',
 'client',
 'task',
 'into',
 'status',

In [9]:
# Find the most similar words to "cabin" based on word vectors from our trained model
w2v_model.wv.most_similar('proceed')

[('how', 0.8494408130645752),
 ('tell', 0.801354169845581),
 ('clarify', 0.7923555970191956),
 ('what', 0.7646125555038452),
 ('happening', 0.7602647542953491),
 ('instruct', 0.7393451929092407),
 ('advise', 0.723324716091156),
 ('situation', 0.7193887829780579),
 ('clue', 0.7146585583686829),
 ('confirm', 0.7126659750938416)]

In [10]:
words = set(w2v_model.wv.index_to_key )
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


# Padding

In [11]:
# Compute sentence vectors by averaging the word vectors for the words contained in the sentence
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [12]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [13]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [14]:
from sklearn.metrics import precision_score, recall_score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Precision: 0.622 / Recall: 0.757 / Accuracy: 0.618
