In [1]:
import pandas as pd
import re

In [2]:
data = pd.read_excel('data.xlsx')                       # reading data (write file_name here)

In [3]:
data = data[['Message', 'CaseNumber', 'PostType']]      # getting only required columns

In [4]:
data

Unnamed: 0,Message,CaseNumber,PostType
0,Hello! I am trying to register a product on yo...,4829287,fan
1,,4829287,fan
2,This is the error we getting - see screenshot,4829287,fan
3,"Hi Carmy, thank you for your messaging us. Sor...",4829287,brand
4,karmyaljosa@gmail.com,4829287,fan
...,...,...,...
385801,@nube_nimbus,4358328,fan
385802,@fajardoserranoana,4358328,fan
385803,@albafe90,4358328,fan
385804,@nube_nimbus,4358328,fan


In [5]:
# converting some message from float type to string
def float_to_string(msg):
    msg = str(msg)
    return msg
data['Message'] = data['Message'].apply(float_to_string)

In [6]:
# cleaning_data
def remove_noise(msg):
    msg = re.sub('(www[^\s]*) | (https?[^\s]*)', '', msg) # removing links
    msg = re.sub('([^\s]*@[^\s]+)', '', msg)              # removing usernames, emails
    msg = re.sub(r'(\S*)[\d]+[^\s]*', '', msg)            # removing numbers
    msg = re.sub(r'\s+', ' ', msg)                        # removing extra spaces
    msg = re.sub('[\n]', ' ', msg)                        # removing new lines
    msg = re.sub(r'[^\w\s]', '', msg)                     # removing punctuations
    msg = msg.encode('ascii', 'ignore').decode('ascii')   # removing emojis
    return msg
data['Message'] = data['Message'].apply(remove_noise)

In [7]:
# removing extra spaces
def clean_data(msg):
    if msg == None or msg.strip() == '' or msg.strip() == 'nan':
        return None
    msg = msg.replace('nan', '')
    msg = [i.strip().lower() for i in msg.split(' ')]
    msg = list(filter(('').__ne__, msg))                  # removing all occurence of '' in msg
    msg = ' '.join(msg)
    msg = msg.strip()
    if msg == '':
        msg = None
    return msg
data['Message'] = data['Message'].apply(clean_data)

In [8]:
# removing NA and resetting index
data.dropna(inplace = True)
data.reset_index(inplace = True, drop=True)

### Adding Context

In [9]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()

posts_text = [post.text for post in posts]

train_text = posts_text[:int(len(posts_text)*0.8)]
test_text = posts_text[int(len(posts_text)*0.2):]

vectorizer = TfidfVectorizer(ngram_range=(1,3), min_df=0.001, max_df=0.7, analyzer='word')

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

y = [post.get('class') for post in posts]

y_train = y[:int(len(posts_text)*0.8)]
y_test = y[int(len(posts_text)*0.2):]

gb = GradientBoostingClassifier(n_estimators = 400, random_state=0)
gb.fit(X_train, y_train)
predictions_rf = gb.predict(X_test)
print(classification_report(y_test, predictions_rf))

[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!


              precision    recall  f1-score   support

      Accept       0.81      0.72      0.76       167
         Bye       0.90      0.77      0.83       155
     Clarify       0.62      0.33      0.43        24
   Continuer       0.68      0.43      0.52       115
     Emotion       0.94      0.66      0.77       868
    Emphasis       0.82      0.48      0.61       132
       Greet       0.96      0.91      0.94      1044
       Other       0.00      0.00      0.00        32
      Reject       0.88      0.69      0.77       122
   Statement       0.73      0.94      0.82      2505
      System       0.99      0.98      0.99      2279
     nAnswer       0.69      0.76      0.72        58
  whQuestion       0.90      0.87      0.88       432
     yAnswer       0.79      0.64      0.71        89
  ynQuestion       0.92      0.61      0.73       432

    accuracy                           0.86      8454
   macro avg       0.77      0.65      0.70      8454
weighted avg       0.87   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
set(y)

{'Accept',
 'Bye',
 'Clarify',
 'Continuer',
 'Emotion',
 'Emphasis',
 'Greet',
 'Other',
 'Reject',
 'Statement',
 'System',
 'nAnswer',
 'whQuestion',
 'yAnswer',
 'ynQuestion'}

In [11]:
def add_context(row):
    text = row['Message']
    context = 'customer ' if row['PostType'] == 'fan' else 'agent '
    class_ = gb.predict(vectorizer.transform([text]))[0]
    if class_ in ['ynQuestion', 'whQuestion']:
        context += 'asked '
    elif class_ == ['yAnswer', 'nAnswer']:
        context += 'replied '
    else:
        context += 'said '
    return context + text
data['Message'] = data.apply(add_context, axis=1)

### Grouping Messages according to Casenumber

In [12]:
data

Unnamed: 0,Message,CaseNumber,PostType
0,customer said hello i am trying to register a ...,4829287,fan
1,customer said this is the error we getting see...,4829287,fan
2,agent said hi carmy thank you for your messagi...,4829287,brand
3,customer said no i havent let me try now,4829287,fan
4,customer said ah through browser edge i was ab...,4829287,fan
...,...,...,...
236042,agent said oi jssica como podemos auxiliar voc,4293362,brand
236043,customer asked trabalho na philips achei estra...,4293362,fan
236044,agent said ol jssica por favor nos conte melho...,4293362,brand
236045,customer said a mi,4358427,fan


In [13]:
data_final = data.groupby('CaseNumber')['Message'].apply(lambda x : '. '.join(x)) # joining all messages acc. to CaseNumber

In [14]:
data_final = pd.DataFrame({'Message' : data_final.values, 'CaseNumber' : data_final.index})     

In [15]:
data_final

Unnamed: 0,Message,CaseNumber
0,customer said ghada hesham,1486279
1,customer said meme awad,1486349
2,customer said amina helmy hwida ashraf said. c...,1486353
3,customer said elbash mohandesa,1486426
4,customer said haba galal,1486468
...,...,...
45430,customer said awesome,4835546
45431,customer asked what is the pr manager using a ...,4835622
45432,customer asked bobotie what is that i guess th...,4835669
45433,customer asked how can i reduce razor burn. ag...,4835740


In [16]:
# removing unnecessary conversation for validation
for i in data_final.index:
    if len(data_final.loc[i, 'Message']) < 400 or len(data_final.loc[i, 'Message']) > 800:
        data_final.drop(i, inplace = True)

In [17]:
data_final.reset_index(inplace = True, drop=True)

In [18]:
data_final

Unnamed: 0,Message,CaseNumber
0,customer said please confirm if simplygo oxyge...,1541568
1,customer asked is there any update on my repla...,1552227
2,customer said is there any answer because i ha...,1942631
3,customer said hi. customer said i want to ask ...,1944452
4,customer said ok i will pick up my equipment t...,3830529
...,...,...
6287,customer said its a great brush when it works ...,4835241
6288,customer said not only is this the only even r...,4835339
6289,customer said hi i need to know if its possibl...,4835507
6290,customer asked bobotie what is that i guess th...,4835669


In [19]:
data_final.to_csv('cleaned_data.csv', index=None)   # saving cleaned_data for predictions (write file_name here)