# Support Email Classification Project part 1


## This notebook Contains the script to clean the raw text data into the readable format, Input is a dataframe with Columns as Request type and the Email body to be cleaned, output is a dataframe with cleaned email body

### NOTE: To use this script the column name should be *'Request Type'* and *'Content'* (else rename them to above mentioned format) and also, drop other columns  

## Input Text Example

##### *Input Text*: &lt;!DOCTYPE html PUBLIC &quot;-//W3C//DTD XHTML 1.0 Transitional//EN&quot; &quot;http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd&quot;&gt;&#13;&#10;&lt;html&gt;&#13;&#10;    &lt;head&gt;&#13;&#10;        &lt;meta http-equiv=&quot;Content-Type&quot; content=&quot;text/html; charset=UTF-8&quot; /&gt;&#13;&#10;        &lt;title&gt;Booking reference number 280209715774&lt;/title&gt;&#13;&#10;&#9;&lt;/head&gt;&#13;&#10;    &lt;body&gt;&#13;&#10;    Hi,&lt;br /&gt;&#13;&#10;&lt;br /&gt;&#13;&#10;I have booked a flight for 4 passengers, booking reference number 280209715774. However the flight date is incorrect (I had selected 08.03.2020, but the date on the booking is now showing for 08.04.2020), I would like to change the date to the 08.03.2020 for all the passengers for the same timings. Please can you assist me, as the website keeps crashing and I am unable to do this online. &lt;br /&gt;&#13;&#10;&lt;br /&gt;&#13;&#10;You can contact me on +91 93183 69980 &lt;br /&gt;&#13;&#10;&lt;br /&gt;&#13;&#10;Kind Regards, &lt;br /&gt;&#13;&#10;Sukhvinder K Dhanjal&#13;&#10;    &lt;/body&gt;&#13;&#10;&lt;/html&gt;

## Cleaned Output Text Example

public transitional quot www org content type quot content text title booking reference number title body book flight passenger book reference number flight date incorrect select date booking show change date passenger timing assist website keep crash unable regard sukhvinder

In [4]:
#DATA CLEANING
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import spacy
from nltk.corpus import stopwords
import nltk 
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['\n'])
#from nltk.stem.snowball import SnowballStemmer
import re
from nltk.tokenize.treebank import TreebankWordDetokenizer
from gensim.utils import simple_preprocess
import gensim
import copy

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Provide your input dataframe below

In [5]:
support_email_data= pd.read_csv(r'/content/drive/MyDrive/final_support_data.csv',encoding= 'latin-1')
#support_email_data= pd.read_csv(r'/content/drive/MyDrive/CustomerService_01Dec2019_23March2020.csv',encoding= 'latin-1')

In [9]:
support_email_data.head()

Unnamed: 0,ï»¿Sales Inquiry Status,Sub Sales Enquiry,Interaction Id,Request_Type,Request Sub Type,Subject,content
0,Resolved,Resolved on Email,24769821,Refunds,Refund not initiated/processed,About refunds,"<div dir=""auto"">Greting of the dayÂ <div dir=""..."
1,Resolved,Resolved on Email,24769820,Refunds,Refund - Direct Airline Cancellation,Refund issues,&lt;div dir=&quot;auto&quot;&gt;&lt;div style=...
2,Dupe complaint,Dupe complaint,24769819,Others,Call Disconnected,Cancellation of ticket,&lt;html&gt;&#13;&#10;&lt;head&gt;&#13;&#10;&l...
3,Resolved,Resolved on Email,24769816,Cancellation,Cancellation Charges,Fwd: Yatra MyBookings,"<div><div dir=""auto"">Hi team,</div></div><div ..."
4,,,24769812,,,Refund confirmation,"<div dir=""auto"">Dear team,<div dir=""auto"">Â Ia..."


# Cleaning Script
## Steps:
### 1. Removing null rows
### 2. Beautiful soup and unescape are used to remove HTML part of the string 
### 3. Merging minority classes(Domestic, Ecash, Self D, Special service) to others
### 4. Formed strings to tokens, followed by the steps of basic cleaning, stop  words removal, lemmatization and then returned the detokenized string 

In [11]:
#removing rows will null mail content and request type
support_email_data = support_email_data.rename({'Request Type':'Request_Type'}, axis=1)
support_email_data = support_email_data[support_email_data['Content'].notna()] 
support_email_data = support_email_data[support_email_data['Request_Type'].notna()] 
support_email_data = support_email_data.reset_index(drop=True)

#Semi cleaning code for cleaning Content 
#Code to remove the garbage text and extracting out the required string
from bs4 import BeautifulSoup
from html import unescape

support_email_data['semi_clean_content'] = '' #Adding a column for clean text to the dataframe 
for i in range(len(support_email_data)):
    toxic_string= support_email_data['Content'][i]
    html_str= unescape(toxic_string)
    soup= BeautifulSoup(html_str)
    clean_str= soup.get_text()
    support_email_data['semi_clean_content'][i]= clean_str

#Removing rows with null semi clean content
support_email_data = support_email_data[support_email_data['semi_clean_content'].notna()] 
support_email_data = support_email_data.reset_index(drop=True)

# Assigning the 4 minority to Others
def merging_minority(df):
    for i in range(len(df)):
        if(df['Request_Type'][i] =='Domestic'):
           df['Request_Type'][i] ='Others' 
        elif(df['Request_Type'][i] =='Ecash'):
           df['Request_Type'][i] ='Others' 
        elif(df['Request_Type'][i] =='Self D'):
           df['Request_Type'][i] ='Others' 
        elif(df['Request_Type'][i] =='Special service'):
           df['Request_Type'][i] ='Others' 
    return df

#Assignining random class content to others class
def merging_minority_2(df):
    for i in range(len(df)):
        if(df['Request_Type'][i]!='Refunds' and df['Request_Type'][i]!='Eticket/Voucher' and df['Request_Type'][i]!='Booking information' and df['Request_Type'][i]!='Cancellation' and df['Request_Type'][i]!='Others' and df['Request_Type'][i]!='Amendment' and df['Request_Type'][i]!='Promotions' and df['Request_Type'][i]!='Booking Query' and df['Request_Type'][i]!='Website Error'):
            df['Request_Type'][i]='Others'
    return df

support_email_data_9_col = copy.deepcopy(support_email_data) 
support_email_data_9_col = merging_minority(support_email_data_9_col)
support_email_data_9_col = merging_minority_2(support_email_data_9_col)


#Choosing required columns of the dataframe 
support_email_data_9_col= support_email_data_9_col[['Request_Type','semi_clean_content','Content']]
#support_email_data_9_col.isnull().sum()

#Cleaning Phase 2
#Tokenising and cleaning the dataset 
# Convert email body to list
data = support_email_data_9_col.semi_clean_content.values.tolist() #Data is a list of strings 

# tokenize - break down each sentence into a list of words
import gensim
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

#Calling above function
data_words = list(sent_to_words(data))       
################################################################################
#Customly cleaning the dataset of tokens
def data_cleaning(transactions):
    #DATA CLEANING(REMOVING NAN, SPEACIAL CHARS AND NUMS, SINGLE CHARS, LEADING AND TRAILING SPACES)
    for i in range(len(transactions)):
        transactions[i] = [x for x in transactions[i] if x != '']  #Removing empty strings from the Lists of transactins
        transactions[i] = [re.sub('[^a-zA-Z]+', ' ', _) for _ in transactions[i]] #Removing Speacial chars and nums
        transactions[i]=  [re.sub(r'\b\w{1,2}\b','', k) for k in transactions[i]] #Removing single characters
        transactions[i] = [re.sub("^\s+|\s+$", "", j, flags=re.UNICODE) for j in transactions[i]] # Removing both leading and trailing spaces
    #CONVERTING ELEMENTS OF LIST TO LOWER CASE
    transactions= [[x.lower() for x in subl] for subl in transactions]
    #transactions = [subl for subl in transactions if len(subl) > 1]
    transactions= [list(filter(None, sublist)) for sublist in transactions]
    return transactions

#Calling above function 
data_words_cleaned= data_cleaning(data_words)
################################################################################
# REMOVE stop_words and lemmatize
# remove stop_words, make bigrams and lemmatize
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words_cleaned)
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_nostops, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#Adding a column for the cleaned text 
from nltk.tokenize.treebank import TreebankWordDetokenizer
support_email_data_9_col_cleaned = pd.DataFrame(columns=['cleaned_body'], index=support_email_data_9_col.index)
for i in range(0, len(data_lemmatized)):
    support_email_data_9_col_cleaned['cleaned_body'][i]= TreebankWordDetokenizer().detokenize(data_lemmatized[i])

support_email_data_9_col['cleaned_body'] = support_email_data_9_col_cleaned['cleaned_body'].values

#Removing rows with null cleaned_body and null Request_Type
support_email_data_9_col = support_email_data_9_col[support_email_data_9_col['cleaned_body'].notna()] 
support_email_data_9_col = support_email_data_9_col[support_email_data_9_col['Request_Type'].notna()] 
support_email_data_9_col.reset_index(drop=True, inplace=True)

support_email_data_9_col= support_email_data_9_col[['Request_Type','cleaned_body']]
support_email_data_9_col.to_csv('cleaning_script_testing.csv', index= False)