# \## NOTE
This notebook takes as input the file produced by data_cleaning and creates a new, further-cleaned file.
# \###

## Data Exploration

`Author: Andrea Zanon`

In [1]:
import numpy as np
import pandas as pd
from string import digits
import re

In [34]:
# Read the CSV made in data_cleaning and preview its first rows
csv_path = "/home/azanon/ML_NOAC_NOVA_Extraction_Cleaned.csv"
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,CaseNumber,Type,Topics,TeamName,RequesterEmail,EmailObject,LastEmailCCAddress,AttributesURL,ContactAttributesURL,ContactEmail,LastIncomingEmailContent,LastEmailCCAddressCount
0,100124562,0,T039,US-NOR-CMA-BKG-PREFERRED,,NAM4999926A,op@easywayintl.com,5005p00002evRjHAAU,0035p00003aZAdZAAW,usa.wcportops@cma-cgm.com,\n\n\nThank you Jessica! Await your further on...,1
1,100080371,0,T036,US-BNA-CMA-CS-IMP-DIVERSIONS,,MOGELIJKE COD'S F119086 // RTM,janneke.van.leeuwen@fclmarine.nl; executivedes...,5005p00002evWsuAAE,0030N00003N771RQAR,janneke.van.leeuwen@fclmarine.nl,"\nHi Nese,\n \nInmiddels de aangepaste bl’s al...",5
2,100095395,0,T035,US-NOR-CMA-BKG-PREFERRED,tina@generallogistics.net,AMEND NAM5088900 [ ref:_00D,bookings@generallogistics.net,5005p00002evX4SAAU,0030N00002x1JqPQAU,tina@generallogistics.net,"\nHello,\nPlease SPLIT subject line booking to...",1
3,100097054,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,usa.cmaexportactive@usa.cma-cgm.com,5005p00002evXIsAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",1
4,100097055,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,,5005p00002evXItAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",0


In [35]:
# Leave `data` untouched: all later transformations are applied to this separate copy
df = data.copy(deep=True)

In [36]:
# Number of missing values in each column
df.isna().sum()

CaseNumber                      0
Type                            0
Topics                       1078
TeamName                        0
RequesterEmail               5076
EmailObject                  1417
LastEmailCCAddress          90924
AttributesURL                   0
ContactAttributesURL        24360
ContactEmail                  344
LastIncomingEmailContent       17
LastEmailCCAddressCount         0
dtype: int64

In [37]:
# Flag the rows whose CC list contains the literal text 'cma-cgm';
# rows without any CC address are flagged False
cc_addresses = df['LastEmailCCAddress']
df['CMA_in_cc'] = cc_addresses.str.contains('cma-cgm', regex=False, na=False)

In [38]:
# Numbers are treated as irrelevant: delete every 0-9 character from the email content
digit_remover = str.maketrans('', '', digits)
email_content = df["LastIncomingEmailContent"]
df["LastIncomingEmailContent"] = email_content.str.translate(digit_remover)

In [7]:
# re.findall("\n(.*)\n", s)
# this returns the text found between newline characters, i.e. the email roughly line by line rather than sentence by sentence

In [39]:
# Preview the first five rows of the working copy
df.head(n=5)

Unnamed: 0,CaseNumber,Type,Topics,TeamName,RequesterEmail,EmailObject,LastEmailCCAddress,AttributesURL,ContactAttributesURL,ContactEmail,LastIncomingEmailContent,LastEmailCCAddressCount,CMA_in_cc
0,100124562,0,T039,US-NOR-CMA-BKG-PREFERRED,,NAM4999926A,op@easywayintl.com,5005p00002evRjHAAU,0035p00003aZAdZAAW,usa.wcportops@cma-cgm.com,\n\n\nThank you Jessica! Await your further on...,1,False
1,100080371,0,T036,US-BNA-CMA-CS-IMP-DIVERSIONS,,MOGELIJKE COD'S F119086 // RTM,janneke.van.leeuwen@fclmarine.nl; executivedes...,5005p00002evWsuAAE,0030N00003N771RQAR,janneke.van.leeuwen@fclmarine.nl,"\nHi Nese,\n \nInmiddels de aangepaste bl’s al...",5,True
2,100095395,0,T035,US-NOR-CMA-BKG-PREFERRED,tina@generallogistics.net,AMEND NAM5088900 [ ref:_00D,bookings@generallogistics.net,5005p00002evX4SAAU,0030N00002x1JqPQAU,tina@generallogistics.net,"\nHello,\nPlease SPLIT subject line booking to...",1,False
3,100097054,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,usa.cmaexportactive@usa.cma-cgm.com,5005p00002evXIsAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",1,True
4,100097055,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,,5005p00002evXItAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",0,False


In [69]:
# Drop the rows that have NaN in LastIncomingEmailContent, then renumber the
# remaining rows 0..n-1 so that no index value is missing.
#
# reset_index(drop=True) is required for the renumbering: df.reindex(range(n))
# would instead select the *labels* 0..n-1, which re-creates the dropped rows as
# all-NaN rows and discards every row whose label is >= n.
df = df.dropna(subset=["LastIncomingEmailContent"]).reset_index(drop=True)

In [104]:
# New feature: how many emails are exchanged in each conversation, obtained by
# counting the "Sent: ... :To:" headers found in the content.
# NOTE(review): `.` does not match a newline and `.*` is greedy, so several
# headers sitting on the same line are counted as a single match.
sent_header = re.compile("Sent: .* :To:")

# Map over the column values instead of looking up label i for every position i,
# so the count stays aligned with its row whatever the index looks like.
# str(...) keeps non-string values (e.g. NaN) countable; they simply yield 0.
df["CountMailsInConversation"] = df["LastIncomingEmailContent"].map(
    lambda content: len(sent_header.findall(str(content)))
)

In [167]:
# Extract just the text of the first email: the content is cut at the first
# salutation, unneeded characters are removed, then stopwords are dropped.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
#nltk.download('stopwords')

# Compiled once rather than on every iteration.
# NOTE(review): the alternatives carry no word boundaries, so a word such as
# "best" in the middle of a sentence also cuts the text at that point.
salutation_pattern = re.compile(
    'Best|Regards|Rgds|Warmest|Warmly|Looking forward|Be well|Yours Truly|Sincerely|Booking ref',
    flags=re.IGNORECASE,
)

# Build the stopword collection a single time; it used to be rebuilt for every
# token of every email. A set also makes each membership test constant-time.
# NOTE(review): the comparison is case-sensitive, as in the original filter.
all_stopwords = set(stopwords.words())


def _first_email_text(content):
    """Return the stopword-free text that precedes the first salutation.

    Parameters
    ----------
    content : object
        One value of the LastIncomingEmailContent column.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or '' when `content`
        is not a string (e.g. NaN), which re.split could not process.
    """
    if not isinstance(content, str):
        return ''

    # keep only what comes before the first salutation
    text = salutation_pattern.split(content)[0]

    # Newlines and non-breaking spaces are not needed, but they separate words:
    # replace them with a space so the last word of a line is not fused with
    # the first word of the next line before tokenizing.
    for ch in ('\n', '\xa0'):
        text = text.replace(ch, ' ')

    # remove stopwords
    return ' '.join(word for word in word_tokenize(text) if word not in all_stopwords)


# Iterate over the values themselves so the result does not depend on the
# DataFrame having a 0..n-1 index.
only_text = [_first_email_text(content) for content in df["LastIncomingEmailContent"]]