In [55]:
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

In [63]:
# The dataset is confidential, so it is read from a folder on the local machine.
data_path = "/Users/victor/Documents/Confidential Dataset/ML_NOAC_NOVA_Extraction.csv"

# Load the raw extraction into df2; the cleaning cells below work on a copy of it.
df2 = pd.read_csv(filepath_or_buffer=data_path)

In [220]:
# Work on an independent (deep) copy so the raw frame df2 is never modified and the
# cleaning can be restarted from this cell without re-reading the CSV.
df = df2.copy(deep=True)

In [221]:
# 1. Keep only the rows that have an incoming email body.
# 2. Remove the columns that carry no information (per the author's inspection):
#    - 'attributes.type' / 'Contact.attributes.type': always the same string
#      ("Case" or "Contact")
#    - 'Contact': only null values
df = (
    df
    .dropna(subset=['LastIncomingEmail__c'])
    .drop(columns=['attributes.type', 'Contact.attributes.type', 'Contact'])
)

In [222]:
# Add the column that will hold the plain-text email body; every row starts as ''.
df = df.assign(LastIncomingEmailContent='')

In [223]:
# Binarise the Type column: 1 when the case type is 'NOAC', 0 for anything else
# (missing values compare unequal and therefore also become 0).
df['Type'] = df['Type'].eq('NOAC').astype('int64')

In [224]:
# Number of CC addresses per email: the count of ';' in LastEmailCCAddress__c plus one.
# Rows whose CC field is NaN have no count and are set to 0 before the cast to int.
df['LastEmailCCAddressCount'] = (
    df['LastEmailCCAddress__c']
    .str.count(';')
    .add(1)
    .fillna(0)
    .astype(int)
)

In [225]:
# Flag (1/0) the rows where SuppliedEmail and Contact.Email hold the same value.
# The element-wise comparison yields the same flags as a row-wise apply (NaN never
# equals NaN, so rows with a missing address are flagged 0) without running a
# Python-level function once per row.
test = (df['SuppliedEmail'] == df['Contact.Email']).astype('int64')

In [226]:
# Strip the "Case #<CaseNumber>" reference from each email subject.
# Values that are not strings (e.g. NaN for a missing subject) are passed through
# unchanged instead of raising AttributeError on .replace(). Iterating over the two
# columns directly also avoids building a Series for every row, as apply(axis=1) does.
df['EmailTemplateSubjectDispute__c'] = [
    subject.replace("Case #" + str(case_number), '') if isinstance(subject, str) else subject
    for subject, case_number in zip(df['EmailTemplateSubjectDispute__c'], df['CaseNumber'])
]

In [227]:
# Reduce Contact.attributes.url to the bare Contact id by removing the REST API prefix.
# The .str accessor leaves missing URLs as NaN; casting the column with astype(str)
# first would turn them into the literal string 'nan'.
# regex=False makes the prefix a plain substring (its dots are not wildcards).
df['Contact.attributes.url'] = df['Contact.attributes.url'].str.replace(
    '/services/data/v42.0/sobjects/Contact/', '', regex=False
)

In [228]:
# Reduce attributes.url to the bare Case id: cast the column to str, then remove
# the '/services/data/v42.0/sobjects/Case/' prefix as a plain (non-regex) substring.
df['attributes.url'] = (
    df['attributes.url']
    .astype(str)
    .str.replace('/services/data/v42.0/sobjects/Case/', '', regex=False)
)

In [229]:
# Author's observation: SuppliedEmail and Contact.Email agree on about 85% of the rows,
# and where they differ one of the two is NaN while the other holds the address.
# Contact.Email is therefore completed with SuppliedEmail wherever it is missing...
df['Contact.Email'] = df['Contact.Email'].mask(df['Contact.Email'].isna(), df['SuppliedEmail'])

# ...after which SuppliedEmail is redundant and is removed.
df = df.drop(columns='SuppliedEmail')

In [230]:
# Replace missing CC lists with an empty string (only this column is filled).
df = df.fillna({'LastEmailCCAddress__c': ''})

In [231]:
# Give every column a clean name in a single pass: drop the Salesforce '__c' suffix,
# remove the dots, and call the email subject 'EmailObject'.
df = df.rename(columns={
    'Topics__c': 'Topics',
    'LastIncomingEmail__c': 'LastIncomingEmail',
    'TeamName__c': 'TeamName',
    'RequesterEmail__c': 'RequesterEmail',
    'EmailTemplateSubjectDispute__c': 'EmailObject',
    'LastEmailCCAddress__c': 'LastEmailCCAddress',
    'attributes.url': 'AttributesURL',
    'Contact.attributes.url': 'ContactAttributesURL',
    'Contact.Email': 'ContactEmail',
})

In [232]:
# Convert the HTML body of every email to plain text with BeautifulSoup.
# The texts are collected in one pass over the LastIncomingEmail column and assigned
# in a single operation, instead of materialising a Series per row with iterrows()
# and writing the frame one cell at a time with df.at.
# NOTE(review): the "html" feature lets BeautifulSoup choose among the installed HTML
# parsers; name one explicitly (e.g. "html.parser" or "lxml") if the extracted text
# must be identical across machines.
df["LastIncomingEmailContent"] = [
    BeautifulSoup(raw_html, "html").text
    for raw_html in tqdm(df["LastIncomingEmail"], total=df.shape[0])
]

100%|██████████| 251234/251234 [17:45<00:00, 235.83it/s]


In [233]:
# The raw HTML is no longer needed once its text lives in LastIncomingEmailContent.
df = df.drop(columns='LastIncomingEmail')

In [234]:
# Preview the first five rows of the cleaned frame.
df.head(n=5)

Unnamed: 0,CaseNumber,Type,Topics,TeamName,RequesterEmail,EmailObject,LastEmailCCAddress,AttributesURL,ContactAttributesURL,ContactEmail,LastIncomingEmailContent,LastEmailCCAddressCount
0,100124562,0,T039,US-NOR-CMA-BKG-PREFERRED,,NAM4999926A,op@easywayintl.com,5005p00002evRjHAAU,0035p00003aZAdZAAW,usa.wcportops@cma-cgm.com,\n\n\nThank you Jessica! Await your further on...,1
3,100080371,0,T036,US-BNA-CMA-CS-IMP-DIVERSIONS,,MOGELIJKE COD'S F119086 // RTM,janneke.van.leeuwen@fclmarine.nl; executivedes...,5005p00002evWsuAAE,0030N00003N771RQAR,janneke.van.leeuwen@fclmarine.nl,"\nHi Nese,\n \nInmiddels de aangepaste bl’s al...",5
4,100095395,0,T035,US-NOR-CMA-BKG-PREFERRED,tina@generallogistics.net,AMEND NAM5088900 [ ref:_00D,bookings@generallogistics.net,5005p00002evX4SAAU,0030N00002x1JqPQAU,tina@generallogistics.net,"\nHello,\nPlease SPLIT subject line booking to...",1
5,100097054,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,usa.cmaexportactive@usa.cma-cgm.com,5005p00002evXIsAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",1
6,100097055,0,T847,US-NOR-CMA-BKG-SPECIALTY,bookings@delongcompany.com,FW: CMA CGM - Cut-off date cha,,5005p00002evXItAAM,0030N00002tXvvIQAS,bookings@delongcompany.com,"CMA CGM - Cut-off date change at - NEW YORK, N...",0


In [235]:
# Write the cleaned frame to the local confidential folder; index=False keeps the
# DataFrame index out of the file.
df.to_csv(
    path_or_buf='/Users/victor/Documents/Confidential Dataset/ML_NOAC_NOVA_Extraction_Cleaned.csv',
    index=False,
)

## Ideas for feature creation:

#### EmailTemplateSubjectDispute__c
1. Specific keywords: "RE" or "BookingConfirmation"

#### SuppliedEmail
1. Extract the company name from each address and group the SuppliedEmail values by company?


#### LastIncomingEmail__c
1. Number of words in the email
2. Number of sentences in the email


0.0      58898
1.0      36313
2.0      20950
3.0      13868
4.0       8722
5.0       6132
6.0       3990
7.0       2748
8.0       2083
9.0       1424
10.0      1074
11.0       780
12.0       646
13.0       462
14.0       379
17.0       289
15.0       268
16.0       207
19.0       114
18.0       101
24.0        98
25.0        72
28.0        62
76.0        59
23.0        56
22.0        53
55.0        45
29.0        44
20.0        41
26.0        41
32.0        39
27.0        34
21.0        33
33.0        22
30.0        19
36.0        18
31.0        14
43.0        13
59.0        11
75.0        11
35.0        11
56.0        10
70.0         9
65.0         9
34.0         8
51.0         6
42.0         4
129.0        4
64.0         3
58.0         2
39.0         2
122.0        2
87.0         1
72.0         1
62.0         1
40.0         1
120.0        1
82.0         1
81.0         1
Name: LastEmailCCAddress__c, dtype: int64

In [35]:




# df_sample['EmailTemplateSubjectDispute__c'] = df_sample['EmailTemplateSubjectDispute__c'].apply(lambda x: 1 if x == 'NOAC' else 0)

# .str.replace('CaseNumber', '')

In [36]:
# Display the df_sample frame.
# NOTE(review): df_sample is not assigned in any cell visible here — confirm where it
# is created, otherwise this cell fails on a fresh kernel.
df_sample

Unnamed: 0,CaseNumber,Type,Topics__c,TeamName__c,RequesterEmail__c,EmailTemplateSubjectDispute__c,LastEmailCCAddress__c,SuppliedEmail,LastIncomingEmail__c,attributes.type,attributes.url,Contact.attributes.type,Contact.attributes.url,Contact.Email,email_content,content
239190,101990936,NOAC,T701,US-NOR-CMA-CS-STARS-VANGUARD,usa.ecgccargoreadiness@cma-cgm.com,SAX//NAM5103781 [ ref:_00D0,,usa.ecgccargoreadiness@cma-cgm.com,"<div class=""WordSection1"">\n<p class=""MsoNorma...",Case,/services/data/v42.0/sobjects/Case/5005p00002h...,Contact,/services/data/v42.0/sobjects/Contact/0035p000...,usa.ecgccargoreadiness@cma-cgm.com,,"\nEwelina,\n \nThe port cut off is today at 16..."
172026,101316477,CRFC,T036,US-NOR-CMA-EXP-BCO-3,lcarlotti@laufer.com,RE: CMA CGM - Booking confirma,usa.cmaexportactive@usa.cma-cgm.com,lcarlotti@laufer.com,CMA CGM - Booking confirmation available – NAM...,Case,/services/data/v42.0/sobjects/Case/5005p00002h...,Contact,/services/data/v42.0/sobjects/Contact/0035p000...,lcarlotti@laufer.com,,CMA CGM - Booking confirmation available – NAM...
45750,100165528,NOAC,T701,US-NOR-CMA-IMP-PREFERRED,usa.bsnyder@usa.cma-cgm.com,RE: AMC1643489,usa.bsnyder@usa.cma-cgm.com,usa.bsnyder@usa.cma-cgm.com,"<div class=""WordSection1"">\n<p class=""MsoNorma...",Case,/services/data/v42.0/sobjects/Case/5005p00002g...,Contact,/services/data/v42.0/sobjects/Contact/0030N000...,usa.bsnyder@usa.cma-cgm.com,,"\nGood afternoon Sandy, \n \nWorking on gettin..."
298610,102540443,BREQ,T035,US-NOR-CMA-EXP-PMR-CELLMARK,marge.baldwin@cellmark.com,"Re: Attn Marge, Booking Declin",usa.cmaexportactive@usa.cma-cgm.com; cynthia.r...,marge.baldwin@cellmark.com,**** THIS MESSAGE WAS TRUNCATED ****<br><br>Se...,Case,/services/data/v42.0/sobjects/Case/5005p00002h...,Contact,/services/data/v42.0/sobjects/Contact/0030N000...,marge.baldwin@cellmark.com,,**** THIS MESSAGE WAS TRUNCATED ****Sea priori...
301003,101305441,BREQ,T035,US-NOR-ANL-EXP,cmh-au-paccarsupport@craneww.com,"BOOKING REQUEST, WEB, CARR=ANL",,cmh-au-paccarsupport@craneww.com,"<div style=""font-family: Consolas, monaco, mon...",Case,/services/data/v42.0/sobjects/Case/5005p00002h...,Contact,/services/data/v42.0/sobjects/Contact/0035p000...,cmh-au-paccarsupport@craneww.com,,\n
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191276,101524469,RFCU,T036,US-NOR-CMA-IMP-SPECIALIST,jdavis@camelbak.com,RE: VISTA DAILY CALL OUTS 3-8,vtsai@bushnell.com; valeria.wright@vistaoutdoo...,jdavis@camelbak.com,"<div style=""background-color: #FFEB9C; width: ...",Case,/services/data/v42.0/sobjects/Case/5005p00002h...,,,,,\nCAUTION: This email originated from outside ...
140845,101014461,IICH,T037,US-NOR-CMA-CC-Cargo Flow,usa.wfancher@usa.cma-cgm.com,PriLevel=Normal - 2022-03-23 -,,usa.wfancher@usa.cma-cgm.com,"<p><font face=""Calibri"">Hello,</font> </p>\n<p...",Case,/services/data/v42.0/sobjects/Case/5005p00002g...,Contact,/services/data/v42.0/sobjects/Contact/0030N000...,usa.wfancher@usa.cma-cgm.com,,"Hello, \nPlease find your request below, \nBoo..."
184396,101387301,BREQ,T035,US-NOR-CMA-BKG-PREFERRED,margaret.zawadzka@vanguardlogistics.com,"BOOKING REQUEST, WEB, CARR=CMA",usa.cmaexportactive@usa.cma-cgm.com,margaret.zawadzka@vanguardlogistics.com,"<div style=""font-family: Consolas, monaco, mon...",Case,/services/data/v42.0/sobjects/Case/5005p00002h...,Contact,/services/data/v42.0/sobjects/Contact/0030N000...,margaret.zawadzka@vanguardlogistics.com,,\n
114167,100618619,NOAC,T848,US-NOR-CMA-IMP-PREFERRED,disops@dapecon.com,"Re: DAPE, Delivery ?RE: USNYC",lcancela@dapecon.com; dispatchnj@dapecon.com; ...,disops@dapecon.com,"<div dir=""ltr"">\n<div class=""gmail_default"" st...",Case,/services/data/v42.0/sobjects/Case/5005p00002g...,Contact,/services/data/v42.0/sobjects/Contact/0030N000...,disops@dapecon.com,,\nHello \n\nMy apologies for the late reply.\n...


In [34]:
#concat "AA" and "BB"


TypeError: decoding str is not supported

In [32]:
# Number of cases per requester address, most frequent first.
# The column is addressed by its post-cleaning name: RequesterEmail__c is renamed to
# RequesterEmail by the renaming cell above, so the old name raises KeyError when the
# notebook is run from top to bottom.
df["RequesterEmail"].value_counts()

ssc.bizintelligence@cma-cgm.com           6235
usa.ecgccargoreadiness@cma-cgm.com        6124
usa.cargoflowustp@usa.cma-cgm.com         2481
usa.wccargoreadiness@cma-cgm.com          2446
miamiro-northsouthservices@cma-cgm.com    2153
                                          ... 
bob.russell@chartlandco.com                  1
mbermejo@anderinger.com                      1
ssc.nlimport@cma-cgm.com                     1
smilinghomeusa2@gmail.com                    1
ssc.snakti@cma-cgm.com                       1
Name: RequesterEmail__c, Length: 21699, dtype: int64