## NOTE

This file takes as input the file produced by data_cleaning and creates a new, further-cleaned file.

## Data Exploration

`Author: Andrea Zanon`

In [1]:
# numpy
import numpy as np

# pandas
import pandas as pd

# identify digits from text
from string import digits

# regular expressions
import re

# nltk, useful to clean text (its data files may need to be downloaded when using it for the first time)
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# detect language from text
import langid

In [2]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


[nltk_data] Downloading package punkt to /Users/victor/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/victor/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/victor/nltk_data...


True

In [None]:
# load the extraction CSV into a DataFrame and preview its first rows
data = pd.read_csv("/home/azanon/ML_NOAC_NOVA_Extraction.csv")
data.head()

In [5]:
# Rename LastIncomingEmail__c to LastIncomingEmail
# NOTE(review): the cells below read "LastIncomingEmailContent", not
# "LastIncomingEmail" -- confirm this rename still targets an existing column
data = data.rename(columns={"LastIncomingEmail__c": "LastIncomingEmail"})

In [3]:
# work on a copy of data, so that `data` itself is not modified by the cleaning below
df = data.copy()

In [7]:
# count the missing values in each column
df.isnull().sum()

CaseNumber                      0
Type                            0
Topics                       1078
TeamName                        0
RequesterEmail               5076
EmailObject                  1417
LastEmailCCAddress          90924
AttributesURL                   0
ContactAttributesURL        24360
ContactEmail                  344
LastIncomingEmailContent       17
LastEmailCCAddressCount         0
dtype: int64

In [8]:
# Flag the rows whose cc list mentions cma-cgm.
# Plain substring search (no regex); rows without a cc address count as False.
df['CMA_in_cc'] = df['LastEmailCCAddress'].str.contains('cma-cgm', regex=False, na=False)

In [9]:
# Numbers are not relevant for the analysis: delete every digit from the email bodies
digit_remover = str.maketrans('', '', digits)
df["LastIncomingEmailContent"] = df["LastIncomingEmailContent"].str.translate(digit_remover)

In [10]:
# re.findall("\n(.*)\n", s)
# this returns the email sentence by sentence

In [None]:
# preview the first rows of the working copy
df.head()

In [12]:
# drop rows that have NaN in LastIncomingEmailContent
df = df.dropna(subset=["LastIncomingEmailContent"])

# reindex so the labels run 0..n-1 again (the cells below address rows by position).
# drop=True discards the old labels; without it they would be kept as an extra
# "index" column that ends up in the saved CSV.
df = df.reset_index(drop=True)

In [13]:
# new feature: number of emails exchanged in each conversation,
# obtained by counting the "Sent: ... :To:" headers found in the body
sent_header = re.compile("Sent: .* :To:")
df["CountMailsInConversation"] = [
    len(sent_header.findall(str(body)))
    for body in df["LastIncomingEmailContent"]
]

In [None]:
# Extract just the text of the first email of each conversation: the body is cut
# at the first salutation / quoted "From:" header, then cleaned
# (non-English mails discarded, stopwords removed, lowercased, lemmatized).
# The result is the list `only_text`, one entry per row of df, in row order;
# non-English mails are represented by NaN so they can be dropped afterwards.

# define lemmatizer
lemmatizer = WordNetLemmatizer()

# Build the stopword set once: stopwords.words() reads the corpus files on every
# call, so calling it per token is extremely slow, and a set gives O(1) lookups.
# NOTE(review): with no argument this is the union of all languages' stopwords;
# since only English mails are kept, stopwords.words('english') may be what is wanted.
stop_words = set(stopwords.words())

# Markers at which the first email is considered finished. Compiled once, and
# written on a single logical line: a backslash continuation inside the string
# literal would embed the next line's indentation into the pattern, so that the
# "From: ... <...>" alternative would only match after a long run of spaces.
# NOTE(review): the markers are matched case-insensitively anywhere in the text,
# so e.g. "best" inside an ordinary sentence also cuts the mail -- confirm intended.
salutation_pattern = re.compile(
    'Best|Regards|Rgds|Warmest|Warmly|Looking forward|Be well|Yours Truly|Sincerely|Booking ref|'
    'From: .* <.*>',
    flags=re.IGNORECASE,
)

only_text = []
for body in df["LastIncomingEmailContent"]:

    # get text before the first of the salutations
    text = salutation_pattern.split(body, maxsplit=1)[0]

    # remove if language is not english (store NaN, the rows are dropped later).
    # langid.classify returns a (language, score) tuple, so compare its first item.
    language = langid.classify(text)[0]
    if language != 'en':
        only_text.append(np.nan)
        continue  # go to next iteration

    # line breaks and non-breaking spaces become ordinary spaces, so that the
    # words on either side of them are not glued together
    for ch in ['\n', '\xa0']:
        text = text.replace(ch, ' ')

    # remove stopwords and keep everything lowercase; the comparison is done on
    # the lowercased token because the stopword lists are lowercase
    lowered_tokens = [token.lower() for token in word_tokenize(text)]
    kept_tokens = [token for token in lowered_tokens if token not in stop_words]

    # word lemmatization
    only_text.append(" ".join(lemmatizer.lemmatize(token) for token in kept_tokens))

In [None]:
# append to new dataframe column
df['only_text'] = only_text

# remove nan elements:
# drop rows that have NaN in only_text
df = df.dropna(subset=["only_text"])

# reindex so the labels run 0..n-1 again.
# drop=True discards the old labels; without it they would be inserted as one
# more column ("index", or "level_0" when "index" already exists) and be
# written to the output file.
df = df.reset_index(drop=True)

In [None]:
# save the cleaned DataFrame as CSV; index=False keeps the row labels out of the file
df.to_csv('/home/azanon/ML_NOAC_NOVA_Extraction_Cleaned_New.csv', index=False)