In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning) 

import os
import re
from datetime import datetime
from collections import Counter
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from category_encoders import TargetEncoder, CatBoostEncoder

In [4]:
# Read the raw train/test splits.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# Pull the label column out of the training frame and keep it as a plain array.
targets = train.pop("label").values

# "Unnamed: 0" is dropped from both frames so only feature columns remain.
train = train.drop(columns=["Unnamed: 0"])
test = test.drop(columns=["Unnamed: 0"])

print(train.shape, test.shape)

(80176, 12) (34365, 12)


In [5]:
# Column groups used by the feature-engineering cells below.
categorical_features = [
    "org",
    "tld",
    "mail_type",
]
binary_features = [
    "ccs",
    "bcced",
    "salutations",
    "designation",
]
numerical_features = [
    "images",
    "urls",
    "chars_in_subject",
    "chars_in_body",
]

# NOTE(review): `target` is not referenced anywhere else in the visible
# notebook, and the label column read from train.csv is "label" — confirm
# whether this name is still needed.
target = "target"

### Processing the date feature

In [6]:
def parsedate(s : str) -> datetime:
    """Parse a raw date string into a timezone-aware ``datetime``.

    The day/month/year, the ``HH:MM:SS`` time and the ``+HHMM``/``-HHMM``
    offset are extracted independently with regular expressions and then
    re-assembled into a canonical ``'%d %b %Y %H:%M:%S %z'`` string.

    Parameters
    ----------
    s : str
        Raw date string.

    Returns
    -------
    datetime
        Timezone-aware datetime. When no offset is found in ``s`` the
        offset defaults to ``+0000``.

    Raises
    ------
    ValueError
        If the re-assembled string matches neither the four-digit-year nor
        the two-digit-year format (e.g. the date or time part is missing).
    """
    if s == '11-MAR-2018 20:40:58':
        # Hard-coded special case: this dash-separated value does not fit the
        # extraction patterns below (its "-2018" would be read as an offset).
        d = ' '.join(['11 Mar 2018', '20:40:58', '+0000'])
    else:
        # NOTE(review): '0580' is rewritten to '0530' anywhere in the string,
        # presumably to repair a malformed +0580 offset — confirm.
        s = s.replace('0580', '0530')
        s = s.replace('-0000', '+0000')
        # Each pattern ends in `|$` so that `search` always returns a match
        # object; a missing component yields the empty string.
        tz = re.search(r'[+-]{1}[0-9]{4}|$', s).group()
        if tz == '':
            tz = '+0000'
        date = re.search(r'[0-9]{1,2}\s+[A-Za-z]{3}\s+[0-9]{2,4}|$', s).group()
        time = re.search(r'[0-9]{2}:[0-9]{2}:[0-9]{2}|$', s).group()
        d = ' '.join([date, time, tz])
    try:
        res = datetime.strptime(d, '%d %b %Y %H:%M:%S %z')
    except ValueError:
        # Four-digit-year format did not match: retry with a two-digit year.
        res = datetime.strptime(d, '%d %b %y %H:%M:%S %z')
    return res

# Replace the raw date strings in both frames with parsed datetimes.
train["date"] = train["date"].map(parsedate)
test["date"] = test["date"].map(parsedate)

### Adding date features

In [7]:
# Calendar / clock components derived from the parsed "date" column.
# Dict order fixes the order in which the new columns are appended.
date_part_extractors = {
    "year": lambda d: d.year,
    "month": lambda d: d.month,
    "weekday": lambda d: d.weekday(),
    "hour": lambda d: d.hour,
    "minute": lambda d: d.minute,
    "second": lambda d: d.second,
}

for part_name, extract_part in date_part_extractors.items():
    train[part_name] = train["date"].map(extract_part)
    test[part_name] = test["date"].map(extract_part)

# NOTE: a "weekend" flag was sketched here previously but left disabled. If it
# is re-enabled, use `n >= 5` (weekday() is 5 for Saturday and 6 for Sunday)
# and derive the test column from `test`, not `train`.

# Name of the UTC offset attached to each datetime, used as a categorical.
train['timezone'] = train['date'].apply(lambda x: x.tzname())
test['timezone'] = test['date'].apply(lambda x: x.tzname())

# Guarded so that re-running this cell does not register the column twice
# (a duplicate entry would make later cells process "timezone" two times).
if "timezone" not in categorical_features:
    categorical_features.append("timezone")

### Cleaning categorical variables

In [8]:
def _normalise_category(value):
    """Return `value` stripped and lower-cased; plain floats are returned unchanged."""
    if type(value) == float:
        return value
    return value.strip().lower()


for categorical_feature in categorical_features:
    for frame in (train, test):
        # Missing entries become the literal category "missing" before normalising.
        filled = frame[categorical_feature].fillna("missing")
        frame[categorical_feature] = filled.map(_normalise_category)

### Adding counts for categorical variables

for categorical_feature in categorical_features:

    # Frequency of every category, measured on the training frame only.
    train_counts = train[categorical_feature].value_counts()
    fallback_count = train_counts.mean()
    count_lookup = train_counts.to_dict()

    count_column = f"{categorical_feature}_count"
    train[count_column] = train[categorical_feature].map(count_lookup)
    # Categories absent from train fall back to the mean training count.
    test[count_column] = test[categorical_feature].map(
        lambda key: count_lookup.get(key, fallback_count)
    )
                                                                         


### Adding new features

In [9]:
# Combined length of subject and body.
train["full_content_length"] = train["chars_in_body"] + train["chars_in_subject"]
test["full_content_length"] = test["chars_in_body"] + test["chars_in_subject"]

# Sum of the bcc and cc columns.
train["cc_bcc"] = train["bcced"] + train["ccs"]
test["cc_bcc"] = test["bcced"] + test["ccs"]

# Impute missing values with the *training* median for both frames.
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form raises a FutureWarning in
# pandas 2.x and stops modifying the frame once Copy-on-Write is enabled.
for f in ["chars_in_subject", "full_content_length"]:
    train_median = train[f].median()
    train[f] = train[f].fillna(train_median)
    test[f] = test[f].fillna(train_median)

### Encoding the categorical features

In [10]:
# Encoding strategy for the categorical columns; also used as the name of the
# output sub-directory when the processed frames are saved.
method = "target"
assert method in ["label", "target", "onehot", "catboost"]

if method == "label":
    # Integer codes. The encoder is fitted on train + test values so that
    # categories occurring only in test still receive a code.
    for categorical_feature in categorical_features:
        train[categorical_feature] = train[categorical_feature].fillna("missing")
        test[categorical_feature] = test[categorical_feature].fillna("missing")

    for categorical_feature in categorical_features:
        le = LabelEncoder()
        le.fit(train[categorical_feature].tolist() + test[categorical_feature].tolist())
        train[categorical_feature] = le.transform(train[categorical_feature])
        test[categorical_feature] = le.transform(test[categorical_feature])

elif method == "target":
    # Encoder is fitted on train (with its labels) and then applied to test.
    target_encoder = TargetEncoder(cols=categorical_features)
    train = target_encoder.fit_transform(train, targets)
    test = target_encoder.transform(test)

elif method == "catboost":
    # Same fit-on-train / apply-to-test flow with the CatBoost encoder.
    catboost_encoder = CatBoostEncoder(cols=categorical_features)
    train = catboost_encoder.fit_transform(train, targets)
    test = catboost_encoder.transform(test)

elif method == "onehot":
    for categorical_feature in categorical_features:
        train[categorical_feature] = train[categorical_feature].fillna("missing")
        test[categorical_feature] = test[categorical_feature].fillna("missing")

    n_train_rows = len(train)
    dummy_train_parts = []
    dummy_test_parts = []

    for categorical_feature in categorical_features:
        # Build the dummies on train + test together so both frames get the
        # same indicator columns. Only the needed column is stacked rather
        # than the whole frames.
        stacked = pd.concat(
            [train[categorical_feature], test[categorical_feature]],
            axis=0,
            ignore_index=True,
        )
        dummy = pd.get_dummies(stacked, prefix=f"{categorical_feature}__")

        # Split back by position; reset_index returns fresh frames, so no
        # in-place mutation of a slice is needed.
        dummy_train_parts.append(dummy.iloc[:n_train_rows].reset_index(drop=True))
        dummy_test_parts.append(dummy.iloc[n_train_rows:].reset_index(drop=True))

    train = pd.concat([train] + dummy_train_parts, axis=1)
    test = pd.concat([test] + dummy_test_parts, axis=1)

    # Indicator columns are recognised by the "__" that the prefix introduces.
    binary_columns = [c for c in train.columns if "__" in c]
    train = train.drop(columns=categorical_features)
    test = test.drop(columns=categorical_features)

    # Compress the indicator columns to 50 components. The seed is fixed so
    # that repeated runs produce the same components.
    svd = TruncatedSVD(n_components=50, random_state=42)
    svd.fit(train[binary_columns])

    print("explained ratio :", svd.explained_variance_ratio_.sum())

    reduced_binary_train = svd.transform(train[binary_columns])
    reduced_binary_test = svd.transform(test[binary_columns])

    train = train.drop(columns=binary_columns)
    test = test.drop(columns=binary_columns)

    # Attach the components using each frame's own index so rows stay aligned.
    train = pd.concat(
        [train, pd.DataFrame(reduced_binary_train, index=train.index)], axis=1
    )
    test = pd.concat(
        [test, pd.DataFrame(reduced_binary_test, index=test.index)], axis=1
    )

### Saving processed data

In [11]:
# Re-attach the labels that were split off when the data was loaded.
train = train.assign(label=targets)

In [12]:
# The parsed datetime column itself is not written out; only the columns
# derived from it are kept.
train = train.drop(columns="date")
test = test.drop(columns="date")

In [13]:
# Final (rows, columns) of both frames before saving.
print(*(frame.shape for frame in (train, test)))

(80176, 21) (34365, 20)


In [42]:
# Write the processed frames to a directory named after the encoding method.
# The directory is created first so that switching `method` (or running on a
# fresh checkout) does not fail at the very last step.
output_dir = f"../data/processed/{method}"
os.makedirs(output_dir, exist_ok=True)

train.to_csv(f"{output_dir}/train.csv", index=False)
test.to_csv(f"{output_dir}/test.csv", index=False)