# **Fraud Detection**

In [None]:

import numpy as np
import pandas as pd 

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
import xgboost as xgb
import lightgbm as lgb
import catboost
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
import matplotlib.gridspec as gridspec
%matplotlib inline

import warnings
warnings.filterwarnings("ignore")

import gc
gc.enable()

import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        



In [None]:
pd.set_option('display.max_rows', 500)

In [None]:
# folder_path = '../input/ieee-fraud-detection/'
ind = 'TransactionID'

# Read the four raw tables from the current working directory.
train_transaction = pd.read_csv('train_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')
test_transaction = pd.read_csv('test_transaction.csv')
test_identity = pd.read_csv('test_identity.csv')

# Left join keeps every transaction, including those without an identity row.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

The data is split into two datasets, Identity and Transaction. Both contain a column named TransactionID, so we merge the two datasets on that column.

In [None]:
train_transaction.head()

In [None]:
train_identity.head()

In [None]:
test_transaction.head()

In [None]:
train_identity.shape, train_transaction.shape

# Data


In this competition you are predicting the probability that an online transaction is fraudulent, as denoted by the binary target ```isFraud```.

The data is broken into two files **identity** and **transaction**, which are joined by ```TransactionID```. 

> Note: Not all transactions have corresponding identity information.

**Categorical Features - Transaction**

- ProductCD
- emaildomain
- card1 - card6
- addr1, addr2
- P_emaildomain
- R_emaildomain
- M1 - M9

**Categorical Features - Identity**

- DeviceType
- DeviceInfo
- id_12 - id_38

**The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp).**

**Files**

- train_{transaction, identity}.csv - the training set
- test_{transaction, identity}.csv - the test set (**you must predict the isFraud value for these observations**)
- sample_submission.csv - a sample submission file in the correct format


In [None]:
# Report the shape of each raw table, one line per table.
for label, frame in (
    ('train_transaction', train_transaction),
    ('train_identity', train_identity),
    ('test_transaction', test_transaction),
    ('test_identity', test_identity),
):
    print(label + ' has shape', frame.shape)

As we can see, the **test dataset** and the **train dataset** have a similar number of samples. The transaction dataset has many columns with a lot of unknown values, and many features whose meaning we don't know.

Important
> **The TransactionDT feature is a timedelta from a given reference datetime (not an actual timestamp).**

[The timespan of the dataset is 1 year?](https://www.kaggle.com/c/ieee-fraud-detection/discussion/100071#latest-577632) by Suchith

```
Train: min = 86400 max = 15811131
Test: min = 18403224 max = 34214345
```

The difference between train.min() and test.max() is ```x = 34214345 - 86400 = 34127945```, but we don't know whether it is in seconds, minutes or hours.

```
Time span of the total dataset is 394.9993634259259 days
Time span of Train dataset is  181.99920138888888 days
Time span of Test dataset is  182.99908564814814 days
The gap between train and test is 30.00107638888889 days
```

If it is in seconds then dataset timespan will be ```x/(3600*24*365) = 1.0821``` years which seems reasonable to me. So if the **transactionDT** is in **seconds** then

```
Time span of the total dataset is 394.9993634259259 days
Time span of Train dataset is  181.99920138888888 days
Time span of Test dataset is  182.99908564814814 days
The gap between train and test is 30.00107638888889 days
```


In [None]:
train_transaction['TransactionDT']

In [None]:
# Overlay the TransactionDT histograms of train and test on a single axes.
fig, ax = plt.subplots(figsize=(12, 8))

ax.hist(train['TransactionDT'], bins=100, label='train');
ax.hist(test['TransactionDT'], bins=100, label='test');
ax.legend();
ax.set_title('Distribution of Transaction Dates');

   

If the assumption is right that we have data spanning 394 days (roughly a year, including the 30-day gap between train and test), then we can see spikes in the same part of the year.
Based on the competition discussions, the dataset appears to start on 01.12. and end on 31.12.

In [None]:
# The merged train/test frames are used from here on; release the raw tables.
del train_identity
del train_transaction
del test_identity
del test_transaction
gc.collect()

Reducing the memory of numerics

In [None]:
def red_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the smallest fitting dtype.

    Columns whose dtype is int16/int32/int64 or float16/float32/float64 are
    converted based on their observed min and max. Other columns are left
    untouched.

    Note: floats are downcast as far as float16 purely on value range, so
    precision is reduced; upcast (e.g. ``.astype('float32')``) before computing
    aggregate statistics on such columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    numeric_names = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        type_name = str(df[col].dtype)
        if type_name not in numeric_names:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if type_name.startswith('int'):
            # Inclusive bounds: a value equal to a dtype's limit still fits in it.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing smaller fits (or the column is empty and min/max are NaN).
                df[col] = df[col].astype(np.int64)
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Also reached for all-NaN or infinite columns, where comparisons fail.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte frame cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
    
    

In [None]:
# red_mem_usage downcasts in place and returns the same frame. Binding the
# result keeps the cell from emitting a whole DataFrame as its output.
train = red_mem_usage(train)
test = red_mem_usage(test)

## EDA

Let's inspect our data set! First thing that we are going to do is inspect how many missing and unique values are there.

In [None]:
def analize(df):
    """Summarise each column of ``df``: dtype, missing count and distinct count.

    Returns a new DataFrame with one row per column of ``df`` and the columns
    ``Col_name``, ``d_types``, ``Missing_values`` and ``Unique_values``.
    """
    summary = (
        pd.DataFrame(df.dtypes, columns=['d_types'])
        .reset_index()
        .rename(columns={"index": "Col_name"})
    )
    # Positional (array) assignment: the summary has a fresh 0..n-1 index.
    return summary.assign(
        Missing_values=df.isna().sum().to_numpy(),
        Unique_values=df.nunique().to_numpy(),
    )

### Quick overview of a whole data set

In [None]:
def overview(df):
    """Print, for every column of ``df``, its distinct-value count and the values.

    For each column this prints ``<name>: <number of unique non-null values>``,
    then the array returned by ``Series.unique()``, then a blank separator.

    Returns
    -------
    None
    """
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the
    # equivalent iterator and exists in older versions as well.
    for col, values in df.items():
        num_uniques = values.nunique()
        print ('{name}: {num_unique}'.format(name=col, num_unique=num_uniques))
        print (values.unique())
        print ('\n')
    return

## Numerical Data Analysis

### Transaction amount 

Let's inspect the transaction amounts first and see whether the amount has anything to do with fraud. My assumption is that the bigger the transaction amount, the bigger the chance that the transaction is fraudulent. On the other hand, one could argue the opposite: intuitively, someone might commit a small fraud out of fear of getting caught. The data will tell us more.

In [None]:
train[['TransactionAmt']].describe()

As we can see, the result of the describe method isn't what we would have liked, because the mean and std are missing. This happens because the dtype of the column is float16, so we upcast it first:

In [None]:
train[['TransactionAmt']].astype('float32').describe()

In [None]:
# 2x2 grid of TransactionAmt distributions: raw and natural-log values, each
# for all rows and for rows with an amount of at most 1000.
plt.figure(figsize=(16,12))
plt.suptitle('Transaction Amount Distributions', fontsize=18)

plt.subplot(221)
d = sns.distplot(train['TransactionAmt'])
d.set_title("Transaction Amount Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)


plt.subplot(222)
d1 = sns.distplot(train[train['TransactionAmt'] <= 1000]['TransactionAmt'])
d1.set_title("Transaction Amount Distribuition <= 1000", fontsize=18)
d1.set_xlabel("")
d1.set_ylabel("Probability", fontsize=15)

plt.subplot(223)
l = sns.distplot(np.log(train['TransactionAmt']),color='r')
l.set_title("Transaction Amount (Log) Distribuition", fontsize=18)
l.set_xlabel("")
l.set_ylabel("Probability", fontsize=15)

plt.subplot(224)
l1 = sns.distplot(np.log(train[train['TransactionAmt']<=1000]['TransactionAmt']),color='r')
l1.set_title("Transaction Amount (Log) Distribuition <= 1000", fontsize=18)
l1.set_xlabel("")
# The stray trailing plt.figure() call was dropped: it only opened a second,
# empty figure below the grid.
l1.set_ylabel("Probability", fontsize=15);


In [None]:

# Amounts of the transactions labelled as fraud.
fraud_amounts = train.loc[train['isFraud'] == 1, 'TransactionAmt']

plt.figure(figsize=(16, 12))
plt.suptitle('Transaction Amount Distributions, isFraud==1', fontsize=18)

# Left panel: raw amounts.
d = sns.distplot(fraud_amounts, ax=plt.subplot(221))
d.set_title("Transaction Amount Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)

# Right panel: natural log of the same amounts.
d = sns.distplot(np.log(fraud_amounts), color='r', ax=plt.subplot(222))
d.set_title("Transaction Amount (Log) Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)



In [None]:
# Amounts of the transactions labelled as not fraud.
legit_amounts = train.loc[train['isFraud'] == 0, 'TransactionAmt']

plt.figure(figsize=(16, 12))
plt.suptitle('Transaction Amount Distributions, isFraud==0', fontsize=18)

# Left panel: raw amounts.
d = sns.distplot(legit_amounts, color='b', ax=plt.subplot(221))
d.set_title("Transaction Amount Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)

# Right panel: natural log of the same amounts.
d = sns.distplot(np.log(legit_amounts), color='r', ax=plt.subplot(222))
d.set_title("Transaction Amount (Log) Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)

In [None]:
train[train['isFraud']==0]['TransactionAmt'].values.mean()

In [None]:
train[train['isFraud']==1]['TransactionAmt'].values.mean()

In [None]:
train[train['isFraud']==0]['isFraud'].count()/train['isFraud'].count()

As we can see from the code above, the data is imbalanced: about 96.5% of the samples are non-fraudulent and only 3.5% are fraudulent.

### dist1, dist2

We don't know what exactly these 2 numerical features are. They could be distances between billing address, zip code, IP address, phone area...

In [None]:
analize(train[['dist1','dist2']])

In [None]:
# Compute the missing-value percentages from the data instead of hard-coding
# the counts, so the figures stay correct if the frame changes.
dist1_missing_pct = train['dist1'].isnull().mean() * 100
dist2_missing_pct = train['dist2'].isnull().mean() * 100
print("{0:.2f}".format(dist1_missing_pct),'% of missing values in dist1 column and',"{0:.2f}".format(dist2_missing_pct),'% of missing values in dist 2 column')

In [None]:
# Drop missing rows first; each column is kept as a one-column frame.
dist1_known = train[['dist1']].dropna(axis=0)
dist2_known = train[['dist2']].dropna(axis=0)

plt.figure(figsize=(16, 12))
plt.suptitle('dist1 and dist2',fontsize=18)

# Left panel: dist1.
d = sns.distplot(dist1_known, color='b', ax=plt.subplot(221))
d.set_title("dist1 Distribuition", fontsize=18)
d.set_xlabel("")
d.set_ylabel("Probability", fontsize=15)

# Right panel: dist2.
d2 = sns.distplot(dist2_known, color='r', ax=plt.subplot(222))
d2.set_title("dist2 Distribution", fontsize=18)
d2.set_xlabel("")
d2.set_ylabel("Probability", fontsize=15)


### C1-C14

In discussion described as follows:

> C1-C14: counting, such as how many addresses are found to be associated with the payment card, etc. The actual meaning is masked.
“Can you please give more examples of counts in the variables C1-15? Would these be like counts of phone numbers, email addresses, names associated with the user? I can't think of 15.
Your guess is good, plus like device, ipaddr, billingaddr, etc. Also these are for both purchaser and recipient, which doubles the number.”

In [None]:
analize(train[['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']])

In [None]:
overview(train[['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']])

In [None]:
train[['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10','C11','C12','C13','C14']].astype('float32').describe()

### D1-D15

The real meaning behind D and C features is masked and it is hard to find a real meaning of each feature. We can only take a guess for some of the feature's meaning.

> D1-D15: timedelta, such as days between previous transaction, etc.

In [None]:
analize(train[['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15']])

A lot of missing values. Is there any reason behind?

In [None]:
train[['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15']][:20]

We can clearly see some patterns (in most cases):
* When D1==0 D2==Nan and when D1==x, x>0, D2==x
* Same thing with D3 and D5, but happens less often
   

In [None]:
train[['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10','D11','D12','D13','D14','D15']].astype('float32').describe()

From the kernel:

[EDA what's behind D features?](https://www.kaggle.com/akasyanama13/eda-what-s-behind-d-features)

We can see that D3 feature indicates days from the previous transaction.

### V1-V339

> Vxxx: Vesta engineered rich features, including ranking, counting, and other entity relations.
“For example, how many times the payment card associated with a IP and email or address appeared in 24 hours time range, etc.”
"All Vesta features were derived as numerical. some of them are count of orders within a clustering, a time-period or condition, so the value is finite and has ordering (or ranking). I wouldn't recommend to treat any of them as categorical. If any of them resulted in binary by chance, it maybe worth trying."

In [None]:
pd.set_option('display.max_columns',400)
v_col = [c for c in train if c[0] == 'V']
train[v_col].head()

A lot of ones and Nan's.

### id1-id11

> “id01 to id11 are numerical features for identity, which is collected by Vesta and security partners such as device rating, ip_domain rating, proxy rating, etc. Also it recorded behavioral fingerprint like account login times/failed to login times, how long an account stayed on the page, etc. All of these are not able to elaborate due to security partner T&C. I hope you could get basic meaning of these features, and by mentioning them as numerical/categorical, you won't deal with them inappropriately.”



In [None]:
id_col = ['id_01','id_02','id_03','id_04','id_05','id_06','id_07','id_08','id_09','id_10','id_11']
train[id_col].astype('float32').describe()

What is bugging me is why there are so many negative values, because according to Vesta's description these features all seem to be positive. Let's try to analyze some general properties first and maybe come back to this later.

In [None]:
analize(train[id_col])

In [None]:
# Bar chart (log y-axis) of the number of distinct values, NaN included, in
# each numeric id column of train; each bar is annotated with its count.
plt.figure(figsize=(35, 12))
features = list(train[id_col])
uniques = [len(train[col].unique()) for col in features]
sns.set(font_scale=1.2)
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.barplot(x=features, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title='Number of unique values per feature TRAIN')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center") 

In [None]:
test.columns

In [None]:
analize(test)

In [None]:
train.head()

In [None]:
# Rename any "id-N" / "id-0N" column of test to the matching "id_N" / "id_0N"
# (presumably to match train's column names; confirm against the raw files).
dash_to_underscore = {}
for k in range(1, 40):
    if k < 10:
        dash_to_underscore["id-0" + str(k)] = "id_0" + str(k)
    dash_to_underscore["id-" + str(k)] = "id_" + str(k)
test = test.rename(columns=dash_to_underscore)
test.head()

In [None]:
# Bar chart (log y-axis) of the number of distinct values, NaN included, in
# each numeric id column of test; each bar is annotated with its count.
plt.figure(figsize=(35, 12))
features = list(test[id_col])
uniques = [len(test[col].unique()) for col in features]
sns.set(font_scale=1.2)
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.barplot(x=features, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title='Number of unique values per feature TEST')
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 10,
            uniq,
            ha="center") 

In both the test set and the training set, the column with the highest number of unique values is id_02. The test set has fewer missing values.

## Categorical features

Let's start analyzing categorical features. We will start with id categorical features.

In [None]:
# Names of the identity columns id_12 through id_38.
l = ['id_{}'.format(num) for num in range(12, 39)]
train[l].head()

In [None]:
# Select the float-typed id columns. Testing for "any float dtype" rather than
# exactly float16 keeps the cell idempotent: after the upcast below the columns
# are float32, and a float16-only filter would return an empty list on re-run
# (making describe() fail and emptying `n` for the later cells).
n = [i for i in l if pd.api.types.is_float_dtype(train[i])]
# Upcast so that describe() can report mean/std without float16 overflow.
train[n] = train[n].astype('float32')
train[n].describe()


In [None]:
c = [k for k in l if train[k].dtype=='object']
train[c].describe()

In [None]:
analize(train[l])

Some of the features have a lot of unique values (look more like numerical data). 

In [None]:
overview(train[l])

In [None]:
# For every column name in n, plot its train values and then its test values
# as points against TransactionDT, then show the figure.
# NOTE(review): no explicit axes is passed, so whether both series land on the
# same axes depends on pandas' plotting defaults. Confirm if that matters.
for i in n:
    try:
        train.set_index('TransactionDT')[i].plot(style='.', title=i, figsize=(15, 3))
        test.set_index('TransactionDT')[i].plot(style='.', title=i, figsize=(15, 3))
        plt.show()
    except TypeError:
        # Best effort: a column that raises TypeError while plotting is skipped
        # silently. If the train plot fails, the test plot and plt.show() for
        # that column are skipped as well.
        pass

We can see some repetitive behaviour (the first month resembles the last one).

In [None]:
# Correlation heatmap of TransactionDT with the float id columns listed in n.
cols = ['TransactionDT'] + n
corr_matrix = train[cols].corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, center=0.0, cmap='RdBu_r')
plt.title('ID')
plt.show()

Next, we encode the low-cardinality categorical id features as integers.

In [None]:
# id columns whose labels are replaced by the integer codes in `dc`.
enc_c = ['id_12','id_15','id_16','id_27','id_28','id_29','id_34','id_35','id_36','id_37','id_38']
# Remaining object-typed id columns (from `c`) that are left un-encoded.
nenc_c=[k for k in c if k not in enc_c]
dc = {'Unknown':-1,'NotFound':0,'Found':1,'New':2,'F':0,'T':1,'match_status:2':2, 'match_status:1':1, 'match_status:0':0, 'match_status:-1':-1}
for i in enc_c:
    # Map only columns that still hold raw labels. Mapping an already-encoded
    # (numeric) column would turn every value into NaN when the cell is re-run.
    if not pd.api.types.is_numeric_dtype(train[i]):
        train[i]=train[i].map(dc)

In [None]:
# Correlation heatmap of TransactionDT with the integer-encoded id columns.
cols = ['TransactionDT'] + enc_c
corr_matrix = train[cols].corr()

plt.figure(figsize=(15, 15))
sns.heatmap(corr_matrix, annot=True, center=0.0, cmap='RdBu_r')
plt.title('ID')
plt.show()

In [None]:
# One bar chart per un-encoded categorical id column: occurrence counts (log
# y-axis) of the first 20 distinct values, each bar annotated with its count.
for i in nenc_c:
    plt.figure(figsize=(80,30))

    # NOTE: overwrites the column in train, replacing NaN with 'Missing'.
    train[i]=train[i].fillna('Missing')
    features = list(train[i].unique()[:20])
    #if you want to see 10 most frequent values 
    #features = train['DeviceInfo'].value_counts()[:10].index.tolist()
    uniques = [(train[i]==col).sum() for col in features]
    sns.set(font_scale=2)
    # x/y are passed by keyword: current seaborn no longer accepts them positionally.
    ax = sns.barplot(x=features, y=uniques, log=True)
    ax.set(xlabel='Feature', ylabel='log(unique count)', title=i)
    for p, uniq in zip(ax.patches, uniques):
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

### DeviceType,DeviceInfo and ProductCD

> ProductCD: product code, the product for each transaction
“Product isn't necessary to be a real 'product' (like one item to be added to the shopping cart). It could be any kind of service.”

In [None]:
overview(train[['ProductCD']])

In [None]:
# Occurrence count (log y-axis) of every distinct ProductCD value in train.
plt.figure(figsize=(30,15))
i='ProductCD'
#train[i]=train[i].fillna('Missing')
features = list(train[i].unique())
uniques = [(train[i]==col).sum() for col in features]
sns.set(font_scale=1)
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.barplot(x=features, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title=i)
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

It is hard to find out what exactly these features mean. 

In [None]:
overview(train[['DeviceType']])

In [None]:
# Occurrence count (log y-axis) of every distinct DeviceType value in train.
plt.figure(figsize=(30,15))
i='DeviceType'
# NOTE: overwrites the column in train, replacing NaN with 'Missing'.
train[i]=train[i].fillna('Missing')
features = list(train[i].unique())
uniques = [(train[i]==col).sum() for col in features]
sns.set(font_scale=1)
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.barplot(x=features, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title=i)
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

A lot of data missing.

In [None]:
overview(train[['DeviceInfo']])

In [None]:
# Occurrence count (log y-axis) of the 10 most frequent DeviceInfo values.
plt.figure(figsize=(30,15))
i='DeviceInfo'
# NOTE: overwrites the column in train, replacing NaN with 'Missing', so
# 'Missing' is counted like any other value below.
train[i]=train[i].fillna('Missing')
features = train['DeviceInfo'].value_counts()[:10].index.tolist()
uniques = [(train[i]==col).sum() for col in features]
sns.set(font_scale=1)
# x/y are passed by keyword: current seaborn no longer accepts them positionally.
ax = sns.barplot(x=features, y=uniques, log=True)
ax.set(xlabel='Feature', ylabel='log(unique count)', title=i)
for p, uniq in zip(ax.patches, uniques):
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

In [None]:
train['DeviceInfo'].value_counts()[:10].index.tolist()

There is still a lot of missing data but, as expected, Windows is the most frequently used.

### card1-card6 and M1-M9

> card1 - card6: payment card information, such as card type, card category, issue bank, country, etc.

> M1-M9: match, such as names on card and address, etc.

In [None]:
# One bar chart per card*/M* column: occurrence counts (log y-axis) of the
# first 20 distinct values, each bar annotated with its count.
new = ['card1','card2','card3','card4','card5','card6','M1','M2','M3','M4','M5','M6','M7','M8','M9']
for i in new:
    plt.figure(figsize=(25,10))

    # NOTE: overwrites the column in train, replacing NaN with 'Missing'.
    train[i]=train[i].fillna('Missing')
    features = list(train[i].unique()[:20])
    #if you want to see 10 most frequent values 
    #features = train['DeviceInfo'].value_counts()[:10].index.tolist()
    uniques = [(train[i]==col).sum() for col in features]
    sns.set(font_scale=2)
    # x/y are passed by keyword: current seaborn no longer accepts them positionally.
    ax = sns.barplot(x=features, y=uniques, log=True)
    ax.set(xlabel='Feature', ylabel='log(unique count)', title=i)
    for p, uniq in zip(ax.patches, uniques):
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

### addr1, addr2, P_emaildomain and R_emaildomain

I consider these 4 features important because I think they will play a big role in our predictive model. Why do I think that? Because I find them very interesting and think that we can extract some key insights from them about fraud "patterns", and maybe connect them to other features.

>addr: address; 
>“both addresses are for purchaser: addr1 as billing region
addr2 as billing country”

> P_ and (R_) emaildomain: purchaser and recipient email domain; “ certain transactions don't need recipient, so Remaildomain is null.”

In [None]:
overview(train[['addr1','addr2']])

In [None]:
analize(train[['addr1','addr2']])

So as we can see we have 332 billing regions and 74 billing countries.

In [None]:
train[['addr1','addr2']].describe()

In [None]:
(train['addr2']==87).sum()

I find this very interesting. Could it be that value 87 == USA? 

In [None]:
analize(train[['P_emaildomain','R_emaildomain']])

In [None]:
overview(train[['P_emaildomain','R_emaildomain']])

10 most frequently used emails

In [None]:
# For each email-domain column: occurrence counts (log y-axis) of its 10 most
# frequent values, each bar annotated with its count.
for i in ['P_emaildomain','R_emaildomain']:
    plt.figure(figsize=(25,10))

    # NOTE: overwrites the column in train, replacing NaN with 'Missing'.
    train[i]=train[i].fillna('Missing')
    #features = list(train[i].unique()[:20])
    #if you want to see 10 most frequent values 
    features = train[i].value_counts()[:10].index.tolist()
    uniques = [(train[i]==col).sum() for col in features]
    sns.set(font_scale=2)
    # x/y are passed by keyword: current seaborn no longer accepts them positionally.
    ax = sns.barplot(x=features, y=uniques, log=True)
    ax.set(xlabel='Feature', ylabel='log(unique count)', title=i+' most frequent email adresses')
    for p, uniq in zip(ax.patches, uniques):
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

10 least frequently used emails

In [None]:
# For each email-domain column: occurrence counts (log y-axis) of its 10 least
# frequent values, each bar annotated with its count.
for i in ['P_emaildomain','R_emaildomain']:
    plt.figure(figsize=(25,10))

    # NOTE: overwrites the column in train, replacing NaN with 'Missing'.
    train[i]=train[i].fillna('Missing')
    #features = list(train[i].unique()[:20])
    # value_counts() sorts by descending count, so the tail holds the rarest values.
    features = train[i].value_counts()[-10:].index.tolist()
    uniques = [(train[i]==col).sum() for col in features]
    sns.set(font_scale=2)
    # x/y are passed by keyword: current seaborn no longer accepts them positionally.
    ax = sns.barplot(x=features, y=uniques, log=True)
    ax.set(xlabel='Feature', ylabel='log(unique count)', title=i+' least frequent email adresses')
    for p, uniq in zip(ax.patches, uniques):
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

In [None]:
# Rows whose addr2 value equals 87.
addr2_87 = train[train['addr2'] == 87]

# For each email-domain column: its 10 most frequent values *within* the
# addr2 == 87 subset, with counts taken from that same subset. Previously the
# domains always came from P_emaildomain and were counted over all rows, which
# contradicted the plot title.
for i in ['P_emaildomain','R_emaildomain']:
    top_counts = addr2_87[i].value_counts(sort=True)[:10]
    if top_counts.empty:
        # Nothing to draw for this column in the subset.
        continue

    plt.figure(figsize=(25,10))
    features = top_counts.index.tolist()
    uniques = top_counts.tolist()
    sns.set(font_scale=2)
    # x/y are passed by keyword: current seaborn no longer accepts them positionally.
    ax = sns.barplot(x=features, y=uniques, log=True)
    ax.set(xlabel='Feature', ylabel='log(unique count)', title=i+' with addr2==87')
    for p, uniq in zip(ax.patches, uniques):
        height = p.get_height()
        ax.text(p.get_x()+p.get_width()/2.,
                height + 10,
                uniq,
                ha="center") 

Unfortunately, this doesn't tell me anything special. A second interesting method that some participants used in this competition was to look at the number of decimal places in the transaction amount and then at the corresponding email domains. Why is this a smart approach? Because they noticed that rows with 3 or more decimal places were linked to non-USA emails, due to currency conversion.