# Can you spot the fraudsters?

Consider the following dataset:

In [31]:
from sklearn import datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the transactions CSV (decoded as latin1) into a DataFrame and
# print its first ten rows as plain text.
dataset = pd.read_csv('DLAS01_M1_WK4_LS2_A02.csv', encoding='latin1')
print(dataset.head(10))

                    CC            No   Money               City  \
0             5.11E+15  1.610000e+12  $6,871             Amlwch   
1  5294 4409 1242 9950  1.680000e+12  $7,965      North Saanich   
2             5.38E+15  1.650000e+12  $7,927             Amlwch   
3             5.53E+15  1.630000e+12  $9,525               Ajax   
4             5.57E+15  1.630000e+12  $7,008  Pont-Saint-Martin   
5             5.15E+15  1.650000e+12  $5,138             Amlwch   
6    514515 2300050725  1.650000e+12  $6,921               Ajax   
7  5288 2331 4441 1377  1.640000e+12  $7,821  Pont-Saint-Martin   
8   538649 550458 1435  1.680000e+12  $7,426             Canoas   
9   522306 558047 9692  1.650000e+12  $7,344             Canoas   

             Phone  Fraud  
0    070 2858 8300      1  
1     07588 093717      1  
2    0800 236 8313      1  
3  (0116) 834 5299      0  
4    (016977) 0477      1  
5      0500 593141      0  
6        0800 1111      0  
7    0933 242 2375      1  
8       0845 46 45      1  
9      0800 852603      1  

In [32]:
# Show the same first ten rows again, this time as the cell's rich output.
dataset.head(10)

Unnamed: 0,CC,No,Money,City,Phone,Fraud
0,5.11E+15,1610000000000.0,"$6,871",Amlwch,070 2858 8300,1
1,5294 4409 1242 9950,1680000000000.0,"$7,965",North Saanich,07588 093717,1
2,5.38E+15,1650000000000.0,"$7,927",Amlwch,0800 236 8313,1
3,5.53E+15,1630000000000.0,"$9,525",Ajax,(0116) 834 5299,0
4,5.57E+15,1630000000000.0,"$7,008",Pont-Saint-Martin,(016977) 0477,1
5,5.15E+15,1650000000000.0,"$5,138",Amlwch,0500 593141,0
6,514515 2300050725,1650000000000.0,"$6,921",Ajax,0800 1111,0
7,5288 2331 4441 1377,1640000000000.0,"$7,821",Pont-Saint-Martin,0933 242 2375,1
8,538649 550458 1435,1680000000000.0,"$7,426",Canoas,0845 46 45,1
9,522306 558047 9692,1650000000000.0,"$7,344",Canoas,0800 852603,1


The variable CC stands for Credit Card, and No for an identifier (number). The dependent variable, Fraud, indicates whether a transaction/observation is fraudulent (1) or not (0).

We would like to create a predictive model that can identify credit card fraudsters. The variables are not usable just yet.
There are a number of operations that need to be carried out:
1. Convert alphanumeric into numeric variables
2. Remove variables that have no predictive value
3. Remove outliers
4. Transform numeric variables

## Step 1: Convert alphanumeric into numeric variables

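The Money variable holds numeric amounts, but they are stored as text with a currency symbol and thousands separators (for example `$6,871`). Convert the column to a numeric type in the dataset: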

In [33]:
def transform_money_variable(dataset):
    """Return a copy of ``dataset`` whose ``Money`` column holds integers.

    The raw column stores amounts as text such as ``'$6,871'``. The dollar
    sign and the thousands separators are removed and the remaining digits
    are cast to ``int``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``Money`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dataset`` with ``Money`` converted to an integer dtype.

    Raises
    ------
    ValueError
        If a value is not a whole number once ``$`` and ``,`` are removed.
    """
    # We first copy the dataset to have our own version which we can alter
    dataset_cleaner = dataset.copy()
    
    ###
    ### YOUR CODE HERE
    money_text = dataset_cleaner['Money'].astype(str)
    # regex=False makes the replacement literal: read as a regular expression
    # '$' is the end-of-string anchor, and the default value of `regex`
    # differs between pandas versions.
    for symbol in ('$', ','):
        money_text = money_text.str.replace(symbol, '', regex=False)
    dataset_cleaner['Money'] = money_text.astype(int)  # cast back to appropriate type
    
    ###
    
    return dataset_cleaner

Your answer will be verified below (no need for you to do anything).

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


Check your dataset to see whether the changes have been made correctly:

In [35]:
# Apply the Money conversion to the raw dataset and preview the first rows.
dataset_cleaner = transform_money_variable(dataset)
dataset_cleaner.head()

Unnamed: 0,CC,No,Money,City,Phone,Fraud
0,5.11E+15,1610000000000.0,6871,Amlwch,070 2858 8300,1
1,5294 4409 1242 9950,1680000000000.0,7965,North Saanich,07588 093717,1
2,5.38E+15,1650000000000.0,7927,Amlwch,0800 236 8313,1
3,5.53E+15,1630000000000.0,9525,Ajax,(0116) 834 5299,0
4,5.57E+15,1630000000000.0,7008,Pont-Saint-Martin,(016977) 0477,1


## Step 2: Remove variables that have no predictive value

Let's have a look at the number of unique values per variable:

In [36]:
# Report how many distinct values each column of the raw dataset holds
# (missing values, if any, count as one value).
for column_name in dataset.columns:
    distinct_count = dataset[column_name].nunique(dropna=False)
    print(column_name, ' ', distinct_count)

CC   97
No   9
Money   98
City   5
Phone   95
Fraud   2


Remove the variables that you think are not needed and return the cleaned dataset:

In [45]:
def remove_variables(dataset_cleaner):
    """Return a new frame without the ``CC`` and ``Phone`` columns.

    Parameters
    ----------
    dataset_cleaner : pandas.DataFrame
        Frame containing at least the ``CC`` and ``Phone`` columns. It is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every other column of ``dataset_cleaner``.

    Raises
    ------
    KeyError
        If ``CC`` or ``Phone`` is not a column of ``dataset_cleaner``.
    """
    ###
    ### YOUR CODE HERE
    unneeded_columns = ['CC', 'Phone']
    # `drop` builds a new frame, so the caller's data is never altered.
    dataset_cleanest = dataset_cleaner.drop(columns=unneeded_columns)

    ###

    return dataset_cleanest

Your answer will be verified below (no need for you to do anything).

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


Check your dataset:

In [46]:
# Drop the unneeded columns from the cleaned dataset and preview the first rows.
dataset_cleanest = remove_variables(dataset_cleaner)
dataset_cleanest.head()

Unnamed: 0,No,Money,City,Fraud
0,1610000000000.0,6871,Amlwch,1
1,1680000000000.0,7965,North Saanich,1
2,1650000000000.0,7927,Amlwch,1
3,1630000000000.0,9525,Ajax,0
4,1630000000000.0,7008,Pont-Saint-Martin,1


## Step 3: Remove outliers

Now, add a column called 'outlier' to the dataset containing an outlier flag (-1 for an outlier, 1 for an inlier). The flag is calculated using only the continuous variables:

In [47]:
from sklearn.neighbors import LocalOutlierFactor

def remove_outliers(dataset_trans, no_neigh, contam):
    """Flag outliers in the continuous variables with the Local Outlier Factor.

    Despite its name, this function does not drop any rows; it only adds
    the flag column.

    Parameters
    ----------
    dataset_trans : pandas.DataFrame
        Frame containing the ``No`` and ``Money`` columns. It is not modified.
    no_neigh : int
        Passed to ``LocalOutlierFactor`` as ``n_neighbors``.
    contam : float or str
        Passed to ``LocalOutlierFactor`` as ``contamination``.

    Returns
    -------
    pandas.DataFrame
        A copy of the ``No`` and ``Money`` columns plus an ``outlier`` column
        holding the labels returned by ``fit_predict`` (-1 for outliers,
        1 for inliers).

    Raises
    ------
    KeyError
        If ``No`` or ``Money`` is not a column of ``dataset_trans``.
    """
    # Work on the frame that was passed in rather than on a notebook-level
    # variable, so the result does not depend on hidden kernel state.
    dataset_no_out = dataset_trans[['No', 'Money']].copy()
    
    ###
    ### YOUR CODE HERE
    loc = LocalOutlierFactor(n_neighbors=no_neigh, contamination=contam)
    # The model is fitted on 'No' and 'Money' only, before the flag is added.
    outliers_loc = loc.fit_predict(dataset_no_out)
    # Assign the label array positionally. Wrapping it in a DataFrame would
    # align on index labels and give NaN whenever the index is not 0..n-1.
    dataset_no_out['outlier'] = outliers_loc
    print(dataset_no_out['outlier'].head())
    print(" ")
    print(dataset_no_out['outlier'].value_counts())
    
    ###
    
    return dataset_no_out

In [49]:
# Flag outliers using 20 neighbours and a contamination of 0.2, then preview the result.
dataset_no_out = remove_outliers(dataset_cleanest, 20, 0.2)
dataset_no_out.head()

0   -1
1    1
2    1
3    1
4    1
Name: outlier, dtype: int64
 
 1    80
-1    19
Name: outlier, dtype: int64


Unnamed: 0,No,Money,outlier
0,1610000000000.0,6871,-1
1,1680000000000.0,7965,1
2,1650000000000.0,7927,1
3,1630000000000.0,9525,1
4,1630000000000.0,7008,1


Your answer will be verified below (no need for you to do anything).

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


Check your dataset:

In [50]:
# Count how many rows carry each value of the 'outlier' flag.
dataset_no_out['outlier'].value_counts()

 1    80
-1    19
Name: outlier, dtype: int64

## Step 4: Transform numeric variables

Finally, transform the numeric variables in the dataset by using standardisation. Your result should be a dataframe containing only the transformed (numeric) variables:

In [51]:
from sklearn.preprocessing import StandardScaler

def transform_numeric_variables(dataset_cleanest):
    """Standardise the continuous variables ``No`` and ``Money``.

    Parameters
    ----------
    dataset_cleanest : pandas.DataFrame
        Frame containing the numeric ``No`` and ``Money`` columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A frame with only the ``No`` and ``Money`` columns, each transformed
        by ``StandardScaler``, carrying the same row labels as the input.

    Raises
    ------
    KeyError
        If ``No`` or ``Money`` is not a column of ``dataset_cleanest``.
    """
    ###
    ### YOUR CODE HERE
    ss = StandardScaler()

    # Note that we only retain the continuous variables. They are read from
    # the frame that was passed in rather than from a notebook-level
    # variable, so the result does not depend on hidden kernel state.
    continuous = dataset_cleanest[['No', 'Money']]
    dataset_trans = pd.DataFrame(
        data=ss.fit_transform(continuous),
        columns=['No', 'Money'],
        index=continuous.index,  # keep the input's row labels
    )
    ###
    
    return dataset_trans

Your answer will be verified below (no need for you to do anything).

In [None]:
###
### AUTOGRADER TEST - DO NOT REMOVE
###


Check your dataset:

In [53]:
# Standardise the numeric variables and preview the first rows.
dataset_trans = transform_numeric_variables(dataset_cleanest)
dataset_trans.head()

Unnamed: 0,No,Money
0,-1.716197,-0.25486
1,1.095445,0.468902
2,-0.109545,0.443762
3,-0.912871,1.500958
4,-0.912871,-0.164224
