In [1]:
# Third-party libraries: numpy for numeric dtypes/limits, pandas for tabular data.
import numpy as np
import pandas as pd

# Raw competition tables: one with identity attributes, one with transaction attributes.
identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')
transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')

In [2]:
#function to reduce the memory of dataset
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to shrink its memory footprint.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``.
    * Signed / unsigned integer columns are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max
      (bounds are inclusive, so e.g. a column spanning -128..127 becomes int8).
    * Float columns are cast to the narrowest allowed float type
      (``float16`` / ``float32``) whose range contains the column's min and max.
    * Every other dtype (bool, datetime, already-categorical, pandas extension
      dtypes) is left untouched.

    A column is never widened, and already-converted columns are skipped, so
    calling the function repeatedly on the same frame is safe.

    Note: the float check only looks at the value *range*, not at precision.
    ``float16`` keeps roughly three significant decimal digits, so values are
    rounded when a column is narrowed to it. Pass ``use_float16=False`` to
    stop at ``float32``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Whether ``float16`` is an allowed target for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, ordered from narrowest to widest.
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    uint_targets = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_targets = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Bool, datetime,
        # categorical and extension dtypes would either fail on min()/max()
        # comparisons or be silently turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            targets, limits_of = float_targets, np.finfo
        elif col_type.kind == 'u':
            targets, limits_of = uint_targets, np.iinfo
        else:
            targets, limits_of = int_targets, np.iinfo

        # min()/max() skip NaN; for an empty or all-NaN column they are NaN,
        # every comparison below is False and the column is left as it is.
        c_min = df[col].min()
        c_max = df[col].max()

        for target in targets:
            # Stop as soon as the candidate is no narrower than the current
            # dtype: the column must never be widened.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = limits_of(target)
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report itself can never raise.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [3]:
# Left outer join (as in SQL): keep every row of `transaction` and attach the
# matching `identity` columns where they exist; transactions without an
# identity record get NaN in those columns.
# The join key is named explicitly. Without `on=`, merge() joins on *every*
# column name the two frames share, so an unexpected shared column would
# silently change the join.
training = transaction.merge(identity, how='left', on='TransactionID')

In [4]:
# Preview the first rows of the merged frame to sanity-check the join.
training.head()

In [5]:
# Fraction of missing values in every column (null count divided by row count).
null_share = training.isnull().sum() / training.shape[0]

# Names of the columns in which more than 90% of the values are missing.
null_columns = null_share[null_share > 0.9].index.tolist()

# Remove those columns from `training` itself:
#   columns=... selects column labels (equivalent to axis=1),
#   inplace=True modifies the existing frame instead of returning a new one.
training.drop(columns=null_columns, inplace=True)

Fill the missing values with the mode for categorical features and with the mean for continuous features.
Columns with `object` dtype are the candidate categorical features in the dataset.

In [6]:
# Fill null values in the continuous (numeric) columns with the column mean.
#
# The result is assigned back to the frame explicitly. The previous form,
# `training[col].fillna(..., inplace=True)`, mutates a temporary Series:
# pandas 2.2 warns about it and with Copy-on-Write enabled it leaves
# `training` unchanged.
#
# NOTE(review): the means are computed on the full data set, before the
# train/test split further down, so test rows influence the imputed values.
for col in training.select_dtypes(include='number').columns:
    if training[col].isnull().any():
        training[col] = training[col].fillna(training[col].mean())

In [7]:
# Fill null values in the categorical (text) columns with the column mode,
# i.e. the most frequent value.
#
# The result is assigned back to the frame explicitly. The previous form,
# `training[col].fillna(..., inplace=True)`, mutates a temporary Series and
# does not reliably update `training` (warning in pandas 2.2, no effect with
# Copy-on-Write enabled).
for col in training.columns:
    column = training[col]
    # Text columns are `object` dtype, or the dedicated string dtype in
    # pandas versions that infer it.
    is_text = column.dtype == object or isinstance(column.dtype, pd.StringDtype)
    if not is_text or not column.isnull().any():
        continue
    modes = column.mode()
    if modes.empty:
        # Column is entirely null: there is no mode to fill with.
        continue
    training[col] = column.fillna(modes.iloc[0])

In [8]:
# Columns treated as categorical features and label-encoded later on.
# Built in groups; the resulting order is: id_* columns first, then the rest.
_id_numbers = (12, 15, 16, 23, 27, 28, 29, 30, 31, 33, 34, 35, 36, 37, 38)
_id_cols = ['id_{}'.format(number) for number in _id_numbers]

_other_cols = [
    'DeviceType', 'DeviceInfo',
    'ProductCD',
    'card4', 'card6',
    'M4',
    'P_emaildomain', 'R_emaildomain',
    'addr1', 'addr2',
    'M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9',
]

catagorical_cols = _id_cols + _other_cols

# **Data Transformation**

We are applying label encoding on all the categorical features. For example, ‘Credit_card’ can be assigned as 0, ‘Debit_Card’ can be assigned as 1 and ‘Others’ can be assigned as 2.

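For a label encoder, the `fit` method learns the sorted set of distinct values (classes) present in a column, and the `transform` method replaces each value with the integer index of its class; `fit_transform` does both in one call. (Calculating the mean and variance of each feature and transforming the features with them is what a scaler such as `StandardScaler` does, not a label encoder.)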

In [9]:
# LabelEncoder maps non-numerical labels (hashable and comparable values) to
# integer codes, e.g. the distinct device types or device infos become 0, 1, ...
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Only encode the listed columns that are still present in `training`.
columns_to_encode = [name for name in catagorical_cols if name in training.columns]

for name in columns_to_encode:
    # Cast to str first so every value in the column is comparable, then
    # fit the encoder on the column and replace it with the integer codes.
    as_text = training[name].astype(str).values
    training[name] = le.fit_transform(as_text)

In [10]:
# Label (target) vector: the isFraud column of the prepared frame.
y= training['isFraud']
# Number of labelled rows.
print(y.shape)


In [11]:
# Feature matrix: everything except the label and the two columns that are
# excluded from modelling (transaction id and transaction timestamp).
non_feature_cols = ['isFraud', 'TransactionID', 'TransactionDT']
x = training.drop(columns=non_feature_cols)
print(x.shape)

> The dataset has a highly imbalanced class as we have seen above. There are only 3.5 % fraud cases in our dataset and 96.5% non-fraud cases. We have a ratio of 96.5: 3.5 in our original dataset, we have to divide the dataset for train and test in such a way that both the classes are present in the same proportion in both train and test set. For maintaining the same proportion in the train and test set we have used stratified sampling. We are using 70% of the dataset for training and 30% for testing.

> The features are the descriptive attributes, and the label is what you're attempting to predict or forecast. Here the feature is x and label is y

> Here we want to predict whether a transaction is fraudulent, which is recorded in the `isFraud` column. `isFraud` is therefore the label of the prediction, and the remaining columns are the features on which the model is built, except for the transaction ID and the transaction date (`TransactionID`, `TransactionDT`), which are of no relevance here and are removed from `x`.

The first subset is used to fit the model and is referred to as the training dataset. The second subset is not used to train the model; instead, the input element of the dataset is provided to the model, then predictions are made and compared to the expected values. This second dataset is referred to as the test dataset.

In [12]:
from sklearn.model_selection import train_test_split

# 70% train / 30% test. stratify=y keeps the fraud / non-fraud ratio the same
# in both subsets; random_state fixes the shuffle so the split is reproducible.
split_parts = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)
x_train, x_test, y_train, y_test = split_parts

`train_test_split` splits arrays or matrices into random train and test subsets. Signature: `train_test_split(*arrays, test_size=None, train_size=None, random_state=None, shuffle=True, stratify=None)`; it returns a list containing the train and test split of each input array.


In [13]:
# Sanity check: shapes of the train/test features and the train/test labels.
print(x_train.shape,x_test.shape,y_train.shape,y_test.shape)

# **Decision Tree**

The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features. A tree can be seen as a piecewise constant approximation.

In [14]:
from sklearn.tree import DecisionTreeClassifier

# random_state seeds the estimator's random number generator so that the
# fitted tree, and therefore the results, can be reproduced between runs.
model = DecisionTreeClassifier(random_state=1)

# Learn the decision rules from the training features and labels.
model.fit(x_train, y_train)

In [15]:
# Hard class predictions of the fitted classifier for the held-out test features.
predict = model.predict(x_test) # One predicted class label per row of x_test.

In [16]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the model *ranks* positives above negatives, so
# it must be given a continuous score per row, not hard 0/1 predictions.
# Feeding class labels collapses the ROC curve to a single operating point.
# Column 1 of predict_proba is the estimated probability of model.classes_[1]
# (presumably isFraud == 1 — confirm the label encoding).
# The value can differ from an AUC computed on hard class labels.
fraud_scores = model.predict_proba(x_test)[:, 1]
roc_auc_score(y_test, fraud_scores)

## We applied a decision tree classifier to our data and got an AUC score of 0.78, which is much better than what we got with logistic regression. The decision tree performed much better than logistic regression.