In [None]:
# Third-party libraries: pandas for tabular data, matplotlib/seaborn for plotting.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two training tables from the ../input/ieee-fraud-detection directory.
transaction = pd.read_csv('../input/ieee-fraud-detection/train_transaction.csv')
identity = pd.read_csv('../input/ieee-fraud-detection/train_identity.csv')

# **Treachery Diagnosis Of Cards and Insurance claims**

### ***Working with treachery diagnosis (fraud detection) for all types of payment cards***

**INPUT DATA:**

The data is split into two tables, **identity** and **transaction**, which are joined on *TransactionID*.

1.**Identity Table**: (Information about cards and client details)

> The Identity Table has 144233 rows and 41 columns.

> Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc.) and digital signature (UA/browser/OS/version, etc.) associated with transactions.

> Features:

*         DeviceType 
*         DeviceInfo 
*         Id12-id38 
*         TransactionID etc.

2.**Transaction Table:** (Information about the transaction of the clients)

> “It contains money transfer and also other gifting goods and services, like you booked a ticket for others, etc.”

> Transaction table has 590540 rows and 394 columns.

> Features:

* ProductCD 
* isFraud 
* Card1 – card6 
* TransactionID 
* addr1,addr2 
* P_emaildomain 
* R_emaildomain

> We have to merge the identity and transaction tables to build our training data.

**Output :**
Analysis of treachery using plots


> 
**Let's first check identity table**

In [None]:
# Preview the first rows of the identity table.
identity.head()

In [None]:
# Print a concise summary of the identity table (columns, dtypes, non-null counts).
identity.info()

In [None]:
# Dimensions of the identity table as a (rows, columns) tuple.
identity.shape

It is clear from the above analysis that the identity table has 144233 rows and 41 columns.

**Let's check transactions table**

In [None]:
# Preview the first rows of the transaction table.
transaction.head()

In [None]:
# Dimensions of the transaction table as a (rows, columns) tuple.
transaction.shape

It is clear from the above analysis that the transaction table has 590540 rows and 394 columns.

**Merging transaction and identity data into training data**

In [None]:
# Attach identity information to every transaction with a left join on
# TransactionID: all transaction rows are kept, and transactions without a
# matching identity record get NaN in the identity columns (similar to a SQL
# LEFT OUTER JOIN).
# The join key is named explicitly so the merge cannot silently start matching
# on additional columns should the two tables ever share another column name.
training = transaction.merge(identity, how='left', on='TransactionID')

# **Data Analysis**

## Checking the count of transactions that are fraudulent and non-fraudulent

In [None]:
# Bar chart of the number of transactions in each target class
# (isFraud == 0 and isFraud == 1).
# TransactionID is selected *before* counting so that only this one column is
# aggregated, instead of counting every column of the wide merged frame and
# then discarding all but one. The resulting Series is the same.
class_counts = training.groupby('isFraud')['TransactionID'].count()
class_counts.plot(kind='bar',
                  title='Distribution of Target in Train',
                  color=('pink', 'purple'),
                  figsize=(8, 5))
plt.show()

## Check the percentage of fraud by product.

In [None]:
training['ProductCD'].unique()  # distinct product codes present in the ProductCD column

*'W', 'H', 'C', 'S', 'R' are the products for which transaction takes place.*

In [None]:
# Share of each product code within the isFraud == 0 rows and within the
# isFraud == 1 rows, drawn as paired bars per product.
# value_counts(normalize=True) is applied per isFraud group, so the
# percentages of each class sum to 100 across the products.
product_share = training.groupby(['isFraud'])['ProductCD'].value_counts(normalize=True)
# Name the values 'percentage' (used as the y column below) and scale to 0-100.
product_share = product_share.rename('percentage').mul(100)
# Flatten the (isFraud, ProductCD) index into columns and order rows by product.
ProductCD_Analysis = product_share.reset_index().sort_values('ProductCD')

plt.figure(figsize=(12, 6))
sns.barplot(data=ProductCD_Analysis, x="ProductCD", y="percentage", hue="isFraud")
plt.title('Fraud % by Product')
plt.show()

From the above graph we can see that product C has the largest number of fraud cases, so we can infer that a transaction for product C has a high probability of being fraudulent. Products H, R and S also have a high number of fraud cases.

## Check the percentage of fraud by card company.

In [None]:
# Share of each card company (card4) within the isFraud == 0 rows and within
# the isFraud == 1 rows; percentages are normalised inside each isFraud group.
card4_share = training.groupby(['isFraud'])['card4'].value_counts(normalize=True)
card4_share = card4_share.rename('percentage').mul(100)
# Flatten the (isFraud, card4) index into columns and order rows by company.
card4_Analysis = card4_share.reset_index().sort_values('card4')

# The figure is created first and the ggplot style is switched on afterwards,
# exactly as before, so the rendered appearance does not change.
plt.figure(figsize=(12, 6))
plt.style.use('ggplot')
plt.title('Fraud % by Card Company')
sns.barplot(data=card4_Analysis, x="card4", y="percentage", hue="isFraud")
plt.show()

From the above graph we can see that Discover has more fraud cases than other cards.

## Check for number of frauds by card type.

In [None]:
# Share of each card type (card6) within the isFraud == 0 rows and within the
# isFraud == 1 rows; percentages are normalised inside each isFraud group.
card6_share = training.groupby(['isFraud'])['card6'].value_counts(normalize=True)
card6_share = card6_share.rename('percentage').mul(100)
# Flatten the (isFraud, card6) index into columns and order rows by card type.
Ctype_Analysis = card6_share.reset_index().sort_values('card6')

plt.figure(figsize=(12, 6))
plt.title('Fraud % by Card type')
sns.barplot(data=Ctype_Analysis, x="card6", y="percentage", hue="isFraud")
plt.show()

> As we can see from the above graph, most of the fraud cases come from credit cards. 
Frauds from charge cards and other card types are almost nil.

## Check for type of device used for the transaction

It has information on the type of device used for the transaction whether the transaction was carried out from a Mobile device or Laptop/Pc.

In [None]:
# Number of transactions recorded for each device type (missing values are not counted).
training['DeviceType'].value_counts()

> From the graph below we can see that most of the fraud cases are from mobile devices.

## Check the percentage of fraud by the type of devices 

In [None]:
# Split the merged data by target class.
F = training[training['isFraud'] == 1]
NF = training[training['isFraud'] == 0]

# Rows with a known DeviceType per device type, ordered by device type label.
fraud_by_device = F.groupby('DeviceType')['DeviceType'].count().sort_index()
not_fraud_by_device = NF.groupby('DeviceType')['DeviceType'].count().sort_index()

# Two horizontal bar charts side by side: fraud on the left axes (m),
# non-fraud on the right axes (n).
fig, (m, n) = plt.subplots(1, 2, figsize=(15, 3))
fraud_by_device.plot(kind='barh', ax=m, color='red',
                     title='Fraud count by device type')
not_fraud_by_device.plot(kind='barh', ax=n, color='blue',
                         title='Count of not fraud by device type')
plt.show()

## **Treachery Diagnosis of Insurance Claims**

In [None]:
# Read the insurance claims dataset and preview its first 10 rows.
insurance_csv = '../input/insurance/insurance_claims.csv'
df_ins = pd.read_csv(insurance_csv)
df_ins.head(n=10)

In [None]:
df_ins.shape  # (rows, columns) of the insurance claims DataFrame

In [None]:
df_ins.nunique()  # number of distinct values in each column

In [None]:
# Count plot of the fraud_reported label: one bar per distinct label value,
# showing how many claims carry that value. The same column is passed as hue,
# so each bar is also coloured by its label.
plt.style.use('default')
ax2 = sns.countplot(data=df_ins, x='fraud_reported', hue='fraud_reported')

> From above plot, like most fraud datasets, the label distribution is skewed.

In [None]:
df_ins['fraud_reported'].value_counts()  # number of claims for each fraud_reported label

## Check the percentage of fraud on the basis of Incident state

In [None]:
# Claims per incident state, split by the value of `fraud_reported`.
# The previous version plotted groupby('incident_state').fraud_reported.count(),
# which counts every claim in a state regardless of its fraud label, while the
# axis was labelled 'Fraud reported'. Stacking the counts by label keeps the
# total bar height (all claims with a fraud label) and makes the fraudulent
# portion of each state visible.
plt.style.use('dark_background')
fig, ax2 = plt.subplots(figsize=(6, 4))  # (width, height) of the figure in inches
state_counts = (df_ins.groupby(['incident_state', 'fraud_reported'])
                      .size()
                      .unstack(fill_value=0))  # rows: states, columns: fraud labels
# DataFrame.plot opens a new figure unless an axes is passed, so draw on ax2.
state_counts.plot.bar(stacked=True, ax=ax2)
ax2.set_ylabel('Number of claims')
ax2.legend(title='fraud_reported')
plt.show()

We can observe here that most of the fraud claims are in NY (New York).

## Check the percentage of fraud on the basis of Incident date

In [None]:
# Claims per incident date, split by the value of `fraud_reported`.
# The previous version plotted groupby('incident_date').total_claim_amount.count(),
# i.e. the *number* of claims per date, but labelled the axis
# 'Claim amount (in $)'. The axis label now matches the plotted quantity, and
# the counts are stacked by fraud label so the fraudulent share per date is
# visible.
plt.style.use('default')
fig, ax2 = plt.subplots(figsize=(15, 6))
date_counts = (df_ins.groupby(['incident_date', 'fraud_reported'])
                     .size()
                     .unstack(fill_value=0))  # rows: dates, columns: fraud labels
# DataFrame.plot opens a new figure unless an axes is passed, so draw on ax2.
date_counts.plot.bar(stacked=True, ax=ax2)
ax2.set_ylabel('Number of claims')
ax2.legend(title='fraud_reported')
plt.show()

In [None]:
# Number of claims recorded on each incident date.
df_ins['incident_date'].value_counts()

> We can see here that fraud claims were most frequent on 2015-02-02.

## ANALYSIS OF TREACHERY REPORTED IN DISTINCT TYPES OF INSURANCE

In [None]:
# Claims per incident type, split by the value of `fraud_reported`.
# The previous version plotted groupby('incident_type').fraud_reported.count(),
# which counts every claim of a type regardless of its fraud label, while the
# axis was labelled 'Fraud reported'. Stacking the counts by label keeps the
# total bar height and makes the fraudulent portion of each type visible.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(6, 4))
type_counts = (df_ins.groupby(['incident_type', 'fraud_reported'])
                     .size()
                     .unstack(fill_value=0))  # rows: incident types, columns: fraud labels
# DataFrame.plot opens a new figure unless an axes is passed, so draw on ax.
type_counts.plot.bar(stacked=True, ax=ax)
# Tilt the category labels so the incident type names do not overlap.
ax.set_xticklabels(ax.get_xticklabels(), rotation=20, ha="right")
ax.set_ylabel('Number of claims')
ax.legend(title='fraud_reported')
plt.show()

> We can see above that the treacheries are more frequent in multi-vehicle insurance claims.

## Treachery diagnosis on gender

In [None]:
# Pie chart of the share of each `insured_sex` value among the claims.
# NOTE: the shares are computed over all rows of df_ins; no filter on
# fraud_reported is applied here.
fig = plt.figure(figsize=(10, 6))
gender_share = df_ins['insured_sex'].value_counts() * 100.0 / len(df_ins)
# value_counts() orders the categories by frequency, so a hard-coded
# ['Male', 'Female'] list can end up on the wrong slices. Build the labels
# from the index instead so every slice is named after its own category.
slice_labels = [str(value).title() for value in gender_share.index]
ax2 = gender_share.plot.pie(autopct='%.1f%%', labels=slice_labels, fontsize=12)
ax2.set_title('Treachery diagnosis on Gender (in % )')
plt.show()

> We can see that males are more likely to abuse insurance claims.