In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import missingno as msno  ##pip install missingno

pd.set_option('display.max_rows', 500) # your numbers here
pd.set_option('display.max_columns', 500)


### **Read File**

In [2]:
null_df = pd.read_csv('Data/car_v2.csv')

### **Initial count of Rows and Column**

In [None]:
null_df.shape

### **Are there duplicate rows?**

In [None]:
null_df.duplicated()


### **Count of null at the Columns level **

### There are 127 rows and some rows with null values. Many of these null rows have a count of only nine nulls 

In [None]:
null_df.isnull().sum().count()

In [None]:
#null_df.isnull().sum()
null_df.isnull().sum().sort_values(ascending=False)

In [None]:
# Let's find out how missing data do we have

# First, let's count the number of null values
total = null_df.isnull().sum().sort_values(ascending=False)
# Then, let's calculate the percentage of missing data per feature
percent = (null_df.isnull().sum()/null_df.isnull().count()).sort_values(ascending=False)
# Finally, let's concatenate Total and Percent into another dataframe
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(30)


### **More Analysis**



### Find a pattern: Are there several columns with null values at the same index rows?  

###  Retreive the row number for the solitary row which has "Number Employees" as null

In [None]:
null_columns=null_df.columns[null_df.isnull().any()] 
# Step one - Assign null_columns

In [None]:
print(null_df[null_df["Number Employees"].isnull()][null_columns])  

# Step two - 
#Are there other columns with null value at the same index than sample column "Number Employees"
#The answer is Yes. It appears to be several colums with nine null value at the same index as seen in the results below.

### Removing null values at indexes = 268, 282, 313, 390, 3127, 3194, 4175, 4176, 4380?  

In [None]:
#Testing null value at index row= 282
null_df.loc[282]['Number Employees']

In [None]:
null_df.shape

In [3]:
filtered_null_step1= null_df[null_df['Number Employees'].notnull()]

In [None]:
#filtered_null_step1.loc[282]['Number Employees']

In [None]:
filtered_null_step1.shape
# AS seenm, we removed a total of nine rows. Our dataset shape is now 5542 rowns (previously 5551 rows)

In [None]:
filtered_null_step1.isnull().sum().sort_values(ascending=False)

### Dropping Columns containing >80% of nulls

In [4]:
null_df = filtered_null_step1.columns[filtered_null_step1.isna().any()].tolist()

In [None]:
#List of columns containing nulls
null_df

### Dropping Columns containing >80% of nulls

filtered_null_step2 = filtered_null_step1.dropna(thresh=500, axis=1)  #Keep only the rows with at least 500 non-na values

In [5]:
filtered_null_step2 = filtered_null_step1.dropna(thresh=0.8*len(filtered_null_step1), axis=1)

In [None]:
#filtered_null_step2 = filtered_null_step1.loc[:, filtered_null_step1.isnull().sum() < 0.8*filtered_null_step1.shape[0]]

In [None]:
filtered_null_step2.shape

In [None]:
filtered_null_step2.isnull().sum().sort_values(ascending=False)

In [None]:
#How to create a list with columns containing nulls
#null_df_1 = filtered_null_step2.columns[filtered_null_step2.isna().any()].tolist()

In [None]:
#null_df_1

In [None]:
#msno.matrix(filtered_null_step2)

### Fill out Columns containing nulls > 500 with its mean value

In [None]:
filtered_null_step2["Credit Loss Prov to Chargeoffs"].describe()

In [None]:
filtered_null_step2["Loan Loss Allow to noncurr Loans"].describe()

In [None]:
#This vizualization usually crashes. I will pause it for now.
#fig = plt.figure(figsize=(30,18))
#sns.heatmap(filtered_null_step2.isnull(),yticklabels=False,cbar=False,cmap='viridis')


In [None]:
filtered_null_step2.shape
# As seen, we removed a total of five columns.

In [None]:

#filtered_null_step2.isnull().sum().sort_values(ascending=False)
#filtered_null_step2 = filtered_null_step2.iloc[filtered_null_step2.isnull().sum(1).sort_values(ascending=0).index]

In [None]:
filtered_null_step2.head()

### Let's do a Data Segmentation and apply mean for remaining nulls

In [6]:
#Small <$50M, Medium = $50M to $50B, Large = $50B to $3T
filtered_null_step2['Bank Size'] = pd.cut(filtered_null_step2['Total Assets'], [0,50000, 50000000, 3000000000], 
labels=['Small', 'Medium', 'Large'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Step 1: Create a list of columns that are type float or int.  We will iterate over these columns and replace nulls 
with the mean value for the column based on banks segments small, medium and large.

Step 2: create a mask for banks that have null values in 'col_name' and are 'Small'...:
    
Step 3: get mean value for the col_name based on bank size.

Step 4: replace nulls with mean value

In [7]:
columns = [col_name for col_name, dtype in filtered_null_step2.dtypes.iteritems() if (dtype == np.float or dtype == np.int)]

In [8]:
filtered_null_step2.loc[filtered_null_step2['Bank Size'] == 'Small'].isnull().sum().sort_values(ascending=False)

Credit Loss Prov to Chargeoffs        200
Loan Loss Allow to noncurr Loans      147
Net Chargeoffs to Loans                17
Noncurrent Loans to Loans              17
Average Total Loans                    17
Loss Allow to Loans                    17
Net Loans Lease to core deposits        8
Assets per Emp                          6
Cash Div to Net Inc                     2
% Insured                               1
Average earning assets                  1
Cost of Funding Earnings Assets         1
Net Interest Margin                     1
Net Loans Lease to Deposits             1
Yield on Earning Assets                 1
Total Equity                            0
Perpertual Pref Stock                   0
All Other Liabilities                   0
Bank Equity                             0
Subordinated Debt                       0
Bank Size                               0
Common Stock                            0
Undivided Profits                       0
Equity Minor Interest             

In [32]:
for col_name in columns:
    if filtered_null_step2[col_name].isnull().any():
        #Remove nulls for small banks
        mask = (filtered_null_step2[col_name].isnull()) & (filtered_null_step2['Bank Size'] == 'Small')
        segment_mean = filtered_null_step2.loc[filtered_null_step2['Bank Size'] == 'Small', col_name].mean()
        filtered_null_step2.loc[mask, [col_name]] = segment_mean
        
        #Remove nulls for medium banks
        mask = (filtered_null_step2[col_name].isnull()) & (filtered_null_step2['Bank Size'] == 'Medium')
        segment_mean = filtered_null_step2.loc[filtered_null_step2['Bank Size'] == 'Medium', col_name].mean()
        filtered_null_step2.loc[mask, [col_name]] = segment_mean
        
        #Remove nulls for large banks
        mask = (filtered_null_step2[col_name].isnull()) & (filtered_null_step2['Bank Size'] == 'Large')
        segment_mean = filtered_null_step2.loc[filtered_null_step2['Bank Size'] == 'Large', col_name].mean()
        filtered_null_step2.loc[mask, [col_name]] = segment_mean

In [31]:
filtered_null_step2.loc[filtered_null_step2['Bank Size'] == 'Small'].isnull().sum().sort_values(ascending=False)

Loan Loss Allow to noncurr Loans      147
Noncurrent Loans to Loans              17
Loss Allow to Loans                    17
Average Total Loans                    17
Net Chargeoffs to Loans                17
Net Loans Lease to core deposits        8
Assets per Emp                          6
Cash Div to Net Inc                     2
Net Loans Lease to Deposits             1
Net Interest Margin                     1
Yield on Earning Assets                 1
Cost of Funding Earnings Assets         1
Average earning assets                  1
% Insured                               1
Total Equity                            0
All Other Liabilities                   0
Bank Equity                             0
Subordinated Debt                       0
Perpertual Pref Stock                   0
Bank Size                               0
Common Stock                            0
Surplus                                 0
Equity Minor Interest                   0
Noncurrent Loans Leases           

In [33]:
filtered_null_step2.isnull().sum().sort_values(ascending=False)

Bank Size                             0
Trading Liabilities                   0
Subordinated Debt                     0
All Other Liabilities                 0
Total Equity                          0
Bank Equity                           0
Perpertual Pref Stock                 0
Common Stock                          0
Surplus                               0
Undivided Profits                     0
Equity Minor Interest                 0
Noncurrent Loans Leases               0
Noncurrent Loans Leases Guaranteed    0
Income Earned                         0
Earning Assets                        0
Long-term Assets                      0
Average Total Assets                  0
Average Assets Quart                  0
Total Risk Weighted Assets            0
Adjusted Average Assets               0
Life Insurance Assets                 0
General Acct Life Insurance           0
Separate Acct Life Insurance          0
Hybrid Life Insurance                 0
Volatile Liabilities                  0


In [34]:
filtered_null_step2.to_csv('Data/car_v3.csv')