In [None]:
import pandas as pd

In [None]:
import wget
import os

# Source CSV for this notebook; the local file name is the last URL segment.
url = 'http://www.cs.wcupa.edu/rburns/DataMining/notebooks/MotorInsuranceFraudClaimABTFull.csv'
csv_name = os.path.basename(url)

# Download only when the file is not already present, so re-running this cell
# (e.g. Restart & Run All) neither hits the network again nor leaves numbered
# duplicate copies of the CSV in the working directory.
if not os.path.exists(csv_name):
    wget.download(url, out=csv_name)
csv_name   # cell output: the local file name

# Loading Data

In [None]:
# Read the claims table from the CSV in the working directory.
df = pd.read_csv("MotorInsuranceFraudClaimABTFull.csv")

# df.shape is (rows, columns); unpack it once and report each part.
n_rows, n_cols = df.shape
print("Dimensions of dataframe: ", df.shape)
print("Number of rows: ", n_rows)
print("Number of colums: ", n_cols)

# Cell output: the dtype pandas inferred for every column.
df.dtypes

# Handling Missing Values

See [Working with missing data](https://pandas.pydata.org/pandas-docs/stable/user_guide/missing_data.html)

`pandas` is robust enough to handle missing data. Descriptive statistics, such as `mean`, exclude missing values by default.

`pandas` uses the floating-point value `NaN` (Not a Number) to represent missing data (sometimes abbreviated "NA").

Note below that `NaN` appears in the dataframe wherever a row is missing a value.

In [None]:
# Report the row count of the unmodified frame, then preview its first rows.
row_total = df.shape[0]
print("Number of rows: ", row_total)
df.head()

## Deleting rows or columns with missing data

### Deleting rows with missing data

By default, `df.dropna` will drop any row containing a missing value.

In [None]:
# axis=0 / how="any" are dropna's defaults, spelled out here: drop every row
# that has at least one missing value. df itself is left untouched.
cleaned = df.dropna(axis=0, how="any")

kept_rows, kept_cols = cleaned.shape
print("Number of cleaned rows: ", kept_rows)
print("Number of cols remains the same: ", kept_cols)
cleaned.head()

### Deleting columns with missing data

In [None]:
# axis="columns" (same as axis=1): drop every column that has at least one
# missing value. The row count is unchanged; df itself is left untouched.
cleaned = df.dropna(axis="columns")

kept_rows, kept_cols = cleaned.shape
print("Number of rows remains the same: ", kept_rows)
print("Number of cleaned cols: ", kept_cols)
cleaned.head()

## Imputing missing values

### Numerical Features

In [None]:
# Boolean mask for the first 10 rows: True marks a missing value.
df.head(10).isna()

In [None]:
# Inspect a single cell (row label 8, column 'Num Soft Tissue') before imputation.
# One .loc[row, column] lookup replaces the chained df[col][row] form.
df.loc[8, 'Num Soft Tissue']

In [None]:
# Mean imputation: fill each numeric column's missing entries with that column's mean.
# numeric_only=True restricts the means to numeric columns; without it, pandas >= 2.0
# raises TypeError when the frame also holds non-numeric columns (presumably the case
# here, e.g. 'Marital Status'). Columns without a computed mean are left unfilled.
copy = df.fillna(df.mean(numeric_only=True))     # produces a new copy, original df is unmodified
copy.head(10)

In [None]:
# Same cell before (df) and after (copy) imputation, read with a single
# .loc[row, column] lookup instead of chained df[col][row] indexing.
print(df.loc[8, 'Num Soft Tissue'])
print(copy.loc[8, 'Num Soft Tissue'])

### Categorical Features

Mean imputation does not apply to categorical features, so the `fillna(df.mean())` approach above does not cover them in `pandas`.

Instead, we'll use `sklearn.impute` (not yet discussed in the course).

[sklearn imputation of missing values](https://scikit-learn.org/stable/modules/impute.html#impute)

## Covariance and Correlation

In [None]:
print("Covariance between numberical variables")
print("---------------------------------------")
# Pairwise covariance of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where the default (False) raises on non-numeric columns.
df.cov(numeric_only=True)

In [None]:
print("Correlation between numberical variables")
print("----------------------------------------")
# Pairwise correlation of the numeric columns. numeric_only=True is required on
# pandas >= 2.0, where the default (False) raises on non-numeric columns.
df.corr(numeric_only=True)

In [None]:
import matplotlib.pyplot as plt

# Render the correlation matrix of the numeric columns as a colour grid.
# numeric_only=True is required on pandas >= 2.0, where the default (False)
# raises on non-numeric columns.
corr_matrix = df.corr(numeric_only=True)
plt.matshow(corr_matrix)
plt.show()

In [None]:
# Set the default figure size (this rcParams change persists for later
# figures in the session), then draw the scatter-plot matrix of df.
plt.rcParams.update({'figure.figsize': [15, 10]})
pd.plotting.scatter_matrix(df)
plt.show()

# Removing Rows or Columns

## Removing Rows

In [None]:
# Reference view of the first 10 rows of the original frame.
df.head(n=10)

In [None]:
# Drop the 5th row, i.e. the row whose index label is 4.
# drop() returns a new frame; df keeps all of its rows.
newdf = df.drop(index=4)
newdf.head(10)

## Removing Columns

In [None]:
# Reference view of the first 10 rows of the original frame, all columns present.
df.head(n=10)

In [None]:
# Drop the 'Marital Status' column; columns= is the keyword form of axis=1.
# drop() returns a new frame; df keeps the column.
newdf = df.drop(columns='Marital Status')
newdf.head()