# Data Cleaning
> Objectives:
    - Given a messy dataset - extract meaningful information from it. 
    - Learn from the data.

In [1]:
import pandas as pd
import numpy as np

In [4]:
vehicles = pd.read_csv('vehicles_messy.csv', low_memory=False)

In [None]:
vehicles.head()

In [None]:
# Show up to 99 columns when a DataFrame is displayed.
pd.options.display.max_columns = 99
# Use sparingly.

In [None]:
vehicles.head()

In [None]:
vehicles_not_bkp = vehicles

In [None]:
vehicles_not_bkp['barrels08'] = 100

In [None]:
vehicles_not_bkp.head()

In [None]:
# vehicles also changes :(
vehicles.head()

When you write `vehicles_2 = vehicles`, no data is copied: both `vehicles_2` and `vehicles` point to the same object in memory, so a change made through one name is visible through the other. If you really want an independent copy to serve as a backup, you have to use the `.copy()` method.

In [None]:
# The aliasing demo above assigned 100 to `barrels08` through
# `vehicles_not_bkp`, which is the very same object as `vehicles`.
# Reload the original data first, otherwise the "backup" would preserve
# the overwritten column.
vehicles = pd.read_csv('vehicles_messy.csv', low_memory=False)

# Create an independent copy of our dataset for backup.
vehicles_bkp = vehicles.copy()

In [None]:
# Describing our dataset.
vehicles.describe()

In [None]:
# vehicles.describe().loc[['mean','std'], :'cityCD']

In [None]:
# Obtaining some metadata from our dataframe.
vehicles.info()

# Data types of our dataset

In [None]:
vehicles.dtypes

In [None]:
vehicles.dtypes == 'float64'

In [None]:
mask = (vehicles.dtypes == 'object')

In [None]:
vehicles.dtypes

In [None]:
vehicles.dtypes.loc[mask]

In [None]:
selected_columns = vehicles.dtypes.loc[mask].index
selected_columns

In [None]:
vehicles.loc[:, selected_columns]

In [None]:
vehicles.select_dtypes('object')

## `.astype()`

Converts the type of the column

In [None]:
vehicles.year.astype(float)

# Null (or Missing) values

>    - Called NaN: <u>Not A Number</u>
>    - Count 
>    - <b>Mask</b> concept

In [None]:
vehicles.head()

In [None]:
vehicles.isna()

In [None]:
vehicles.isnull()

## Let's select the displ column and see how many missing values this column has.

In [None]:
vehicles['displ'].isnull()

In [None]:
# How do we count how many missing values there are?

# How do we get the percentage of missing values?

In [None]:
# sum of a mask
vehicles['displ'].isnull().sum()

In [None]:
# mean of a mask
vehicles['displ'].isnull().mean()

## What if I wanted to count the number of missing values for each column of the whole dataframe?

In [None]:
vehicles.isna()

In [None]:
vehicles.isnull().sum()

In [None]:
vehicles.isna().mean()

## What if we wanted to count Null values for each row?
    - axis = 1 
    


In [None]:
vehicles.isnull().sum(axis=1)

In [None]:
vehicles.isnull().mean(axis=1)

## Mask of a mask (wat)

In [None]:
vehicles.isna().mean()

In [None]:
mask = (vehicles.isna().mean() < 0.8)
mask

In [None]:
mask.loc[mask]

In [None]:
# The index of this result holds the names of the columns for which the condition is True.
selected_columns = mask.loc[mask].index
selected_columns

In [None]:
# Keep only the columns that passed the "< 80% missing" filter computed above.
vehicles.loc[:, selected_columns]

## What if I wanted to select only the rows in which there is at least one missing value (in any column)?

In [None]:
# This returns True for each COLUMN that contains at least one missing value.
vehicles.isnull().any() 

# How can we check this for each row instead? What is the syntax for checking rows instead of columns?

In [None]:
vehicles.isnull().any(axis=1)

# Dropping columns
> `axis=1` or

> `columns = ['name_of_column_to_drop1', 'name_of_column_to_drop2' ,...]`

In [None]:
vehicles.head()

In [None]:
vehicles['barrelsA08']

In [None]:
vehicles.drop(columns=['barrelsA08'])

In [None]:
vehicles.drop(columns=['barrels08','barrelsA08'])

In [None]:
vehicles.head(2)

In [None]:
# Reassign the result so the two columns are actually removed from `vehicles`.
vehicles = vehicles.drop(columns=['barrels08','barrelsA08'])
# or, equivalently, drop in place:
# vehicles.drop(columns=['barrels08','barrelsA08'], inplace=True)

In [None]:
vehicles = vehicles_bkp.copy()

# Drop columns based on condition

We need to get the name of the columns using masks.

## Let's store the number of missing values per column in a variable and, based on how many missing values each column has, decide whether to remove it or not

In [None]:
vehicles.isnull().sum()

In [None]:
n_missings = vehicles.isnull().sum()

In [None]:
n_missings

### Create a condition that is True for the index entries (column names) that have more than 10000 missing values.

In [None]:
n_missings > 10000

### select from the n_missings pandas Series the ones that returned True.

In [None]:
n_missings.loc[n_missings > 10000]

### get the index of that pandas series.

In [None]:
n_missings.loc[n_missings > 10000].index

In [None]:
### store it in a variable called `columns_to_drop`, for example

columns_to_drop = n_missings.loc[n_missings > 10000].index

In [None]:
# convert into a list just to be clearer
columns_to_drop = list(columns_to_drop)
columns_to_drop

In [None]:
vehicles.drop(columns=columns_to_drop)


-----

# Understand the data (!)
>    - Deep dive into data
>    - <b>Understand the business</b> you're working with
>    - Understand how (and if) you can input those values as more meaningful information
>    - Fill not a number

## Let's get all rows where the displacement is missing.

In [None]:
# Step by step: start by selecting the displacement column.
vehicles['displ']

In [None]:
# get a mask of which rows are missing
vehicles['displ'].isnull()

In [None]:
# create a new dataframe for those rows where the displacement is missing.

In [None]:
condition = vehicles['displ'].isnull()
vehicles.loc[condition, :]

In [None]:
missing_displacement = vehicles.loc[condition, :]
missing_displacement

In [None]:
# Let's look at just a few relevant columns.
# `.copy()` turns the selection into an independent DataFrame: it was a slice
# (column subset) of a slice (row filter) of `vehicles`, and assigning into
# such a slice later with `.loc[...] = ...` triggers pandas'
# SettingWithCopyWarning and may not update what you expect.
missing_displacement = missing_displacement[
    ['year', 'make', 'model', 'trany', 'drive', 'fuelType', 'cylinders', 'displ']
].copy()
missing_displacement.sample(10)

In [None]:
missing_displacement

In [None]:
vehicles.loc[:, ['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]

In [None]:
## There's some explanation for why the displacement is missing. What is it?

missing_displacement['fuelType'].value_counts()

In [None]:
missing_displacement.query('fuelType == "Regular"')

In [None]:
# Overwrite the fuel type of the rows labelled 21413 and 21414 using .loc.
# NOTE(review): 'Regular2' is not reverted in the cells that follow, so
# `vehicles` keeps this value — confirm this is only meant as an assignment demo.
vehicles.loc[[21413,21414], 'fuelType'] = 'Regular2'

In [None]:
vehicles.loc[[21413,21414], 'fuelType']

In [None]:
missing_displacement[['displ', 'cylinders']].fillna(0)

In [None]:
# For electric vehicles only, replace missing cylinders / displacement with 0.
is_electric = missing_displacement['fuelType'] == 'Electricity'
engine_columns = ['cylinders', 'displ']

filled_values = missing_displacement.loc[is_electric, engine_columns].fillna(0)
missing_displacement.loc[is_electric, engine_columns] = filled_values


In [None]:
missing_displacement.query('make == "Subaru"')

In [None]:
missing_displacement[['fuelType', 'displ', 'cylinders']].fillna(0)

In [None]:
# Now that we know this, how do we replace the missing values with the value we want?
# This fills every missing `displ` / `cylinders` in `vehicles` with 0.
# NOTE(review): that includes the non-electric rows inspected above — confirm that is intended.
vehicles.loc[:, ['displ', 'cylinders']] = vehicles.loc[:, ['displ', 'cylinders']].fillna(0)

In [None]:
vehicles.displ.isna().sum()

## Use cases
>    - Bounced checks ("cheque devolvido") = 0 _vs_ bounced checks = NaN: a zero is a known value, while NaN means the information is missing.

# Correct wrong data

In [None]:
# create a condition for which cylinders value == 0
no_cylinder = (vehicles['cylinders'] == 0)


# create a condition for which displacement is different from 0
yes_displacement = (vehicles['displ'] != 0)

In [None]:
(no_cylinder & yes_displacement).sum()

In [None]:
vehicles.loc[no_cylinder & yes_displacement, :]

In [None]:
test = vehicles.loc[no_cylinder & yes_displacement, :]

test[['year', 'make', 'model', 'trany', 'drive','fuelType','cylinders', 'displ']]

In [None]:
mask = vehicles['make'] == 'Mazda'
vehicles.loc[mask]

In [None]:
left_table = vehicles

In [None]:
# Median number of cylinders for each (make, year) group.
# Select the column *before* aggregating: calling .median() on the whole
# grouped frame also tries to aggregate the text columns, which raises a
# TypeError in pandas >= 2.0 and wastes work on columns we then discard.
right_table = vehicles.groupby(by=['make', 'year'])[['cylinders']].median()

In [None]:
right_table

In [None]:
# Attach the (make, year) median cylinder count to every vehicle.
# how='left' keeps exactly one output row per row of `left_table`, in the same
# order, so boolean masks computed on `vehicles` still line up with
# `merged_table` (assuming `vehicles` has its default 0..n-1 index). The
# default inner join silently drops rows whose key has no match.
# validate= fails loudly if the right table ever has duplicate keys, which
# would multiply rows.
merged_table = pd.merge(
    left=left_table,
    right=right_table,
    how='left',
    on=['make', 'year'],
    suffixes=('_x', '_y'),  # cylinders_x: original value, cylinders_y: group median
    validate='many_to_one',
)

In [None]:
# For the rows flagged as "zero cylinders but non-zero displacement", replace
# the original cylinder count (cylinders_x) with the make/year median (cylinders_y).
# NOTE(review): the masks were built from `vehicles`, not `merged_table`; this
# assumes both frames share the same row labels and order — confirm.
merged_table.loc[no_cylinder & yes_displacement, 'cylinders_x'] = merged_table.loc[no_cylinder & yes_displacement, 'cylinders_y']

# Dropping or checking duplicates rows
    
>    - Dropping fully duplicate row
>    - Subset
>    - `.duplicated()`

In [None]:
# Toy table for the duplicate-handling examples, written one record per row
# so the repeated (cpf, vlr, ano) combinations are easy to spot.
rows = [
    (1, 10, 1992),
    (2, 20, 1993),
    (2, 35, 1993),
    (2, 20, 1993),
    (2, 25, 1994),
]
test = pd.DataFrame(rows, columns=['cpf', 'vlr', 'ano'])

In [None]:
test

In [None]:
test.loc[test.duplicated(keep=False), :]

In [None]:
test.duplicated(subset=['cpf'] , keep=False)

In [None]:
test.drop_duplicates()

In [None]:
test.drop_duplicates(subset=['cpf'])

In [None]:
# Collapse the repeated cpf rows into one row per cpf: total `vlr` and latest `ano`.
# Aggregations are given by name; passing the builtin `sum` callable to agg()
# is deprecated in recent pandas versions and emits a FutureWarning.
test.groupby('cpf').agg({'vlr': 'sum', 'ano': 'max'}).reset_index()

# MissingNo

In [None]:
!pip install missingno --user

In [None]:
import missingno as msno
msno.matrix(vehicles.sample(2500))

In [None]:
msno.bar(vehicles)