# EDA on raw [census income data](https://archive.ics.uci.edu/ml/datasets/census+income)

### Import libraries

In [1]:
from pathlib import Path

import pandas as pd
import pandas_profiling

### Load data

In [2]:
# Relative path of the raw census CSV file.
RAW_DATA_PATH = "../data/raw/census.csv"
# Read the file as-is; cleaning is done in the preprocessing cells.
raw_df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)

### Run pandas profiling on raw data

In [3]:
# Build the profiling report for the untouched raw data.
raw_profile = pandas_profiling.ProfileReport(
    df=raw_df,
    explorative=True,
)

In [4]:
# Destination of the HTML profiling report generated from the raw data.
RAW_PROFILE_HTML_PATH = "../data/raw/raw_profile.html"

In [5]:
# Write the raw-data profiling report to disk.
raw_profile.to_file(output_file=RAW_PROFILE_HTML_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#### Insights from profiling

- column names and text/categorical values contain leading/trailing whitespace -> strip it
- 23 duplicate rows -> dedup  
- missing values are coded as `?` -> explicitly convert them to `None` and drop the affected rows
- since we are training an RF model, we should not worry too much about correlated predictors -> retain them in the data

### Preprocessing 

In [21]:
# Strip leading/trailing whitespace from every column header.
new_col_names = list(map(str.strip, raw_df.columns))
raw_df.columns = new_col_names

In [46]:
# remove whitespaces from values
def whitespace_remover(dataframe: pd.DataFrame) -> None:
    """
    Strip leading and trailing whitespace from the string values of a dataframe.

    Columns with ``object`` dtype or a pandas string dtype are processed; all
    other columns are left untouched. The dataframe is modified in place.
    Non-string entries inside an ``object`` column (for example ``NaN`` or
    ``None`` marking a missing value, or numbers in a mixed column) are kept
    unchanged instead of raising an error.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Dataframe whose text columns are stripped in place.

    Returns
    -------
    None
    """
    for column in dataframe.columns:
        series = dataframe[column]

        if isinstance(series.dtype, pd.StringDtype):
            # The `.str` accessor is NA-aware and preserves the string dtype.
            dataframe[column] = series.str.strip()
        elif series.dtype == "object":
            # `str.strip` raises TypeError when handed a non-string (e.g. NaN),
            # so strip only genuine strings and pass everything else through.
            dataframe[column] = series.map(
                lambda value: value.strip() if isinstance(value, str) else value
            )

# Strips the string values of `raw_df` in place; the helper returns None.
whitespace_remover(raw_df)

In [47]:
# Drop exact duplicate rows (keeping the first occurrence) and renumber the index from 0.
raw_df_deduped = raw_df.drop_duplicates().reset_index(drop=True)

In [52]:
# Turn the `?` placeholder into a real missing value, then drop every row
# that has at least one missing value.
prepped_df = (
    raw_df_deduped
    .replace(to_replace={"?": None})
    .dropna(axis=0, how="any")
)

### Run pandas profiling on prepped data

In [54]:
# Build the profiling report for the cleaned data.
prepped_profile = pandas_profiling.ProfileReport(
    df=prepped_df,
    explorative=True,
)

In [55]:
# Destination of the HTML profiling report generated from the prepped data.
PREPPED_PROFILE_HTML_PATH = "../data/prepped/prepped_profile.html"

In [56]:
# Nothing earlier in the notebook creates the output directory, so create it
# here; otherwise writing the report fails when the directory is missing.
Path(PREPPED_PROFILE_HTML_PATH).parent.mkdir(parents=True, exist_ok=True)
prepped_profile.to_file(PREPPED_PROFILE_HTML_PATH)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Save prepped data

In [61]:
PREPPED_DATA_PATH = "../data/prepped/census.csv"
# `to_csv` does not create missing directories, so make sure the target
# directory exists before writing.
Path(PREPPED_DATA_PATH).parent.mkdir(parents=True, exist_ok=True)
# The index carries no information after cleaning, so it is not written out.
prepped_df.to_csv(PREPPED_DATA_PATH, index=False)