# Data Cleaning with Pandas and Jupyter Notebooks

In [26]:
import pandas as pd
from pathlib import Path 

import sys
sys.path.append(str(Path.cwd().parent))

from scripts.normalize_data import (
    normalize_columns, 
    normalize_expedition_section_cols,
    remove_bracket_text,
    remove_whitespace,
    print_df
)

# from scripts.my_normalize_data import (
#     normalize_columns, 
#     normalize_expedition_section_cols,
#     remove_bracket_text,
#     remove_whitespace,
# )

In [27]:
paths = list(Path('..', 'processed_data', 'clean_data').rglob('*.csv'))

normalized_nontaxa_path = Path('..', 'processed_data', 'normalized_nontaxa_list.csv')

In [28]:
len(paths)

10

## Basic file cleanup

Use pandas built in functions to do these steps

- delete dataframe column if all values are NA 

  dropna(axis='columns', how='all', inplace=True)  

- delete dataframe row if all values are NA 

  dropna(axis='index', how='all', inplace=True) 

- remove duplicate rows in dataframe 

  drop_duplicates(inplace=True)

In [29]:
for path in paths:
    df = pd.read_csv(path, dtype=str)
    
    # code to change file   
    
    df.to_csv(path, index=False)

## remove leading and trailing white spaces

created  `remove_whitespace` to remove all leading and trailing white spaces from a dataframe. 

Since we wanted to remove white spaces from the headers, we used `read_csv(header=None)` and `to_csv(header=False)` so that pandas treat the first row like any other row.

In [30]:
for path in paths:
    df = pd.read_csv(path, dtype=str, header=None)
    
    remove_whitespace(df)
    
    df.to_csv(path, index=False, header=False)

## Normalizing columns names

### get all unique column names

Use `pandas.DataFrame.columns()` and python `set` to get all the unique columns for all the files.

In [14]:
all_columns = set()

for path in paths:
    df = pd.read_csv(path, dtype=str, nrows=0)
    
    # code to get all column names  
    


In [15]:
all_columns

set()

Manually separate taxa names from other headers so that we could do some more processing on the taxa.

In [16]:
taxa_columns = set()

In [17]:
len(taxa_columns)

0

Given `all_columns` and `taxa_columns`, find the `nontaxa_columns`.

In [18]:
# code to get nontaxa_columns
nontaxa_columns = None

In [78]:
len(nontaxa_columns)

472

### create taxa and non-taxa file

Save the taxa and nontaxa headers to csv.

In [79]:
path = Path('..', 'processed_data', 'drafts', 'taxa_list.csv')

# create taxa dataframe an save it as a csv

In [19]:
path = Path('..', 'processed_data', 'drafts', 'nontaxa_list.csv')

# create nontaxa dataframe an save it as a csv

### normalize headers

create `normalize_columns` to update the column names for a data frame.

In [6]:
nontaxa_df = pd.read_csv(normalized_nontaxa_path, dtype=str)
print_df(nontaxa_df)

(44, 2)


Unnamed: 0,field,normalized_field
0,A/W,A/W
1,Additional zone name,Additional zone name
2,Additional zone name (short),Additional zone name (short)
3,Bottom Depth [m],Bottom Depth [m]
4,Bottom Depth[m] [m],Bottom Depth [m]


create a dictionary that lists the original field name and normalized field name.

In [60]:
nontaxa_mapping = nontaxa_df.set_index('field').to_dict()['normalized_field']


normalize columns for all files

In [27]:
for path in paths:
    df = pd.read_csv(path, dtype=str)    
    
    normalize_columns(df, nontaxa_mapping)
    
    df.to_csv(path, index=False)


## Clean up row values

create `remove_bracket_text` to remove the [text] values at the end of some taxa columns.

In [51]:
for path in paths:
    df = pd.read_csv(path, dtype=str)
    
    df = remove_bracket_text(df)
    
    df.to_csv(path, index=False)

## Turn one column into multiple columns 

For some files, `Sample` column was given, but `Exp, Site, Hole, Core, Type, Section, A/W` columns where not given. `normalize_expedition_section_cols` converts `Sample` into separate `Exp, Site, Hole, Core, Type, Section, A/W` columns. 

In [23]:
for path in paths:
    df = pd.read_csv(path, dtype=str)   
    
    df = normalize_expedition_section_cols(df)
    
    df.to_csv(path, index=False) 

## check if mandatory columns exists

In [24]:
required_columns = {
 'A/W',
 'Bottom [cm]',
 'Bottom Depth [m]',
 'Core',
 'Exp',
 'Hole',
 'Sample',
 'Section',
 'Site',
 'Top [cm]',
 'Top Depth [m]',
 'Type'
}

In [25]:
for path in paths:
    df = pd.read_csv(path, dtype=str)    
    cols = set(df.columns)
    diff = required_columns - cols
    
    if(len(diff) > 0):
        print(path)
        print(required_columns - cols)
    

../processed_data/clean_data/Micropal_CSV_1/318_U1355A_Planktic_Forams.csv
{'Sample'}
../processed_data/clean_data/Micropal_CSV_1/318_U1359A_Planktic_Forams.csv
{'Sample'}
../processed_data/clean_data/Micropal_CSV_1/318_U1359D_Planktic_Forams.csv
{'Sample'}
../processed_data/clean_data/Micropal_CSV_3/323_U1343B_planktic_forams.csv
{'Sample'}
../processed_data/clean_data/Micropal_CSV_3/330_planktic_forams_U1374A.csv
{'Sample', 'Top Depth [m]', 'Top [cm]', 'Bottom Depth [m]', 'Bottom [cm]'}
