# Data Cleaning with Pandas and Jupyter Notebooks

import needed packages

[pathlib](https://docs.python.org/3/library/pathlib.html) - Python module to handle file system paths

In [1]:
import pandas as pd
from datetime import date
from pathlib import Path 

add parent directory to system paths so notebook can see scripts directory
https://stackoverflow.com/a/64562179

In [2]:
import sys
sys.path.append(str(Path.cwd().parent))

from scripts.normalize_data import (
    normalize_columns, 
    remove_whitespace_from_column_names,
    normalize_expedition_section_cols,
    remove_bracket_text,
    remove_whitespace_from_dataframe,
    ddm2dec,
    remove_empty_unnamed_columns,
    print_df
)

## get paths for files

Use Path and rglob to get all the cvs in `data_clean` directory.

In [3]:
paths = list(Path('..', 'processed_data', 'clean_data').rglob('*.csv'))
paths

[PosixPath('../processed_data/clean_data/Micropal_CSV_1/318_U1355A_Planktic_Forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_1/363-U1483A-planktic_forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_1/318_U1359A_Planktic_Forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_1/318_U1359D_Planktic_Forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_2/362_U1480E_planktic_forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_2/350_U1436B_planktic_forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_2/346_U1422A_planktic_forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_3/339_planktic_forams_U1387C.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_3/323_U1343B_planktic_forams.csv'),
 PosixPath('../processed_data/clean_data/Micropal_CSV_3/330_planktic_forams_U1374A.csv')]

In [4]:
len(paths)

10

## Read file

Use `pd.read_csv(dtype=str)` to read csv and treat all columns as strings. If you just use `pd.read_csv(path)`print_df(df, 1), pandas will automatically convert the columns to strings, integers, floats, dates. This automatic conversion can change values in unexpected ways such as converting integer columns into floats. 

In [5]:
df = pd.read_csv(paths[1], dtype=str)

`print_df` calls `pd.DataFrame.shape` and `pd.DataFrame.head()`

In [6]:
print_df(df)

(123, 229)


Unnamed: 0,Sample,Top [cm],Bottom [cm],Top Depth [m],Bottom Depth [m],Zone name (short),Zone name,Additional zone name (short),Additional zone name,Preservation,...,Cassigerinella martinezpicoi,Catapsydrax dissimilis,Catapsydrax unicavus_,Globigerinatella insueta,Globigerinatella sp.,Turborotalita clarkei,Sample comment,Ship File Links,Shore File Links,File Data
0,363-U1483A-1H-2-W 75/77-FORAM,0,2,2.25,2.27,PT1b [W11],Globorotalia truncatulinoides Partial-range Zo...,,,E [P46],...,,,,,,,,,,
1,363-U1483A-1H-4-W 75/77-FORAM,0,2,5.25,5.27,PT1b [W11],Globorotalia truncatulinoides Partial-range Zo...,,,E [P46],...,,,,,,,,,,
2,363-U1483A-1H-6-W 75/77-FORAM,0,2,8.25,8.27,PT1b [W11],Globorotalia truncatulinoides Partial-range Zo...,,,E [P46],...,,,,,,,,,,
3,363-U1483A-1H-CC-PAL-FORAM,0,5,8.58,8.63,PT1b [W11],Globorotalia truncatulinoides Partial-range Zo...,,,E [P46],...,,,,,,,Contains radiolaria,,,
4,363-U1483A-2H-2-W 74/76-FORAM,0,2,10.84,10.86,PT1b [W11],Globorotalia truncatulinoides Partial-range Zo...,,,E [P46],...,,,,,,,,,,


## Basic file cleanup

Use pandas functions do some basic file cleanup.

- delete dataframe column if all values are NA - [pandas.DataFrame.dropna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)

  df.dropna(axis='columns', how='all', inplace=True)  


- delete dataframe row if all values are NA - [pandas.DataFrame.dropna](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dropna.html)

  df.dropna(axis='index', how='all', inplace=True)  

- remove duplicate rows in dataframe - [pandas.DataFrame.drop_duplicates](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html)

  df.drop_duplicates(inplace=True)

run basic cleanup on all files

In [7]:
for path in paths:
    df = pd.read_csv(path, dtype=str, skipinitialspace = True)
    
    df.dropna(axis='columns', how='all', inplace=True)  
    df.dropna(axis='index', how='all', inplace=True)
    df.drop_duplicates(inplace=True)
    # remove_empty_unnamed_columns(df)
    
    df.to_csv(path, index=False)

You should `git commit` each cleanup step so you can undo changes if needed.

```
$ git add .
$ git commit -m 'do basic file cleanup on all files'
```

## remove leading and trailing white spaces

Since we want to remove white spaces for the headers and all the rows, 
use read_csv(header=None) and to_csv(header=False) to treat the first row 
like any other row.

In [8]:
for path in paths:
    df = pd.read_csv(path, dtype=str, header=None)
    
    remove_whitespace_from_dataframe(df)
    
    df.to_csv(path, index=False, header=False)

## get columns names

get all unique column headers by reading the headers from all the files

In [9]:
columns = set()
for path in paths:
    df = pd.read_csv(path, dtype=str, nrows=0)
    columns.update(df.columns)
    
len(columns)

# 493 with empty taxa columns
# 191 without empty taxa columns

191

In [10]:
columns

{'A/W',
 'Additional zone name',
 'Additional zone name (short)',
 'Beella digitata',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom depth [m]',
 'Bottom[cm] [cm]',
 'Candeina nitida',
 'Comment',
 'Comment (general)',
 'Comments',
 'Core',
 'Core-Sect',
 'Datum name',
 'Dentoglobigerina altispira',
 'Dentoglobigerina altispira _T_ _PL5',
 'Dentoglobigerina altispira _T_ _PL5_',
 'Dextral:Sinistral _P. obliquiloculata_',
 'Dextral:Sinistral _P. praecursor_',
 'Dextral:Sinistral _P. primalis_',
 'Exp',
 'Extra Sample ID Data',
 'Fragmentation',
 'Fragmentation rank [auto-pop]',
 'Genus/species (upper zone)',
 'Genus/species lower zone)',
 'Globigerina apertura',
 'Globigerina bulloides',
 'Globigerina calida',
 'Globigerina cf. woodi',
 'Globigerina decoraperta',
 'Globigerina falconensis',
 'Globigerina nepenthes',
 'Globigerinella aequilateralis',
 'Globigerinella calida',
 'Globigerinella calida _B',
 'Globigerinella calida _B_',
 'Globigerinella obesa',
 'Glob

separate taxa names from other headers

In [11]:
taxa_columns = {
 'Beella digitata',
 'Candeina nitida',
 'Dentoglobigerina altispira',
 'Dentoglobigerina altispira _T_ _PL5',
 'Dentoglobigerina altispira _T_ _PL5_',
 'Dextral:Sinistral _P. obliquiloculata_',
 'Dextral:Sinistral _P. praecursor_',
 'Dextral:Sinistral _P. primalis_',
 'Globigerina apertura',
 'Globigerina bulloides',
 'Globigerina calida',
 'Globigerina cf. woodi',
 'Globigerina decoraperta',
 'Globigerina falconensis',
 'Globigerina nepenthes',
 'Globigerinella aequilateralis',
 'Globigerinella calida',
 'Globigerinella calida _B',
 'Globigerinella calida _B_',
 'Globigerinella obesa',
 'Globigerinella siphonifera',
 'Globigerinita glutinata',
 'Globigerinita parkerae',
 'Globigerinita uvula',
 'Globigerinoides bulloideus',
 'Globigerinoides conglobatus',
 'Globigerinoides extremus',
 'Globigerinoides extremus _T and B',
 'Globigerinoides extremus _T and B_',
 'Globigerinoides fistulosus',
 'Globigerinoides obliquus _T',
 'Globigerinoides obliquus _T_',
 'Globigerinoides quadrilobatus',
 'Globigerinoides ruber',
 'Globigerinoides ruber (pink)',
 'Globigerinoides ruber (white)',
 'Globigerinoides ruber _pink_ T',
 'Globigerinoides ruber _pink_ _T_',
 'Globigerinoides sacculifer',
 'Globigerinoides sacculifer (without sack)',
 'Globigerinoides tenellus',
 'Globigerinoides trilobus',
 'Globigerinoidesella fistulosa _T and B_ _Pt1a',
 'Globigerinoidesella fistulosa _T and B_ _Pt1a_',
 'Globoconella miozea',
 'Globorotalia (Globoconella) inflata',
 'Globorotalia (Globorotalia) tumida tumida',
 'Globorotalia (Hirsutella) hirsuta',
 'Globorotalia (Hirsutella) scitula',
 'Globorotalia (Truncorotalia) crossaformis',
 'Globorotalia (Truncorotalia) truncatulinoides',
 'Globorotalia anfracta',
 'Globorotalia cf. crassula',
 'Globorotalia crassaformis',
 'Globorotalia crassaformis s.l. (dex)',
 'Globorotalia crassaformis s.l. (sin)',
 'Globorotalia crassaformis sensu lato',
 'Globorotalia flexuosa',
 'Globorotalia flexuosa _T and B_',
 'Globorotalia hessi',
 'Globorotalia hessi _B_',
 'Globorotalia hirsuta',
 'Globorotalia hirsuta (dex)',
 'Globorotalia hirsuta (sin)',
 'Globorotalia inflata',
 'Globorotalia limbata _B',
 'Globorotalia limbata _B_',
 'Globorotalia limbata _T_',
 'Globorotalia margaritae',
 'Globorotalia margaritae _T and B_ _PL3',
 'Globorotalia margaritae _T and B_ _PL3_',
 'Globorotalia menardii',
 'Globorotalia menardii (dex)',
 'Globorotalia menardii (sin)',
 'Globorotalia miocenica',
 'Globorotalia miotumida',
 'Globorotalia multicamerata',
 'Globorotalia multicamerata _T',
 'Globorotalia multicamerata _T_',
 'Globorotalia plesiotumida _B_ _M13b_',
 'Globorotalia plesiotumida _T',
 'Globorotalia plesiotumida _T_',
 'Globorotalia pseudomiocenica _T_ _PL6',
 'Globorotalia pseudomiocenica _T_ _PL6_',
 'Globorotalia puncticulata',
 'Globorotalia scitula',
 'Globorotalia scitula (dex)',
 'Globorotalia scitula (sin)',
 'Globorotalia tosaensis',
 'Globorotalia tosaensis _T and B_ _Pt1b',
 'Globorotalia tosaensis _T and B_ _Pt1b_',
 'Globorotalia truncatulinoides',
 'Globorotalia truncatulinoides (dex)',
 'Globorotalia truncatulinoides (sin)',
 'Globorotalia truncatulinoides _B',
 'Globorotalia truncatulinoides _B_',
 'Globorotalia tumida',
 'Globorotalia tumida _B_ _PL1a_',
 'Globoturborotalita apertura _T and B',
 'Globoturborotalita apertura _T and B_',
 'Globoturborotalita decoraperta _T and B',
 'Globoturborotalita decoraperta _T and B_',
 'Globoturborotalita rubescens',
 'Globoturborotalita tenella',
 'Neogloboquadrina acostaensis',
 'Neogloboquadrina acostaensis (dextral)',
 'Neogloboquadrina atlantica (dex)',
 'Neogloboquadrina atlantica (sin)',
 'Neogloboquadrina cf. pachyderma',
 'Neogloboquadrina dutertrei',
 'Neogloboquadrina humerosa',
 'Neogloboquadrina incompta (dextral)',
 'Neogloboquadrina nympha',
 'Neogloboquadrina pachyderma (dex)',
 'Neogloboquadrina pachyderma (sin)',
 'Neogloboquadrina pachyderma (sinistral)',
 'Neogloboquadrina pachyderma(dex)',
 'Orbulina universa',
 'Pulleniatina coiling (dextral)',
 'Pulleniatina coiling (sinistral)',
 'Pulleniatina finalis',
 'Pulleniatina finalis _B',
 'Pulleniatina finalis _B_',
 'Pulleniatina obliquiloculata',
 'Pulleniatina obliquiloculata (D)',
 'Pulleniatina praecursor',
 'Pulleniatina praespectabilis',
 'Pulleniatina primalis  _Tand B',
 'Pulleniatina primalis  _Tand B_',
 'Sphaeroidinella dahiscens sensu lato',
 'Sphaeroidinella dehiscens',
 'Sphaeroidinella dehiscens s.l.',
 'Sphaeroidinella dehiscens sensu lato _B_',
 'Sphaeroidinellopsis kochi _T',
 'Sphaeroidinellopsis kochi _T_',
 'Sphaeroidinellopsis seminulina',
 'Sphaeroidinellopsis seminulina _T_ _PL4',
 'Sphaeroidinellopsis seminulina _T_ _PL4_',
 'Sphaeroidinellopsis subdehiscens',
 'Turborotalita humilis',
 'Turborotalita quinqueloba',
}

In [12]:
len(taxa_columns)

141

In [13]:
nontaxa_columns = columns - taxa_columns

nontaxa_columns

{'A/W',
 'Additional zone name',
 'Additional zone name (short)',
 'Bottom Depth [m]',
 'Bottom Depth[m] [m]',
 'Bottom [cm]',
 'Bottom depth [m]',
 'Bottom[cm] [cm]',
 'Comment',
 'Comment (general)',
 'Comments',
 'Core',
 'Core-Sect',
 'Datum name',
 'Exp',
 'Extra Sample ID Data',
 'Fragmentation',
 'Fragmentation rank [auto-pop]',
 'Genus/species (upper zone)',
 'Genus/species lower zone)',
 'Group Abundance',
 'Group abundance',
 'Group abundance (%)',
 'Hole',
 'Label ID',
 'PF Preservation',
 'PF group abundance',
 'PF preservation',
 'Preservation',
 'Pteropod group abundance',
 'Sample',
 'Sample comment',
 'Section',
 'Site',
 'Top Depth [m]',
 'Top Depth[m] [m]',
 'Top [cm]',
 'Top depth [m]',
 'Top[cm] [cm]',
 'Type',
 'Type (lower zone)',
 'Upper boundary age av. [Ma]',
 'Zone author (year)',
 'Zone group',
 'Zone name',
 'Zone name (short)',
 'pc_abundance_name_mode',
 'pc_fossil_group',
 'pc_fossil_name',
 'physical_constituent_name'}

In [14]:
len(nontaxa_columns)

50

## create taxa and non-taxa file

taxa csv

In [15]:
taxa_df = pd.DataFrame(taxa_columns, columns=['verbatim_name'])
taxa_df.sort_values('verbatim_name', inplace=True)

print_df(taxa_df)

(141, 1)


Unnamed: 0,verbatim_name
77,Beella digitata
29,Candeina nitida
51,Dentoglobigerina altispira
76,Dentoglobigerina altispira _T_ _PL5
96,Dentoglobigerina altispira _T_ _PL5_


In [16]:
path = Path('..', 'processed_data', 'drafts', 'taxa_list.csv')
taxa_df.to_csv(path, index=False)

non-taxa csv

In [17]:
non_taxa_df = pd.DataFrame(nontaxa_columns, columns=['field'])
non_taxa_df.sort_values('field', inplace=True)

print_df(non_taxa_df)

(50, 1)


Unnamed: 0,field
46,A/W
25,Additional zone name
49,Additional zone name (short)
45,Bottom Depth [m]
16,Bottom Depth[m] [m]


In [18]:
path = Path('..', 'processed_data', 'drafts', 'nontaxa_list.csv')
non_taxa_df.to_csv(path, index=False)

## Normalize headers

In [19]:
for path in paths:
    df = pd.read_csv(path, dtype=str)    
    cols = df.columns
    
    variants = ['Bottom Depth [m]', 'Bottom depth [m]', 'Bottom Depth[m] [m]']
    replacement = 'Bottom Depth [m]'
    cols = normalize_columns(variants, replacement, cols)

    variants = ['Bottom [cm]', 'Bottom[cm] [cm]']
    replacement = 'Bottom [cm]'
    cols = normalize_columns(variants, replacement, cols)

    
    variants = ['Label ID', 'Sample']
    replacement = 'Sample'
    cols = normalize_columns(variants, replacement, cols)

    variants = ['Top Depth [m]', 'Top depth [m]', 'Top Depth[m] [m]']
    replacement = 'Top Depth [m]'
    cols = normalize_columns(variants, replacement, cols)

    variants = ['Top [cm]', 'Top[cm] [cm]']
    replacement = 'Top [cm]'
    cols = normalize_columns(variants, replacement, cols)
    
    variants = ['Comment', 'Comment (general)', 'Comments', 'Sample comment']
    replacement = 'Comment'
    cols = normalize_columns(variants, replacement, cols)
    
    variants = ['Genus/species lower zone)']
    replacement = 'Genus/species (lower zone)'
    cols = normalize_columns(variants, replacement, cols)
    
    variants = ['Group Abundance', 'Group abundance', 'Group abundance (%)']
    replacement = 'Group Abundance'
    cols = normalize_columns(variants, replacement, cols)
    
    variants = ['PF Preservation', 'PF preservation']
    replacement = 'PF preservation'
    cols = normalize_columns(variants, replacement, cols)
    
    df.columns = cols 
    df.to_csv(path, index=False)


## remove bracket text

In [20]:
for path in paths:
    df = pd.read_csv(path, dtype=str)
    
    df = remove_bracket_text(df)
    
    df.to_csv(path, index=False)

## convert sample column to  Exp...A/W colums

In [21]:
for path in paths:
    df = pd.read_csv(path, dtype=str)   
    
    df = normalize_expedition_section_cols(df)
    
    df.to_csv(path, index=False) 

## check mandatory columns

In [22]:
required_columns = {
 'A/W',
 'Bottom [cm]',
 'Bottom Depth [m]',
 'Core',
 'Exp',
 'Hole',
 'Sample',
 'Section',
 'Site',
 'Top [cm]',
 'Top Depth [m]',
 'Type'
}

In [23]:
for path in paths:
    df = pd.read_csv(path, dtype=str)    
    cols = set(df.columns)
    diff = required_columns - cols
    
    if(len(diff) > 0):
        print(path)
        print(required_columns - cols)
    