In [None]:
import inspect
import os
import sys
import time

import numpy as np
import pandas as pd

In [None]:
# Dataset overview (table describing the dataset):
# https://www.nature.com/articles/s41597-019-0219-y/tables/2?draft=collection
#
# Setup:
#   1. Download the dataset and place it in a folder reachable from this notebook.
#   2. Point DATA_PATH at the CSV file.
#
# The listing printed below shows which files sit in the notebook's directory,
# which helps when working out the right relative path.

DATA_PATH = './m2b_marina_biota/190516_m2b_marine_biota.csv'

notebook_dir_entries = os.listdir('./')
print(notebook_dir_entries)

In [None]:
# Load the raw semicolon-separated file, keep the columns of interest, drop
# incomplete rows, integer-encode the string columns and export the result to
# "dataset.csv". The cleaned frame stays available as `df`.

# Value the dataset uses in numerical columns to mean "no measurement".
MISSING_SENTINEL = -9999.0

relevant_columns = [
    'samp_date', # sampling date; reduced to an integer 'year' column below
    'country',
    'mea_ug_kg_orig', # mean mercury concentration
    'troph_lev',
    'mar_habit',
    'location', # NB! does it make sense to have both location and country?
    'lenght_cm',
    'weight_g',
    'tissue_cod',
    'spec_nam_s',
    'age_y'
]

# string valued columns that get replaced by integer ids
cols_to_encode = [
    'country',
    'mar_habit',
    'location',
    'spec_nam_s',
    'tissue_cod'
]

raw_df = pd.read_csv(DATA_PATH, delimiter=";")

# Selecting with [...] (rather than read_csv(usecols=...)) fixes the column
# order to `relevant_columns`, which determines the layout of the exported files.
df = (
    raw_df[relevant_columns]
    # dropna() returns a new frame; the result must be kept, otherwise the
    # rows with missing values silently stay in the data.
    .dropna()
    # parse 'YYYY-MM-DD' dates to integer years ...
    .assign(samp_date=lambda d: pd.to_datetime(d['samp_date'], format='%Y-%m-%d').dt.year)
    # ... and rename the column to 'year'
    .rename(columns={'samp_date': 'year'})
)

# remove every row in which any column holds the sentinel value
print(f"Shape before trim: {df.shape}")
has_sentinel = df.eq(MISSING_SENTINEL).any(axis=1)
df = df[~has_sentinel].copy() # copy so the column assignments below act on an independent frame
print(f"Shape after trim: {df.shape}")

# Encode string valued columns to integer ids. Ids follow order of first
# appearance, so this has to run before the rows are sorted.
# category_keys[col][i] is the original value behind id i
# (e.g. country codes such as IT, IR etc.).
# TO DO: export category_keys alongside the dataset
category_keys = {}
for col in cols_to_encode:
    codes, uniques = pd.factorize(df[col])
    df[col] = codes
    category_keys[col] = uniques

# order chronologically and restart the row count from 0
df = df.sort_values(by=['year']).reset_index(drop=True)

# export to CSV
print(f"Exporting dataset with {df.shape[1]} columns and {df.shape[0]} rows")
df.to_csv("dataset.csv", index_label='id')

In [None]:
# Split the dataset into three column groups (blue, yellow and red) and write
# each group to "<color>.txt". Every row is written as
#   <year> <country> <group columns ...>;
# i.e. space separated and terminated by ';'. No id/index column and no header
# line are written (index=False, header=False).

# only use the rows whose encoded country id is below this number
N_COUNTRIES = 20
df = df[df['country'] < N_COUNTRIES]

colors = ['blue', 'yellow', 'red']

cols = df.columns.to_list()[2:] # skip the two leading columns: year and country
# Columns per group. The number of cols must be a multiple of 3 as of now:
# any remainder columns are left out of all three files.
idx = len(cols) // len(colors)

rgb_cols = [cols[i * idx:(i + 1) * idx] for i in range(len(colors))]
blue_cols, yellow_cols, red_cols = rgb_cols
rgb_df = [df[['year', 'country'] + group] for group in rgb_cols]

# pandas 1.5 renamed to_csv's `line_terminator` argument to `lineterminator`
# and pandas 2.0 removed the old spelling, so use whichever name the installed
# version accepts.
to_csv_params = inspect.signature(pd.DataFrame.to_csv).parameters
terminator_kwarg = 'lineterminator' if 'lineterminator' in to_csv_params else 'line_terminator'
row_end = {terminator_kwarg: ";" + os.linesep}

for color, frame in zip(colors, rgb_df):
    print(f"Exporting dataset {color} with {frame.shape[1]} columns and {frame.shape[0]} rows")
    frame.to_csv(
        f"{color}.txt",
        sep=" ",
        header=False,
        index=False,
        **row_end
    )