## Notes for running this notebook
### Set `ACCESS_KEY` and `SECRET_KEY` to your AWS credentials before running.


In [1]:
import pandas as pd
import s3fs
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from boto3.session import Session

In [None]:
# AWS credentials are read from the standard environment variables so that no
# secret has to be typed into (and saved with) the notebook. When a variable is
# unset the value is None, and boto3 then falls back to its default credential
# chain (shared credentials file, instance role, ...).
import os

ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')

session = Session(aws_access_key_id=ACCESS_KEY,
                  aws_secret_access_key=SECRET_KEY)
s3 = session.resource('s3')
your_bucket = s3.Bucket('bathymetry')

# Print every object key in the bucket as a quick access check.
for s3_file in your_bucket.objects.all():
    print(s3_file.key)

In [None]:
# Read in data from a subset of a region's dataset.
import random 

def get_data_from_s3(prefix, is_jamstec, is_test, num_samples=50):
    """Read a random selection of a region's part files into one DataFrame.

    ``prefix`` is the path of the region's first part file without extension;
    it must end in a zero-padded index (e.g. ``...-part00000``). For every
    sampled index the trailing digits of ``prefix`` are replaced by that index,
    ``.tsv`` is appended, and the file is read with
    ``pd.read_csv(path, sep=' ', header=None)``.

    Parameters
    ----------
    prefix : str
        Path ending in the zero-padded index of part 0.
    is_jamstec : bool
        If true, part indices are drawn from ``range(175)``.
    is_test : bool
        If true (and ``is_jamstec`` is false), all 108 indices of
        ``range(108)`` are read, in random order; ``num_samples`` is ignored.
    num_samples : int, default 50
        Number of part files to read in the non-test branches. With 500 part
        files, 100 corresponds to 20% and 250 to 50% of a region's dataset.
        Values larger than the number of part files are capped.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in read order (each file keeps its own
        row index). An empty DataFrame when nothing was sampled.
    """
    if is_jamstec:
        num_parts, wanted = 175, num_samples
    elif is_test:
        num_parts, wanted = 108, 108
    else:
        num_parts, wanted = 500, num_samples

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than the number of part files.
    rand_list = random.sample(range(num_parts), min(wanted, num_parts))

    frames = []
    for counter, r in enumerate(rand_list, start=1):
        digits = str(r)
        # Overwrite as many trailing zeros as the index has digits so the
        # zero-padded width of the file name is preserved.
        new_prefix = prefix[:-len(digits)] + digits + '.tsv'
        if counter % 10 == 0:
            print(new_prefix, " ", counter)
        frames.append(pd.read_csv(new_prefix, sep=' ', header=None))

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames)


In [None]:
# takes fewer lines from each file but looks at more files 
import random 

def get_more_data_from_s3(prefix, is_jamstec, is_test, num_samples=200, nrows=1000):
    """Read the first rows of many randomly chosen part files into one DataFrame.

    ``prefix`` is the path of the region's first part file without extension;
    it must end in a zero-padded index (e.g. ``...-part00000``). For every
    sampled index the trailing digits of ``prefix`` are replaced by that index,
    ``.tsv`` is appended, and the file is read with
    ``pd.read_csv(path, sep=' ', header=None, nrows=nrows)``.

    Parameters
    ----------
    prefix : str
        Path ending in the zero-padded index of part 0.
    is_jamstec : bool
        If true, part indices are drawn from ``range(175)``.
    is_test : bool
        If true (and ``is_jamstec`` is false), part indices are drawn from
        ``range(108)``; otherwise from ``range(500)``.
    num_samples : int, default 200
        Number of part files to read. With 500 part files, 100 corresponds to
        20% and 250 to 50% of a region's dataset. Requests larger than the
        number of part files are capped, because ``random.sample`` raises
        ``ValueError`` for them (the default of 200 exceeds both 175 and 108).
    nrows : int, default 1000
        Maximum number of rows read from each file.

    Returns
    -------
    pandas.DataFrame
        The rows of all files stacked in read order (each file keeps its own
        row index). An empty DataFrame when nothing was sampled.
    """
    if is_jamstec:
        num_parts = 175
    elif is_test:
        num_parts = 108
    else:
        num_parts = 500

    rand_list = random.sample(range(num_parts), min(num_samples, num_parts))

    frames = []
    for counter, r in enumerate(rand_list, start=1):
        digits = str(r)
        # Overwrite as many trailing zeros as the index has digits so the
        # zero-padded width of the file name is preserved.
        new_prefix = prefix[:-len(digits)] + digits + '.tsv'
        if counter % 10 == 0:
            print(new_prefix, " ", counter)
        frames.append(pd.read_csv(new_prefix, sep=' ', header=None, nrows=nrows))

    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames)

In [None]:
# JAMSTEC region: both selector flags are off for this load.
prefix = 's3://bathymetry/JAMSTEC/JAMSTEC-part00000'
jamstec_df = get_more_data_from_s3(prefix, is_jamstec=False, is_test=False)
jamstec_df.head()

In [None]:
# JAMSTEC2 region: loaded with the is_jamstec flag switched on.
prefix = 's3://bathymetry/JAMSTEC2/JAMSTEC2-part00000'
jamstec2_df = get_more_data_from_s3(prefix, is_jamstec=True, is_test=False)
jamstec2_df.head()

In [None]:
# NGDC region: both selector flags are off for this load.
prefix = 's3://bathymetry/NGDC/NGDC-part00000'
ngdc_df = get_more_data_from_s3(prefix, is_jamstec=False, is_test=False)
ngdc_df.head()

In [None]:
# SIO region: both selector flags are off for this load.
prefix = 's3://bathymetry/SIO/SIO-part00000'
sio_df = get_more_data_from_s3(prefix, is_jamstec=False, is_test=False)
sio_df.head()

In [None]:
# US_multi region: both selector flags are off for this load.
prefix = 's3://bathymetry/US_multi/US_multi-part00000'
us_multi_df = get_more_data_from_s3(prefix, is_jamstec=False, is_test=False)
us_multi_df.head()

In [None]:
# Test set: loaded with the is_test flag switched on.
prefix = 's3://bathymetry/test/test-part00000'
test_df = get_more_data_from_s3(prefix, is_jamstec=False, is_test=True)
test_df.head()

In [None]:
def processing_df(df):
    """Name the raw columns, binarise ``sigd`` and return an all-numeric frame.

    Steps, in order:

    1. Label the 36 positional columns with the names in ``cols``.
    2. Replace ``sigd`` by a flag: 0 where the value equals 9999, 1 otherwise.
       The comparison is done numerically, so it works whether the column was
       parsed as numbers or as text.
    3. Drop the ``SID``, ``ID`` and ``kind`` columns.
    4. Drop every row that contains the literal text ``"NaN"`` in any column.
    5. Convert every remaining column with ``pd.to_numeric``.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with exactly 36 columns, in the order of ``cols``.

    Returns
    -------
    pandas.DataFrame
        Processed copy with 33 numeric columns.

    Raises
    ------
    ValueError
        If ``df`` does not have 36 columns, or a remaining value cannot be
        converted to a number.
    """
    cols = [
        "lon", "lat", "depth", "sigh", "sigd", "SID", "pred", "ID",
        "(pred-depth)/depth", "d10", 'd20', "d60", "age", "VGG", "rate",
        "sed", "roughness", "G:T",
        "NDP2.5m", "NDP5m", "NDP10m", "NDP30m",
        "STD2.5m", "STD5m", "STD10m", "STD30m",
        "MED2.5", "MED5m", "MED10m", "MED30m",
        "D-MED2.5m/STD2.5m", "D-MED5m/STD5m", "D-MED10m/STD10m",
        "D-MED30m/STD30m", "year", "kind",
    ]
    if df.shape[1] != len(cols):
        raise ValueError(
            "processing_df expects %d columns, got %d"
            % (len(cols), df.shape[1])
        )

    # Work on a copy so the caller's frame keeps its original labels/values.
    df = df.copy()
    df.columns = cols

    # Count number of good vs. bad cruises, using sigd=9999 for false.
    # Values that are not numbers at all count as "not 9999", i.e. good.
    is_bad = pd.to_numeric(df['sigd'], errors='coerce') == 9999
    df['sigd'] = (~is_bad).astype(int)

    df = df.drop(columns=['SID', 'ID', 'kind'])

    # NOTE(review): only the literal text "NaN" is filtered here; values that
    # were already parsed as missing (float NaN) are kept - confirm intent.
    has_nan_text = df.isin(["NaN"]).any(axis=1)
    df = df.loc[~has_nan_text].copy()

    # Convert all columns to numeric dtypes.
    for c in df.columns:
        df[c] = pd.to_numeric(df[c])

    return df

In [None]:
# Run every regional frame through processing_df, in the same order as the
# names on the left, and rebind each name to its processed result.
jamstec_df, jamstec2_df, ngdc_df, us_multi_df, sio_df, test_df = [
    processing_df(frame)
    for frame in (jamstec_df, jamstec2_df, ngdc_df, us_multi_df, sio_df, test_df)
]

In [None]:
# Column labels of the processed JAMSTEC frame; kept in new_cols and printed.
new_cols = jamstec_df.columns
print(new_cols)

# Create 2D mappings of the dataset using latitude and longitude: 

In [None]:
# colors, labels and list_dfs are parallel sequences: entry k of each one
# describes the same region.
colors = ['blue', 'orange', 'green', 'red', 'purple', 'gray']
labels = ["JAMSTEC", "JAMSTEC2", "SIO", "NGDC", "US_MULTI", "TEST"]
list_dfs = [jamstec_df, jamstec2_df, sio_df, ngdc_df, us_multi_df, test_df]

plt.rcParams.update({'font.size': 22})
plt.figure(figsize=(20,10))

# One scatter layer per region, all drawn on the same figure.
for region_df, region_color, region_label in zip(list_dfs, colors, labels):
    print(region_label)
    plt.scatter(
        region_df['lon'].values,
        region_df['lat'].values,
        c=region_color,
        label=region_label,
    )

plt.title('Mappings of different cruise regions')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.legend()

plt.show()


In [None]:
colors = ['blue', 'orange', 'green', 'red', 'purple', 'gray']
labels = ["JAMSTEC", "JAMSTEC2", "SIO", "NGDC", "US_MULTI", "TEST"]
list_dfs = [jamstec_df, jamstec2_df, sio_df, ngdc_df, us_multi_df, test_df]

# Every entry except the last one (the TEST frame) gets its own figure, with
# the TEST points drawn on top in gray.
for i, region_df in enumerate(list_dfs[:-1]):
    plt.rcParams.update({'font.size': 22})
    plt.figure(figsize=(20,10))

    lon = region_df['lon'].values
    lat = region_df['lat'].values

    plt.scatter(x=lon, y=lat, c=colors[i], label=labels[i])
    plt.scatter(x=test_df['lon'], y=test_df['lat'], c='gray', label="TEST")

    plt.title('Mappings of different cruise regions')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.legend()

    plt.show()

# Create 3D mappings of the dataset using latitude, longitude, and year: 

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# 3D scatter (longitude, latitude, year) of JAMSTEC against the TEST set.
fig = plt.figure(figsize=(12,10))
ax = fig.add_subplot(111, projection='3d')

df = jamstec_df

# Both layers share marker and size; only the frame, color and label differ.
for layer_df, layer_color, layer_label in ((df, 'r', 'JAMSTEC'), (test_df, 'gray', 'TEST')):
    ax.scatter(layer_df['lon'], layer_df['lat'], layer_df['year'],
               s=100, c=layer_color, marker='o', label=layer_label)

# Same label padding on all three axes.
for set_axis_label, axis_text in ((ax.set_xlabel, 'Longitude'),
                                  (ax.set_ylabel, 'Latitude'),
                                  (ax.set_zlabel, 'Year')):
    set_axis_label(axis_text, labelpad=20)
ax.dist = 10

plt.legend()
plt.show()

In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# 3D scatter (longitude, latitude, year) of JAMSTEC2 against the TEST set.
fig = plt.figure(figsize=(12,10))
ax = fig.add_subplot(111, projection='3d')

df = jamstec2_df

# Both layers share marker and size; only the frame, color and label differ.
for layer_df, layer_color, layer_label in ((df, 'r', 'JAMSTEC2'), (test_df, 'gray', 'TEST')):
    ax.scatter(layer_df['lon'], layer_df['lat'], layer_df['year'],
               s=100, c=layer_color, marker='o', label=layer_label)

# Same label padding on all three axes.
for set_axis_label, axis_text in ((ax.set_xlabel, 'Longitude'),
                                  (ax.set_ylabel, 'Latitude'),
                                  (ax.set_zlabel, 'Year')):
    set_axis_label(axis_text, labelpad=20)
ax.dist = 10

plt.legend()
plt.show()