### Load all datasets


In [4]:
import pandas as pd

# Read the previously integrated ocean-plastics table and summarize its numeric columns.
integrated_og_combined_color_count = pd.read_csv(
    filepath_or_buffer='../outputs/integrated_ocean_plastics.csv'
)
integrated_og_combined_color_count.describe()

Unnamed: 0,Year,mp/kg dw,log_concentration
count,2597.0,2729.0,2729.0
mean,2018.641894,13735.35,0.852433
std,0.500755,705134.3,0.829072
min,2013.0,0.0,0.0
25%,2018.0,0.7231181,0.236315
50%,2019.0,3.744196,0.676163
75%,2019.0,18.75361,1.295646
max,2019.0,36836030.0,7.566273


In [11]:
# Per-beach color-count tables; each file holds one beach's survey rows.
xb_color_count = pd.read_csv(
    filepath_or_buffer="../data/Xialiao Beach color count.csv"
)
lb_color_count = pd.read_csv(
    filepath_or_buffer="../data/Longmen Beach color count.csv"
)

### Combine color sets and save as a new CSV file

In [13]:
# Stack the two beach tables row-wise, renumbering the index from 0.
combined_color_count = pd.concat(
    (xb_color_count, lb_color_count),
    axis=0,
    ignore_index=True,
)

# Report the shape of each input and of the stacked result
print(f"Xialiao Beach shape: {xb_color_count.shape}")
print(f"Longmen Beach shape: {lb_color_count.shape}")
print(f"Combined dataframe shape: {combined_color_count.shape}")

# Show the first rows of the stacked table as the cell output
combined_color_count.head()

Xialiao Beach shape: (966, 18)
Longmen Beach shape: (1626, 18)
Combined dataframe shape: (2592, 18)


Unnamed: 0,Date_YYYY-MM-DD,Country_Region,Location_name,Location_lat,Location_lon,Transect,Position,Size_min_mm,Size_max_mm,Size_class,no_color,black,grey,red_pink,orange_brown_yellow,green,blue,purple
0,2018-04-25,Taiwan,Xialiao_Beach,25.21469,121.65406,A,1,1,5,microplastics,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2018-04-25,Taiwan,Xialiao_Beach,25.21469,121.65406,A,2,1,5,microplastics,36.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018-04-25,Taiwan,Xialiao_Beach,25.21469,121.65406,A,3,1,5,microplastics,76.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0
3,2018-04-25,Taiwan,Xialiao_Beach,25.21469,121.65406,A,4,1,5,microplastics,56.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2018-04-25,Taiwan,Xialiao_Beach,25.21469,121.65406,A,5,1,5,microplastics,179.0,0.0,0.0,1.0,1.0,2.0,1.0,0.0


In [14]:
# Write the stacked beach table to the outputs folder, omitting the row index.
combined_color_count.to_csv(
    path_or_buf='../outputs/combined_beach_color_count.csv',
    index=False,
)

### Display dataset info


In [16]:
# Summarize the combined table before any transformation: size and column names.
print("Original dataset info:")
print(f"Number of rows: {combined_color_count.shape[0]}")
print(f"Number of columns: {combined_color_count.shape[1]}")
print(f"Columns: {', '.join(combined_color_count.columns)}")
print()

Original dataset info:
Number of rows: 2592
Number of columns: 18
Columns: Date_YYYY-MM-DD, Country_Region, Location_name, Location_lat, Location_lon, Transect, Position, Size_min_mm, Size_max_mm, Size_class, no_color, black, grey, red_pink, orange_brown_yellow, green, blue, purple



### Create a new column for "Dominant Color"

In [17]:
# Seed the new label column with a placeholder for every row.
combined_color_count.loc[:, "Dominant Color"] = "unknown"
print("Step 1: Added 'Dominant Color' column with default value 'unknown'")

Step 1: Added 'Dominant Color' column with default value 'unknown'


### Define a function to find the dominant color for each row


In [18]:
def find_dominant_color(row, color_cols=None):
    """
    Return the label of the color column holding the largest count in a row.

    Parameters
    ----------
    row : mapping-like
        Anything that supports ``row[col]`` for every name in ``color_cols``,
        e.g. the Series passed by ``DataFrame.apply(..., axis=1)`` or a dict.
    color_cols : sequence of str, optional
        Names of the columns to compare. Defaults to the eight color-count
        columns listed below.

    Returns
    -------
    str
        - "unknown" if every value is missing or the largest value is 0.
        - "transparent" if "no_color" holds the largest value.
        - Otherwise the name of the column holding the largest value.
        When several columns tie for the largest value, the one listed
        first in ``color_cols`` wins.
    """
    if color_cols is None:
        color_cols = ['no_color', 'black', 'grey', 'red_pink',
                      'orange_brown_yellow', 'green', 'blue', 'purple']

    # Drop missing counts before comparing. The built-in max() is
    # order-dependent when NaN is present (every comparison with NaN is
    # False), so a NaN in the first column would otherwise be reported as
    # the maximum and the row mislabelled as "transparent".
    present = [(col, row[col]) for col in color_cols if not pd.isna(row[col])]
    if not present:
        return "unknown"

    max_val = max(value for _, value in present)

    # A largest count of 0 means no colored or transparent particle was recorded
    if max_val == 0:
        return "unknown"

    # First column (in color_cols order) that reaches the maximum
    max_col = next(col for col, value in present if value == max_val)

    # "no_color" is reported under the friendlier label "transparent"
    return "transparent" if max_col == "no_color" else max_col

### Apply the function to each row and drop the numerical color columns

In [19]:
# Step 2: label every row by passing it through find_dominant_color
combined_color_count["Dominant Color"] = combined_color_count.apply(
    find_dominant_color,
    axis=1,
)
print("Step 2: Applied function to determine the dominant color for each row")

# Show how many rows received each label
print("\nDominant color distribution:")
print(combined_color_count["Dominant Color"].value_counts())
print()

# Step 3: the per-color count columns are no longer needed once the label exists
color_cols = [
    'no_color',
    'black',
    'grey',
    'red_pink',
    'orange_brown_yellow',
    'green',
    'blue',
    'purple',
]

combined_color_count = combined_color_count.drop(color_cols, axis="columns")
print("Step 3: Dropped numerical color columns")

Step 2: Applied function to determine the dominant color for each row

Dominant color distribution:
Dominant Color
transparent            1833
unknown                 650
black                    26
green                    22
grey                     18
orange_brown_yellow      15
red_pink                 14
blue                     13
purple                    1
Name: count, dtype: int64

Step 3: Dropped numerical color columns


### Display information about the transformations and save the modified dataset

In [20]:
# Summarize the transformed table: size and remaining column names
print("\nFinal dataset info:")
print(f"Number of rows: {combined_color_count.shape[0]}")
print(f"Number of columns: {combined_color_count.shape[1]}")
print(f"Columns: {', '.join(combined_color_count.columns)}")
print()

# Write the transformed table to CSV without the row index.
# NOTE(review): this path is relative to the working directory, whereas the
# earlier combined-table save goes to '../outputs/' - confirm this is intended.
output_file = "beach_data_with_dominant_color.csv"
combined_color_count.to_csv(path_or_buf=output_file, index=False)
print(f"Step 4: Modified dataset saved to '{output_file}'")

# Plain-text preview of the first five transformed rows
print("\nSample of transformed data (first 5 rows):")
print(combined_color_count.head(5).to_string())



Final dataset info:
Number of rows: 2592
Number of columns: 11
Columns: Date_YYYY-MM-DD, Country_Region, Location_name, Location_lat, Location_lon, Transect, Position, Size_min_mm, Size_max_mm, Size_class, Dominant Color

Step 4: Modified dataset saved to 'beach_data_with_dominant_color.csv'

Sample of transformed data (first 5 rows):
  Date_YYYY-MM-DD Country_Region  Location_name  Location_lat  Location_lon Transect  Position  Size_min_mm  Size_max_mm     Size_class Dominant Color
0      2018-04-25         Taiwan  Xialiao_Beach      25.21469     121.65406        A         1            1            5  microplastics    transparent
1      2018-04-25         Taiwan  Xialiao_Beach      25.21469     121.65406        A         2            1            5  microplastics    transparent
2      2018-04-25         Taiwan  Xialiao_Beach      25.21469     121.65406        A         3            1            5  microplastics    transparent
3      2018-04-25         Taiwan  Xialiao_Beach      25.21