In [1]:
# Ocean Pollution Data Cleaning & Processing

import pandas as pd
import numpy as np

# Load dataset
file_path = "data/ocean_plastic_pollution_data.csv"  # update path as needed
df = pd.read_csv(file_path)


In [2]:
# Show initial info and head
print("Initial Data Info:")
print(df.info())
print("\nSample rows:")
print(df.head())


Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               15000 non-null  object 
 1   Region             15000 non-null  object 
 2   Latitude           15000 non-null  float64
 3   Longitude          15000 non-null  float64
 4   Plastic_Type       15000 non-null  object 
 5   Plastic_Weight_kg  15000 non-null  float64
 6   Depth_meters       15000 non-null  float64
dtypes: float64(4), object(3)
memory usage: 820.4+ KB
None

Sample rows:
              Date          Region   Latitude   Longitude  \
0  01/01/2015 0:00    Arctic Ocean -58.459627 -169.626456   
1  01/01/2015 1:00  Southern Ocean  49.449892   77.319852   
2  01/01/2015 2:00    Indian Ocean  80.422411  130.332581   
3  01/01/2015 3:00  Southern Ocean -71.920725   58.003563   
4  01/01/2015 4:00  Southern Ocean -61.993742 -169.011282   

 

In [3]:
# 1. Convert Date to datetime type
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# 2. Drop rows with missing essential data
df.dropna(subset=['Date', 'Region', 'Latitude', 'Longitude', 'Plastic_Type', 'Plastic_Weight_kg'], inplace=True)

# 3. Ensure Plastic_Weight_kg is numeric and non-negative
df['Plastic_Weight_kg'] = pd.to_numeric(df['Plastic_Weight_kg'], errors='coerce')
df = df[df['Plastic_Weight_kg'] >= 0]

# 4. Clean Plastic_Type (strip whitespace, lowercase for consistency)
df['Plastic_Type'] = df['Plastic_Type'].str.strip().str.lower()

# 5. Filter Latitude and Longitude to valid ranges
df = df[(df['Latitude'] >= -90) & (df['Latitude'] <= 90)]
df = df[(df['Longitude'] >= -180) & (df['Longitude'] <= 180)]

# 6. Depth_meters - convert to numeric, fill missing with median depth
df['Depth_meters'] = pd.to_numeric(df['Depth_meters'], errors='coerce')
median_depth = df['Depth_meters'].median()
df['Depth_meters'].fillna(median_depth, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Depth_meters'].fillna(median_depth, inplace=True)


In [4]:

# Summary statistics after cleaning
print("\nData Info After Cleaning:")
print(df.info())
print("\nSummary statistics:")
print(df.describe())

# Save cleaned data for further analysis
df.to_csv("ocean_plastic_pollution_data_cleaned.csv", index=False)

# Display the cleaned dataframe
df.head()



Data Info After Cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 6048 entries, 0 to 14903
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Date               6048 non-null   datetime64[ns]
 1   Region             6048 non-null   object        
 2   Latitude           6048 non-null   float64       
 3   Longitude          6048 non-null   float64       
 4   Plastic_Type       6048 non-null   object        
 5   Plastic_Weight_kg  6048 non-null   float64       
 6   Depth_meters       6048 non-null   float64       
dtypes: datetime64[ns](1), float64(4), object(2)
memory usage: 378.0+ KB
None

Summary statistics:
                                Date     Latitude    Longitude  \
count                           6048  6048.000000  6048.000000   
mean   2015-11-25 14:55:42.857142784     0.819876    -0.883596   
min              2015-01-01 00:00:00   -89.980158  -179.998007   
25%          

Unnamed: 0,Date,Region,Latitude,Longitude,Plastic_Type,Plastic_Weight_kg,Depth_meters
0,2015-01-01 00:00:00,Arctic Ocean,-58.459627,-169.626456,polyethylene terephthalate (pet),41.93,73.09
1,2015-01-01 01:00:00,Southern Ocean,49.449892,77.319852,polyethylene (pe),403.38,71.21
2,2015-01-01 02:00:00,Indian Ocean,80.422411,130.332581,polyethylene terephthalate (pet),241.7,19.53
3,2015-01-01 03:00:00,Southern Ocean,-71.920725,58.003563,polyethylene terephthalate (pet),482.76,69.72
4,2015-01-01 04:00:00,Southern Ocean,-61.993742,-169.011282,polyethylene terephthalate (pet),58.75,58.14
