# Data Cleaning and Preparation

#### Question 1: Import appropriate package and load the dataset
Using pandas, load the *black_sea_water.csv* file into a dataframe.

In [6]:
import pandas as pd
import numpy as np
# Dataset location, given as a relative path
file_path = 'black_sea_water.csv'
# Read the CSV into the dataframe used by the cleaning steps below
df = pd.read_csv(filepath_or_buffer=file_path)

#### Question 2: Complete the 'handle_missing_values' function to handle missing values in the dataset. 

Iterate over each column of the dataframe **df_filled**. Using **fillna()**, fill the ***NaN*** values in each column with the column's average (mean) value if it is a numeric column, or with the string "NA" if it is a text column.

*Hint* : You can use numpy to detect if the column's data type is a number with the following condition:
-  **if np.issubdtype(df_filled[column].dtype, np.number)**


In [7]:
# Function to handle missing values for both text and numeric columns

def handle_missing_values(df):
    """Return a copy of ``df`` with its missing values filled column by column.

    Numeric columns (integer, float or complex dtypes; booleans are not
    counted as numeric) get their missing entries replaced by the column mean
    rounded to two decimals. Every other column gets its missing entries
    replaced by the string ``'NA'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``.
    """
    df_filled = df.copy()

    for column in df_filled.columns:
        dtype = df_filled[column].dtype
        # pandas' dtype helpers are used rather than np.issubdtype because the
        # latter raises TypeError on pandas extension dtypes (for example the
        # 'string' dtype), which NumPy cannot interpret.
        is_number = (pd.api.types.is_numeric_dtype(dtype)
                     and not pd.api.types.is_bool_dtype(dtype))

        if is_number:
            # The mean skips missing entries; it is rounded to 2 decimals
            # before being used as the fill value.
            mean_value = round(df_filled[column].mean(), 2)
            df_filled[column] = df_filled[column].fillna(mean_value)
        else:
            df_filled[column] = df_filled[column].fillna('NA')

    return df_filled

In [8]:
# Fill the missing values of df with handle_missing_values, keep the result
# under a new name, and display its first 10 rows.

df_filled_na = handle_missing_values(df)
df_filled_na.head(10)

Unnamed: 0,Event,Date/Time,Latitude,Longitude,Station,Depth water [m],Temp [°C],Sal,DO,[PO4]3-,[NO2]-,[NO3]-,Chl
0,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,3,19.7,34.15,4.53,0.015,0.01,0.42,0.12
1,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,10,19.75,34.36,4.86,0.03,0.01,0.18,0.13
2,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,20,21.12,35.53,4.87,0.016,0.01,0.16,0.18
3,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,30,19.8,37.05,5.12,0.015,0.02,0.15,0.22
4,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,50,17.54,38.76,5.46,0.04,0.02,0.21,0.22
5,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,75,16.84,39.07,5.23,0.027,0.15,0.26,0.14
6,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,3,19.73,34.57,5.23,0.008,0.01,0.12,0.1
7,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,10,20.05,35.42,5.24,0.013,0.01,0.1,0.11
8,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,20,19.42,37.41,5.12,0.011,0.01,0.11,0.12
9,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,30,20.0,38.48,5.44,0.021,0.01,0.24,0.2


#### Question 3: Complete the 'handle_outliers' function to handle outliers in the dataset
As in Question 2, iterate over each column of the dataframe and check whether the column's data type is numeric. If it is, identify the outliers using the interquartile range (IQR) method taught in class.

In [11]:
# Function to handle outliers for numeric columns

def handle_outliers(df):
    """Return a copy of ``df`` without the rows that contain an IQR outlier.

    For every numeric column (integer, float or complex dtypes; booleans are
    not counted as numeric) the quartiles Q1 and Q3 are computed and a value is
    considered an outlier when it falls outside
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` with ``IQR = Q3 - Q1``. A row is
    removed when it holds an outlier in at least one numeric column.

    All bounds are computed from the frame as it was passed in, so the result
    does not depend on the order of the columns. Missing values are not
    treated as outliers: rows are never dropped merely because a numeric
    value is missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows with their original index
        labels.
    """
    # One flag per row; a row stays only if every numeric column accepts it.
    keep = np.ones(len(df), dtype=bool)

    for column in df.columns:
        values = df[column]
        dtype = values.dtype
        # pandas' dtype helpers are used rather than np.issubdtype because the
        # latter raises TypeError on pandas extension dtypes.
        is_number = (pd.api.types.is_numeric_dtype(dtype)
                     and not pd.api.types.is_bool_dtype(dtype))
        if not is_number:
            continue

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        within_bounds = (values >= lower_bound) & (values <= upper_bound)
        # Comparisons against a missing value are never true, so missing
        # entries are accepted explicitly instead of being dropped.
        keep &= (within_bounds | values.isna()).to_numpy(dtype=bool)

    return df[keep].copy()

In [13]:
# Remove the outlier rows of df with handle_outliers and display the first rows.
# Note: the raw df is passed here, not df_filled_na, so the missing values have
# not been filled at this point.

df_without_outliers = handle_outliers(df)
df_without_outliers.head()

Unnamed: 0,Event,Date/Time,Latitude,Longitude,Station,Depth water [m],Temp [°C],Sal,DO,[PO4]3-,[NO2]-,[NO3]-,Chl
1,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,10,19.75,34.36,4.86,0.03,0.01,0.18,0.13
3,Aegaeo_2013-10_AMT-2,2013-10-10,39.25,25.446,AMT-2,30,19.8,37.05,5.12,0.015,0.02,0.15,0.22
6,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,3,19.73,34.57,5.23,0.008,0.01,0.12,0.1
7,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,10,20.05,35.42,5.24,0.013,0.01,0.1,0.11
8,Aegaeo_2013-10_AMT-3,2013-10-10,39.108,25.435,AMT-3,20,19.42,37.41,5.12,0.011,0.01,0.11,0.12


#### Question 4: Complete the 'handle_duplicates' function to remove duplicates in the dataset
Drop the duplicate rows from the **df** dataframe and save the result in the **df_deduplicated** variable. Print the shape of the original dataset and the shape of the deduplicated dataset to compare them.

In [18]:
# Function to handle duplicates
def handle_duplicates(df):
    """Drop fully duplicated rows and report the shapes before and after.

    The shape of the input frame and the shape of the deduplicated frame are
    printed, in that order, so they can be compared.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to deduplicate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``df.drop_duplicates()``.
    """
    unique_rows = df.drop_duplicates()

    report = (
        ("Original DataFrame shape:", df),
        ("Deduplicated DataFrame shape:", unique_rows),
    )
    for label, frame in report:
        print(label, frame.shape)

    return unique_rows

In [19]:
# Drop the duplicate rows of df with handle_duplicates; the function prints
# the shape before and after so the two can be compared.

df_deduplicated = handle_duplicates(df)

Original DataFrame shape: (79, 13)
Deduplicated DataFrame shape: (72, 13)


#### Question 5: Complete the 'standardize_data' function to standardize the 'Species' column in the dataset
Using the following dataset, standardize the *Species* column.

In [21]:
# Small sample frame whose species names are deliberately inconsistent
# (stray spaces, mixed upper/lower case) for the standardization exercise.
data = {
    'Species': [
        ' blue whale ',
        'great White shark',
        'dolphin',
        'BLUE whale',
        'doLPHin',
    ],
    'Count': [5, 2, 13, 4, 11],
}
df_ocean_species = pd.DataFrame(data=data)

In [22]:
# Function to address inconsistency and standardize data

def standardize_data(df, column='Species'):
    """Return a copy of ``df`` with the text of one column standardized.

    Each string in ``column`` has its surrounding whitespace removed, runs of
    internal whitespace collapsed to a single space, and is converted to title
    case, so that variants such as ``' blue whale '`` and ``'BLUE whale'``
    both become ``'Blue Whale'``. The rule is applied to every value rather
    than to a fixed list of known names, so new species are standardized the
    same way.

    Title casing follows ``str.title``: the first letter of every run of
    letters is upper-cased, which also applies after hyphens and apostrophes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to standardize. It is not modified.
    column : str, default 'Species'
        Name of the text column to standardize.

    Returns
    -------
    pandas.DataFrame
        A new frame identical to ``df`` except for the standardized column.
    """
    df_standardized = df.copy()
    df_standardized[column] = (
        df_standardized[column]
        .str.strip()
        # Collapse internal runs of whitespace so 'blue  whale' matches 'blue whale'.
        .str.replace(r'\s+', ' ', regex=True)
        .str.title()
    )
    return df_standardized

In [23]:
# Standardize the species names
# Standardize the species names; standardize_data returns a new frame, so
# df_ocean_species keeps its original values for the comparison below.
df_standardized_species = standardize_data(df_ocean_species)

# Display the original and the standardized DataFrame
print("Original Dataframe:\n", df_ocean_species)
print("\nStandardized DataFrame:\n", df_standardized_species)

Original Dataframe:
              Species  Count
0        blue whale       5
1  great White shark      2
2            dolphin     13
3         BLUE whale      4
4            doLPHin     11

Standardized DataFrame:
              Species  Count
0         Blue Whale      5
1  Great White Shark      2
2            Dolphin     13
3         Blue Whale      4
4            Dolphin     11
