# Week 5, Class 3: Data Cleaning and Manipulation

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the experiment measurements from the CSV file.
df = pd.read_csv('experiment_data.csv')

# Add a derived column: element-wise pH level divided by weight.
df['Ratio'] = df['pH_Level'].div(df['Weight(g)'])

print(df)

In [None]:
# Remove the derived 'Ratio' column again. drop() returns a new DataFrame,
# so the result is bound back to `df`.
# (In-place alternative: del df['Ratio'])
df = df.drop(columns='Ratio')

print(df)

In [None]:
# ~

## Task 1: Load the 'Experiments' sheet. How many rows and columns are there?

First, we need to load the data from the `Experiments` sheet. We can then use the `.shape` attribute to get the number of rows and columns.

In [None]:
# Read the 'Experiments' worksheet from the Excel workbook
experiments_df = pd.read_excel('data.xlsx', sheet_name='Experiments')

# .shape is a (number_of_rows, number_of_columns) tuple
rows = experiments_df.shape[0]
columns = experiments_df.shape[1]

print(rows, columns)

## Task 2: Show the first 8 rows and the last 3 rows.

We can use the `.head()` and `.tail()` methods to view specific numbers of rows from the beginning and end of the DataFrame, respectively.

In [None]:
# Top of the table: the first 8 rows
first_8_rows = experiments_df.head(n=8)
print(first_8_rows)

# Bottom of the table: the last 3 rows
last_3_rows = experiments_df.tail(n=3)
print(last_3_rows)

## Task 3: What are the dtypes? Convert 'dose_mg' to float and 'date' to datetime.

The `.dtypes` attribute shows the data type of each column. We'll use `pd.to_numeric()` to handle the `dose_mg` column, which may contain non-numeric values, and `pd.to_datetime()` for the `date` column.

In [None]:
# Inspect the dtypes as they were inferred when the sheet was loaded
print(experiments_df.dtypes)

# Convert 'dose_mg' to float.
# errors='coerce' turns entries that cannot be parsed as numbers into NaN
# instead of raising, and astype(float) guarantees a float64 column even when
# every parsed value happens to be a whole number (to_numeric alone would
# return int64 in that case).
experiments_df['dose_mg'] = pd.to_numeric(
    experiments_df['dose_mg'], errors='coerce'
).astype(float)

# Convert 'date' to datetime
experiments_df['date'] = pd.to_datetime(experiments_df['date'])

# Show the dtypes again to confirm both conversions
print()
print(experiments_df.dtypes)

## Task 4: How many missing values are there in each column? Fill missing 'response' with the group median per 'treatment'.

We'll use `.isnull().sum()` to count missing values. For filling the missing 'response' values, we'll apply a common strategy of using the median of the corresponding treatment group, which is a more robust approach than using the overall median.

In [None]:
# Snapshot the per-column NaN counts before anything is filled
missing_values_count = experiments_df.isnull().sum()

# For every row, look up the median 'response' of that row's 'treatment'
# group, then use it only where 'response' is currently missing
experiments_df['response'] = experiments_df['response'].fillna(
    value=experiments_df.groupby('treatment')['response'].transform('median')
)

print(
    f"Original missing values:\n{missing_values_count}\n\n"
    f"Missing values after filling:\n{experiments_df.isnull().sum()}"
)

## Task 5: Strip whitespace and lowercase the 'species' values. Count unique species.

The `.str` accessor in Pandas allows us to apply string methods to an entire `Series`. We'll chain the `.str.strip()` and `.str.lower()` methods. Then, we can use `value_counts()` or `nunique()` to find the unique species.

In [None]:
# Normalise 'species' in two passes: trim surrounding whitespace, then lowercase
experiments_df['species'] = experiments_df['species'].str.strip()
experiments_df['species'] = experiments_df['species'].str.lower()

# Frequency of each distinct species label
species_counts = experiments_df['species'].value_counts()

# Number of distinct species labels
unique_species_count = experiments_df['species'].nunique()

print(
    f"Number of unique species: {unique_species_count}\n\n"
    f"Counts of each unique species:\n{species_counts}"
)

## Task 6: From 'Experiments', compute mean and std of 'response' by 'treatment' and 'dose_mg'.

We'll use the powerful `.groupby()` method on two columns (`treatment` and `dose_mg`) and then use the `.agg()` method to apply both the mean and standard deviation functions to the `response` column.

In [None]:
# One group per (treatment, dose_mg) pair; summarise 'response' in each group
# with its mean and its standard deviation
agg_results = (
    experiments_df
    .groupby(['treatment', 'dose_mg'])['response']
    .agg(['mean', 'std'])
)

print(agg_results)

## Task 7: In 'Sensors_TimeSeries', resample each sensor to 30-minute means. Forward-fill gaps up to 2 periods.

This task requires us to work with time-series data. We'll first load the `Sensors_TimeSeries` data and convert the `timestamp_utc` column to a proper datetime object. Then, we can set it as the index and use `.resample()` to group the data into 30-minute intervals and calculate the mean. Finally, we'll use `.ffill(limit=2)` to forward-fill any small gaps.

In [None]:
# Load the 'Sensors_TimeSeries' data
sensors_df = pd.read_excel('data.xlsx', sheet_name='Sensors_TimeSeries')
print(sensors_df.head())

# Parse the timestamps so they can act as a DatetimeIndex for resampling
sensors_df['timestamp_utc'] = pd.to_datetime(sensors_df['timestamp_utc'])

# Resample each sensor separately into 30-minute bins and average every bin.
# The result is indexed by (sensor_id, timestamp_utc).
resampled_data = (
    sensors_df
    .set_index('timestamp_utc')
    .groupby('sensor_id')
    .resample('30min')
    .mean()
)

# Forward-fill gaps of at most 2 consecutive bins *within each sensor*.
# The fill is grouped by the sensor_id index level because a plain
# .ffill() on the stacked result would carry the last reading of one sensor
# into any leading gaps of the sensor that follows it.
resampled_data = resampled_data.groupby(level='sensor_id').ffill(limit=2)

print()
print(resampled_data.head())

## Task 8: Merge 'Gene_Expression_Wide' with 'Sample_Metadata' on sample_id (columns starting with 'S'). Bring 'condition' and 'batch' alongside expression values.

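In `Gene_Expression_Wide` the sample IDs are the column headers (the columns starting with 'S'), whereas in `Sample_Metadata` they are values in the `sample_id` column. We therefore transpose the expression table so that each sample becomes a row, index the metadata by `sample_id`, and use `merge()` on the two indexes to bring `condition` and `batch` alongside the expression values.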

In [None]:
# Load the wide expression table and the per-sample metadata
gene_expression_df = pd.read_excel('data.xlsx', sheet_name='Gene_Expression_Wide')
sample_metadata_df = pd.read_excel('data.xlsx', sheet_name='Sample_Metadata')

# Index the metadata by sample ID so it can be matched against sample labels
sample_metadata_df = sample_metadata_df.set_index('sample_id')

# Index the expression table by gene; a column called 'gene' is renamed to
# 'gene_id' first so the index has a consistent name
gene_expression_df = (
    gene_expression_df
    .rename(columns={'gene': 'gene_id'})
    .set_index('gene_id')
)

# In the wide table the sample IDs are column headers. Transposing turns them
# into the row index, which is then matched (inner join, the merge default)
# against the metadata index to attach 'condition' and 'batch' to each sample.
merged_df = pd.merge(
    gene_expression_df.T,
    sample_metadata_df[['condition', 'batch']],
    left_index=True,
    right_index=True,
)

print(merged_df.head())

## Task 9: From 'Assays_Long', pivot to wide with one row per (sample_id, day) and columns as assay_type with mean 'value'.

The `Assays_Long` data is in a long format. We need to reshape it into a wide format where each `assay_type` is its own column. Since we need to calculate the mean value for cases with duplicate `(sample_id, day, assay_type)` combinations, `pd.pivot_table()` is the appropriate function to use.

In [None]:
# Read the long-format assay measurements
assays_df = pd.read_excel('data.xlsx', sheet_name='Assays_Long')

# Reshape to wide: one row per (sample_id, day), one column per assay_type.
# Repeated (sample_id, day, assay_type) measurements are averaged.
pivoted_df = pd.pivot_table(
    assays_df,
    values='value',
    index=['sample_id', 'day'],
    columns='assay_type',
    aggfunc='mean',
)

print(pivoted_df.head())

## Task 10: Join 'Geo_Sites' to 'Experiments' using 'site_id'. Compute average response by region.

We will merge the `Geo_Sites` DataFrame with the cleaned `Experiments` DataFrame on the `site_id` column. After the merge, we can use `groupby()` to calculate the average response for each region.

In [None]:
# Read the site lookup table
geo_sites_df = pd.read_excel('data.xlsx', sheet_name='Geo_Sites')

# Left join: keep every experiment row and attach the site columns that share
# its 'site_id'
experiments_with_geo = experiments_df.merge(geo_sites_df, how='left', on='site_id')

# Mean 'response' within each region
average_response_by_region = (
    experiments_with_geo
    .groupby('region')['response']
    .mean()
)

print(average_response_by_region)

## Task 11: In 'Inventory' and 'Inventory_Updates', perform a left join on 'sku'. Compute the new stock = stock + delta (NaN -> 0).

This task combines merging and handling missing values. We'll perform a left merge from `Inventory` to `Inventory_Updates`. The `delta` column in the merged DataFrame will have `NaN` values for items without updates. We will fill these `NaN`s with 0 before calculating the `new_stock`.

In [None]:
# Read the current inventory and the pending stock updates
inventory_df = pd.read_excel('data.xlsx', sheet_name='Inventory')
updates_df = pd.read_excel('data.xlsx', sheet_name='Inventory_Updates')

# Left join on 'sku': every inventory row is kept, matched updates are attached
inventory_merged = inventory_df.merge(updates_df, how='left', on='sku')

# SKUs without an update come back with NaN in 'delta'; treat that as no change
inventory_merged['delta'] = inventory_merged['delta'].fillna(0)

# new_stock = stock + delta
inventory_merged['new_stock'] = inventory_merged['stock'].add(inventory_merged['delta'])

print(inventory_merged.head())

## Task 12: Detect duplicated rows in 'Experiments' (full-row duplicates) and drop them keeping the first occurrence.

Pandas' `drop_duplicates()` method is perfect for this. It can detect and remove rows that are identical across all columns.

In [None]:
# Row count before de-duplication
original_rows = experiments_df.shape[0]

# Remove rows that are identical in every column, keeping the first of each set
df_no_dupes = experiments_df.drop_duplicates(keep='first')

# Row count after de-duplication
rows_after_dupes = df_no_dupes.shape[0]

print(f"Original number of rows: {original_rows}")
print(f"Number of rows after dropping duplicates: {rows_after_dupes}")
print(f"Dropped {original_rows - rows_after_dupes} duplicate rows.")

## Task 13: Parse the 'notes' column in 'Experiments' and extract any temperature pattern like 'T=23.5C' into a new column 'temp_C'.

We'll use the `.str.extract()` method with a regular expression to find and capture the temperature value. The regular expression `r'T=(\d+\.?\d*)C'` is designed to match a temperature pattern, capturing the number part, and it also handles cases where there's no decimal point.

In [None]:
# Capture the numeric part of a 'T=<number>C' pattern from each note
# (expand=False returns the single capture group as a Series), then convert
# the captured text to numbers; notes without a match become NaN
experiments_df['temp_C'] = pd.to_numeric(
    experiments_df['notes'].str.extract(r'T=(\d+\.?\d*)C', expand=False)
)

print(experiments_df[['notes', 'temp_C']].head())