<a href="https://colab.research.google.com/github/wamaw123/Biomedical_Data_analysis/blob/main/Week_1_Data_Importing_and_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set up the notebook so matplotlib figures are rendered inline in cell output.
# NOTE: `%matplotlib inline` is an IPython/Jupyter line magic, not plain Python
# syntax, so this line only works when the code is executed as a notebook cell.
%matplotlib inline

# Fetch the Week 1 CSV from GitHub; the URL points at a fixed commit hash,
# so the same file contents are read on every run.
url = "https://raw.githubusercontent.com/wamaw123/Biomedical_Data_analysis/c072fdafc2b2abe4e002f8611f80bcf5fd8366b8/Datasets/Week_1/week_1.csv"
data = pd.read_csv(filepath_or_buffer=url)

# Peek at the leading rows to get a feel for the columns and their values
print(data.head(n=5))

# Conduct Descriptive Statistics

# Summary statistics (count, mean, std, quartiles, ...) as reported by describe()
summary_stats = data.describe()
print("\nDataset Summary:")
print(summary_stats)

# Number of missing entries in every column
missing_per_column = data.isna().sum()
print("\nMissing Values:")
print(missing_per_column)

# Handle Missing Values

# For simplicity, fill missing values with the mean of the respective column.
# Depending on the nature of the data, other imputation methods (median,
# model-based, ...) might be more appropriate.
#
# `numeric_only=True` restricts the mean to numeric columns. Without it,
# pandas >= 2.0 raises a TypeError as soon as the frame contains a text
# column, because a mean cannot be computed for strings.
column_means = data.mean(numeric_only=True)

# fillna with a Series fills each column using the value whose label matches
# the column name; columns without a mean (non-numeric ones) are left as-is.
data.fillna(column_means, inplace=True)

# Check again for missing values to confirm the numeric columns were filled.
# NOTE: any gaps in non-numeric columns are not imputed and will still be
# counted here.
print("\nMissing Values After Imputation:")
print(data.isnull().sum())

# Handle Outliers

# Use the IQR rule to detect outliers: a value is an outlier when it lies
# more than 1.5 * IQR below the first quartile or above the third quartile
# of its column.

# Quantiles and the < / > comparisons are only defined for numeric columns,
# so the bounds are computed on the numeric part of the frame. This keeps
# the cell from failing if the dataset also contains text columns.
numeric_data = data.select_dtypes(include="number")

# Calculate IQR for each numeric column
Q1 = numeric_data.quantile(0.25)
Q3 = numeric_data.quantile(0.75)
IQR = Q3 - Q1

# Define bounds for the outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Flag every row that has at least one numeric value outside its column's
# bounds, then DROP those rows entirely (values are not replaced with NaN).
# The filter is applied to the full frame, so non-numeric columns are kept.
outlier_rows = ((numeric_data < lower_bound) | (numeric_data > upper_bound)).any(axis=1)
data_outliers_handled = data[~outlier_rows]

# Display the shape of the data after handling outliers
print("\nData Shape After Handling Outliers:")
print(data_outliers_handled.shape)

# Data Visualization

# Both figures share the same canvas size
figure_size = (12, 10)

# Histograms of the outlier-filtered data to show how each column is distributed
data_outliers_handled.hist(figsize=figure_size)
plt.suptitle("Histograms of Data Columns")
plt.show()

# Box plots of the same data to inspect spread and any remaining extreme values
plt.figure(figsize=figure_size)
boxplot_axes = sns.boxplot(data=data_outliers_handled)
boxplot_axes.set_title("Boxplots of Data Columns")
plt.show()

# Normalize and Standardize Data

# Two common rescalings are shown: Min-Max normalization and Z-score
# standardization. Both are arithmetic operations, so they are applied to the
# numeric columns only; subtracting/dividing text columns would raise an error.
numeric_clean = data_outliers_handled.select_dtypes(include="number")

# Min-Max normalization: (x - min) / (max - min), computed per column.
# The column minimum is computed once and reused for the range.
# NOTE: a column whose values are all equal has a zero range, so its
# normalized values come out as NaN.
column_min = numeric_clean.min()
column_range = numeric_clean.max() - column_min
normalized_data = (numeric_clean - column_min) / column_range

# Z-score standardization: (x - mean) / std, computed per column.
# NOTE: a constant column has zero standard deviation and likewise yields NaN.
standardized_data = (numeric_clean - numeric_clean.mean()) / numeric_clean.std()

# Display the first few rows of normalized and standardized data
print("\nNormalized Data:")
print(normalized_data.head())
print("\nStandardized Data:")
print(standardized_data.head())
