# In [None]:
import matplotlib.pyplot as plt
import nbformat as nbf
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Start from an empty v4 notebook, then add the assignment title and the
# section heading as its first two markdown cells (in that order).
nb = nbf.v4.new_notebook()
nb.cells.extend(
    nbf.v4.new_markdown_cell(heading)
    for heading in (
        "# Data Science Masters Assignment",
        "## Feature Engineering-1",
    )
)

# Q1: Missing Values in a Dataset
# Appends the question heading followed by the written answer.
# NOTE: the answer names tree-based learners as the algorithms that can cope
# with missing values. KNN is deliberately NOT listed as unaffected: it is
# distance-based, so a missing feature makes the distance undefined and the
# data has to be imputed first.
nb.cells.append(nbf.v4.new_markdown_cell("### Q1: What are missing values in a dataset? Why is it essential to handle missing values? Name some algorithms that are not affected by missing values."))
nb.cells.append(nbf.v4.new_markdown_cell(
    "\n"
    "Missing values are the data points that are not stored in the dataset due to various reasons such as data corruption, data entry errors, or data unavailability. \n"
    "\n"
    "It is essential to handle missing values because they can lead to incorrect analysis and misleading conclusions, and because many algorithms cannot be trained on data that contains them. "
    "Algorithms that can handle missing values natively include tree-based methods such as decision trees, random forests, and gradient-boosted trees (e.g. XGBoost, LightGBM). "
    "Distance-based algorithms such as k-nearest neighbors (KNN) are affected by missing values and require them to be imputed first.\n"
))

# Q2: Techniques to Handle Missing Data
# Appends the question heading and a code cell demonstrating two techniques
# (row deletion and mean imputation).
# The code cell's source is executed by the generated notebook, not by this
# script, so it must import every name it uses itself -- including
# SimpleImputer, which this script's own top-level imports do not provide to
# the notebook.
nb.cells.append(nbf.v4.new_markdown_cell("### Q2: List down techniques used to handle missing data. Give an example of each with python code."))
nb.cells.append(nbf.v4.new_code_cell("""
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Creating a sample dataframe with missing values
data = {'A': [1, 2, np.nan, 4, 5], 'B': [5, np.nan, np.nan, 8, 10], 'C': [10, 11, 12, 13, np.nan]}
df = pd.DataFrame(data)

# Technique 1: Removing rows with missing values
df_dropped = df.dropna()
print("Data after removing rows with missing values:\\n", df_dropped)

# Technique 2: Imputing missing values with mean
imputer = SimpleImputer(strategy='mean')
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
print("Data after imputing missing values with mean:\\n", df_imputed)
"""))

# Q3: what class imbalance is and what happens when it is ignored.
# Adds the question heading and then the written answer as markdown cells.
nb.cells.extend(
    map(
        nbf.v4.new_markdown_cell,
        (
            "### Q3: Explain the imbalanced data. What will happen if imbalanced data is not handled?",
            (
                "\n"
                "Imbalanced data occurs when the classes in a dataset are not represented equally. For example, in a binary classification problem, if one class constitutes 90% of the data and the other class only 10%, the data is imbalanced. \n"
                "\n"
                "If imbalanced data is not handled, the machine learning model may become biased towards the majority class and perform poorly on the minority class.\n"
            ),
        ),
    )
)

# Q4: Up-sampling and Down-sampling
# Appends the question heading and a code cell that demonstrates both
# resampling directions with sklearn.utils.resample.
# The sample data is intentionally imbalanced (7 rows of class 0 vs 3 rows of
# class 1) so that down-sampling visibly shrinks the majority class to 3 rows
# and up-sampling visibly grows the minority class to 7 rows. The cell also
# imports pandas itself so it does not depend on an earlier cell having run.
nb.cells.append(nbf.v4.new_markdown_cell("### Q4: What are Up-sampling and Down-sampling? Explain with an example when up-sampling and down-sampling are required."))
nb.cells.append(nbf.v4.new_code_cell("""
# Importing libraries
import pandas as pd
from sklearn.utils import resample

# Creating a sample imbalanced dataset (7 samples of class 0, 3 samples of class 1)
data = {'feature': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 'class': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1]}
df = pd.DataFrame(data)

# Down-sampling majority class
df_majority = df[df['class'] == 0]
df_minority = df[df['class'] == 1]
df_majority_downsampled = resample(df_majority, replace=False, n_samples=len(df_minority), random_state=42)
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
print("Data after down-sampling:\\n", df_downsampled)

# Up-sampling minority class
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
print("Data after up-sampling:\\n", df_upsampled)
"""))

# Q5: data augmentation and SMOTE.
# The heading cell is added first, followed by the two-paragraph answer.
nb.cells.extend(
    nbf.v4.new_markdown_cell(markdown_source)
    for markdown_source in (
        "### Q5: What is data Augmentation? Explain SMOTE.",
        (
            "\n"
            "Data augmentation is a technique to increase the diversity of training data without collecting new data by applying random transformations like rotation, flipping, etc.\n"
            "\n"
            "SMOTE (Synthetic Minority Over-sampling Technique) is a technique to generate synthetic samples for the minority class to balance the class distribution.\n"
        ),
    )
)

# Q6: outliers and why they need handling.
# One heading cell plus one single-paragraph answer cell.
nb.cells.extend(
    map(
        nbf.v4.new_markdown_cell,
        (
            "### Q6: What are outliers in a dataset? Why is it essential to handle outliers?",
            "\nOutliers are data points that significantly differ from other observations. They can skew the results and affect the performance of machine learning models. Handling outliers is essential to improve the accuracy and reliability of the analysis.\n",
        ),
    )
)

# Q7: ways to deal with missing customer data.
# The answer is a numbered list; it is assembled line by line, with the empty
# first and last items producing the leading and trailing newline.
nb.cells.extend(
    nbf.v4.new_markdown_cell(markdown_source)
    for markdown_source in (
        "### Q7: You are working on a project that requires analyzing customer data. However, you notice that some of the data is missing. What are some techniques you can use to handle the missing data in your analysis?",
        "\n".join(
            (
                "",
                "Techniques to handle missing data:",
                "1. Remove rows/columns with missing values",
                "2. Impute missing values with mean/median/mode",
                "3. Use machine learning algorithms that can handle missing values",
                "4. Predict missing values using other features",
                "",
            )
        ),
    )
)

# Q8: checking whether data is missing at random or follows a pattern.
# Adds the heading cell, then the numbered list of strategies.
nb.cells.extend(
    map(
        nbf.v4.new_markdown_cell,
        (
            "### Q8: You are working with a large dataset and find that a small percentage of the data is missing. What are some strategies you can use to determine if the missing data is missing at random or if there is a pattern to the missing data?",
            (
                "\n"
                "Strategies to determine missing data patterns:\n"
                "1. Visualize missing data with heatmaps or missing data matrices\n"
                "2. Perform statistical tests to analyze the randomness of missing data\n"
                "3. Check correlation between missing data and other features\n"
            ),
        ),
    )
)

# Q9: evaluating a classifier on an imbalanced medical dataset.
# The list items are joined with newlines; the empty first and last items
# supply the leading and trailing newline of the answer text.
nb.cells.extend(
    nbf.v4.new_markdown_cell(markdown_source)
    for markdown_source in (
        "### Q9: Suppose you are working on a medical diagnosis project and find that the majority of patients in the dataset do not have the condition of interest, while a small percentage do. What are some strategies you can use to evaluate the performance of your machine learning model on this imbalanced dataset?",
        "\n".join(
            (
                "",
                "Strategies to evaluate model performance on imbalanced data:",
                "1. Use precision-recall curve instead of ROC curve",
                "2. Calculate F1-score, which considers both precision and recall",
                "3. Use confusion matrix to understand the true positives, false positives, true negatives, and false negatives",
                "",
            )
        ),
    )
)

# Q10: down-sampling methods for a satisfaction dataset dominated by one class.
# Heading cell first, then the numbered list of methods.
nb.cells.extend(
    map(
        nbf.v4.new_markdown_cell,
        (
            "### Q10: When attempting to estimate customer satisfaction for a project, you discover that the dataset is unbalanced, with the bulk of customers reporting being satisfied. What methods can you employ to balance the dataset and down-sample the majority class?",
            (
                "\n"
                "Methods to balance dataset by down-sampling:\n"
                "1. Random under-sampling: Randomly remove samples from the majority class\n"
                "2. Cluster-based under-sampling: Use clustering algorithms to identify and remove redundant samples from the majority class\n"
            ),
        ),
    )
)

# Q11: Methods to Balance Dataset by Up-sampling
nb.cells.append(nbf.v4.new_markdown_cell("### Q11: You discover that the dataset is unbalanced with a low percentage of occurrences while working on a project that requires you to estimate the occurrence of a rare event. What methods can you employ to balance the
