## Observations and Insights 

In [None]:
# IPython line magic: use matplotlib's interactive "notebook" backend for the figures below.
%matplotlib notebook

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative to the notebook).
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the per-timepoint study results.
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Outer-join the two tables on "Mouse ID" so rows present in either file are kept,
# then preview the first rows of the combined dataset.
combined_mouse_df = mouse_metadata.merge(
    study_results,
    how="outer",
    on="Mouse ID",
)
combined_mouse_df.head()

In [None]:
# Checking the number of mice.
# DataFrame.count() only reports non-null rows per column; the number of mice
# is the number of distinct "Mouse ID" values.
combined_mouse_df["Mouse ID"].nunique()

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# duplicated() yields one boolean per row; use it as a mask and reduce to the
# distinct mouse IDs that have a repeated (Mouse ID, Timepoint) pair.
repeated_timepoint_rows = combined_mouse_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mouse_ids = combined_mouse_df.loc[repeated_timepoint_rows, "Mouse ID"].unique()
duplicate_mouse_ids

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# A mouse is a duplicate when one of its (Mouse ID, Timepoint) pairs occurs more
# than once; duplicated(["Mouse ID"]) alone would flag nearly every row because
# each mouse legitimately has several timepoints.
has_repeated_timepoint = combined_mouse_df.duplicated(
    subset=["Mouse ID", "Timepoint"], keep=False
)
ids_with_repeats = combined_mouse_df.loc[has_repeated_timepoint, "Mouse ID"].unique()
combined_mouse_df[combined_mouse_df["Mouse ID"].isin(ids_with_repeats)]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Only mice with a repeated (Mouse ID, Timepoint) pair are removed, and all of
# their rows go with them. Every other mouse keeps its full set of timepoints,
# so cells below this one see one row per mouse per timepoint.
has_repeated_timepoint = combined_mouse_df.duplicated(
    subset=["Mouse ID", "Timepoint"], keep=False
)
ids_to_drop = combined_mouse_df.loc[has_repeated_timepoint, "Mouse ID"].unique()
combined_mouse_df = combined_mouse_df[~combined_mouse_df["Mouse ID"].isin(ids_to_drop)]
combined_mouse_df

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct mouse IDs; DataFrame.count() would report rows per column instead.
combined_mouse_df["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

In [None]:
# Summary statistics table of tumor volume per drug regimen:
# mean, median, standard deviation, variance, and SEM.
regimen_groups = combined_mouse_df.groupby("Drug Regimen")

# A single aggregation pass over the tumor-volume column computes every statistic.
tumorstats = regimen_groups["Tumor Volume (mm3)"].agg(["mean", "median", "std", "var", "sem"])
tumorstats = tumorstats.rename(
    columns={
        "mean": "Mean",
        "median": "Median",
        "std": "Standard Deviation",
        "var": "Variance",
        "sem": "SEM",
    }
)

# Leading "Mouse ID" column holds the number of non-null "Mouse ID" rows in each
# regimen (a row count, kept under its original column label).
tumorstats.insert(0, "Mouse ID", regimen_groups["Mouse ID"].count())

# "Drug Regimen" stays as the index. Show the whole table, since head() would
# hide every regimen after the fifth.
tumorstats

In [None]:
# Summary statistics table of mean, median, variance, standard deviation, and SEM
#merged_grouped_df = combined_mouse_df.groupby(["Drug Regimen"])
#tumor_mean = combined_mouse_df.groupby('Drug Regimen').mean()['Tumor Volume (mm3)']
#tumor_mean = combined_mouse_df.groupby('Drug Regimen').median()['Tumor Volume (mm3)']
#tumor_mean = combined_mouse_df.groupby('Drug Regimen').std()['Tumor Volume (mm3)']
#tumor_mean = combined_mouse_df.groupby('Drug Regimen').var()['Tumor Volume (mm3)']
#tumor_mean = combined_mouse_df.groupby('Drug Regimen').sem()['Tumor Volume (mm3)']


#Convert to DataFrame
#tumor_df = pd.DataFrame(tumor_mean)


# DataFrame
#tumor_summary_df=tumor_df.copy()
#tumor_summary_df
#tumor_summary.reset_index()

In [None]:
#merged_grouped_df = combined_mouse_df.groupby(["Drug Regimen"])

#tumor_mean  = merged_grouped_df["Tumor Volume (mm3)"].mean()

#Convert to DataFrame
#tumor_df = pd.DataFrame(tumor_mean)


# DataFrame
#tumor_summary_df=tumor_df.copy()
#tumor_summary_df
#tumorstats.reset_index()


In [None]:
#len(tumorstats)

## Bar and Pie Charts

In [None]:
# Bar chart, drawn through the pandas plotting API, of how many rows each
# drug regimen contributes to the dataset.
mice_counts = combined_mouse_df["Drug Regimen"].value_counts()

regimen_axes = mice_counts.plot.bar()
regimen_axes.set_xlabel("Drug Regimen")
regimen_axes.set_ylabel("No.of data points")
# Tilt the regimen names so neighbouring labels do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Generate a bar plot showing the number of data points for each treatment
# throughout the course of the study using pyplot.
# Uses matplotlib's pyplot interface directly (plt.bar) rather than the pandas
# .plot wrapper, on a fresh figure so it does not draw over an earlier chart.
plt.figure(figsize=(8, 5))
plt.bar(mice_counts.index, mice_counts.values)
plt.xlabel("Drug Regimen")
plt.xticks(rotation=45)
plt.ylabel("No.of data points")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Count each mouse once, however many rows it has in the dataset.
sex_count = combined_mouse_df.drop_duplicates(subset="Mouse ID")["Sex"].value_counts()

# Start a fresh figure so the pie is not drawn onto a previous chart's axes.
plt.figure()
sex_count.plot(kind="pie", autopct="%1.1f%%", startangle=90)
plt.ylabel("Sex")
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
