# Importing dependencies and creating initial dataframe

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Reading each file into its own dataframe
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer join on 'Mouse ID' so rows found in only one of the files are kept
mouse_df = mouse_metadata.merge(study_results, how='outer', on='Mouse ID')

# Previewing the first rows of the merged dataframe
mouse_df.head()

## Finding and removing repeat values

In [None]:
# Counting how many rows reference each mouse, to see whether any mouse
# appears more often than the others
rows_per_mouse = mouse_df['Mouse ID'].value_counts()
rows_per_mouse

In [None]:
# Displaying every row recorded for mouse g989, which has repeated timepoints
is_g989 = mouse_df['Mouse ID'] == 'g989'
mouse_df[is_g989]

In [None]:
# Building the cleaned dataframe: every row except those of mouse g989
keep_rows = mouse_df['Mouse ID'] != 'g989'
mouse_cleaned = mouse_df[keep_rows]

## Creating summary table by drug treatment

In [None]:
# Grouping the cleaned data by treatment and selecting the tumor volume column
drug_group = mouse_cleaned.groupby('Drug Regimen')
tumor_by_drug = drug_group['Tumor Volume (mm3)']

# Per-treatment summary statistics of tumor volume
tumor_mean = tumor_by_drug.mean()
tumor_median = tumor_by_drug.median()
tumor_variance = tumor_by_drug.var()
tumor_std = tumor_by_drug.std()

# Standard error of the mean, computed separately for each treatment from all
# of that treatment's measurements. A single SEM taken from a random sample of
# the pooled data would put the same value on every row and would change on
# every run, so the per-group value is used instead.
tumor_sem = tumor_by_drug.sem()

# Building summary dataframe; every column is indexed by treatment name
drug_summary_table = pd.DataFrame(
    {
        'Mean': tumor_mean,
        'Median': tumor_median,
        'Variance': tumor_variance,
        'Standard Deviation': tumor_std,
        'Standard Error': tumor_sem,
    }
)

# Removing the index label and displaying the table sorted by variance
drug_summary_table.index.name = None
drug_summary_table.sort_values(by=['Variance'])

## Creating bar charts (by drug treatment)

In [None]:
# Counting the distinct mice (subjects) in each treatment group. Counting the
# 'Mouse ID' rows instead would tally one entry per recorded row, so a mouse
# with several rows would be counted several times.
drug_counts = drug_group['Mouse ID'].nunique()
drug_counts

In [None]:
# Drawing the per-treatment counts with the bar-plot accessor built into Pandas
drug_counts.plot.bar()
plt.title('Total Subjects per Treatment')
plt.xlabel('Treatment')
plt.ylabel('# of Subjects')
plt.show()

In [None]:
# Treatment names (the index of the count table) become the x-axis categories
x_index = drug_counts.index
x_axis = list(x_index)

# The counts themselves become the bar heights
y_axis = list(drug_counts)

In [None]:
# Drawing the same bar chart directly with Matplotlib
plt.bar(x=x_axis, height=y_axis)
plt.title('Total Subjects per Treatment')
plt.xlabel('Treatment')
plt.ylabel('# of Subjects')
# Turning the x tick labels vertical
plt.xticks(rotation=90)
plt.show()

## Creating pie charts (by sex)

In [None]:
# Grouping the cleaned data by sex
sex_group = mouse_cleaned.groupby('Sex')

# Counting the distinct mice (subjects) of each sex. Counting 'Mouse ID' rows
# instead would weight each mouse by how many rows were recorded for it.
sex_counts = sex_group['Mouse ID'].nunique()

In [None]:
# Wedge labels come from the index of the count table, wedge sizes from its values
labels = list(sex_counts.index)
pie_values = list(sex_counts)

In [None]:
# Drawing the pie chart through the Pandas plotting interface,
# with each wedge annotated by its percentage to one decimal place
sex_counts.plot(kind='pie', autopct='%1.1f%%')
plt.ylabel('')
plt.title('Division of Subjects by Sex')
plt.show()

In [None]:
# Drawing the same pie chart directly with Matplotlib,
# with each wedge annotated by its percentage to one decimal place
plt.pie(x=pie_values, labels=labels, autopct="%1.1f%%")
plt.title('Division of Subjects by Sex')
plt.show()

## Calculating tumor volume across most promising treatments

In [None]:
# Creating .groupby object to organize by treatment and mouse
drug_mouse_group = mouse_cleaned.groupby(['Drug Regimen', 'Mouse ID'])

# Taking each mouse's tumor volume at its last recorded timepoint. Summing the
# volume over every timepoint would mix tumor size with the number of
# measurements a mouse has, so the final measurement is used as the per-mouse
# value. Rows are sorted by timepoint first so that .last() (the last non-null
# value in each group) is the latest measurement.
final_volume = (
    mouse_cleaned.sort_values('Timepoint')
    .groupby(['Drug Regimen', 'Mouse ID'])['Tumor Volume (mm3)']
    .last()
)

# One-column dataframe indexed by (treatment, mouse), kept for reference
tumor_mass = final_volume.to_frame()

# Building individual dataframes by treatment; selecting with a list keeps
# both index levels in the result
capomulin_tumor_mass = tumor_mass.loc[['Capomulin']]
ramicane_tumor_mass = tumor_mass.loc[['Ramicane']]
infubinol_tumor_mass = tumor_mass.loc[['Infubinol']]
ceftamin_tumor_mass = tumor_mass.loc[['Ceftamin']]

In [None]:
# Capomulin: locating outliers with the 1.5 x IQR rule
capomulin_values = capomulin_tumor_mass['Tumor Volume (mm3)']
quartiles = capomulin_values.quantile([.25, .5, .75])
lq, uq = quartiles[.25], quartiles[.75]
iqr = uq - lq

# Lower and upper bounds sit 1.5 interquartile ranges beyond the quartiles
cap_lb = lq - 1.5 * iqr
cap_ub = uq + 1.5 * iqr

# Keeping only the rows that fall strictly below the lower bound or strictly above the upper bound
cap_is_outlier = capomulin_values.lt(cap_lb) | capomulin_values.gt(cap_ub)
cap_outlier_df = capomulin_tumor_mass.loc[cap_is_outlier, :]

# Showing the Capomulin outliers
cap_outlier_df

In [None]:
# Ramicane: locating outliers with the 1.5 x IQR rule
ramicane_values = ramicane_tumor_mass['Tumor Volume (mm3)']
quartiles = ramicane_values.quantile([.25, .5, .75])
lq, uq = quartiles[.25], quartiles[.75]
iqr = uq - lq

# Lower and upper bounds sit 1.5 interquartile ranges beyond the quartiles
ram_lb = lq - 1.5 * iqr
ram_ub = uq + 1.5 * iqr

# Keeping only the rows that fall strictly below the lower bound or strictly above the upper bound
ram_is_outlier = ramicane_values.lt(ram_lb) | ramicane_values.gt(ram_ub)
ram_outlier_df = ramicane_tumor_mass.loc[ram_is_outlier, :]

# Showing the Ramicane outliers
ram_outlier_df

In [None]:
# Infubinol: locating outliers with the 1.5 x IQR rule
infubinol_values = infubinol_tumor_mass['Tumor Volume (mm3)']
quartiles = infubinol_values.quantile([.25, .5, .75])
lq, uq = quartiles[.25], quartiles[.75]
iqr = uq - lq

# Lower and upper bounds sit 1.5 interquartile ranges beyond the quartiles
inf_lb = lq - 1.5 * iqr
inf_ub = uq + 1.5 * iqr

# Keeping only the rows that fall strictly below the lower bound or strictly above the upper bound
inf_is_outlier = infubinol_values.lt(inf_lb) | infubinol_values.gt(inf_ub)
inf_outlier_df = infubinol_tumor_mass.loc[inf_is_outlier, :]

# Showing the Infubinol outliers
inf_outlier_df

In [None]:
# Ceftamin: locating outliers with the 1.5 x IQR rule
ceftamin_values = ceftamin_tumor_mass['Tumor Volume (mm3)']
quartiles = ceftamin_values.quantile([.25, .5, .75])
lq, uq = quartiles[.25], quartiles[.75]
iqr = uq - lq

# Lower and upper bounds sit 1.5 interquartile ranges beyond the quartiles
cef_lb = lq - 1.5 * iqr
cef_ub = uq + 1.5 * iqr

# Keeping only the rows that fall strictly below the lower bound or strictly above the upper bound
cef_is_outlier = ceftamin_values.lt(cef_lb) | ceftamin_values.gt(cef_ub)
cef_outlier_df = ceftamin_tumor_mass.loc[cef_is_outlier, :]

# Showing the Ceftamin outliers
cef_outlier_df

## Boxplot

In [None]:
# One series of per-mouse tumor values for each treatment, in the same order
# as the tick labels below
treatment_frames = [capomulin_tumor_mass, ramicane_tumor_mass,
                    infubinol_tumor_mass, ceftamin_tumor_mass]
box_data = [frame['Tumor Volume (mm3)'] for frame in treatment_frames]
tick_labels = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Outlier points are drawn as small white-filled circles
markers = dict(marker='o', markerfacecolor='white', markersize=4)

# Building and formatting box and whisker plot
fig1, ax1 = plt.subplots()
ax1.boxplot(box_data, flierprops=markers)
ax1.set_xticklabels(tick_labels)
ax1.set_title('Comparison of Tumor Volume Distribution Across Most Promising Treatments')
ax1.set_xlabel('Treatment')
# The plotted column is a volume in mm3, so the axis is labelled as volume to
# agree with the title
ax1.set_ylabel('Tumor Volume (mm3)')
plt.show()

## Individual Mouse

In [None]:
# Drawing one random row from the Capomulin rows; the mouse on that row is the one charted
capomulin_rows = mouse_cleaned[mouse_cleaned['Drug Regimen'] == 'Capomulin']
sample_generator = capomulin_rows.sample(1)
sample_generator

In [None]:
# Reading the sampled mouse's ID by column name rather than by position, so
# the lookup does not depend on 'Mouse ID' being the first column
sampled_mouse_id = sample_generator['Mouse ID'].iloc[0]

# Pulling all rows for the sampled mouse out of the master dataframe
sample_mouse = mouse_cleaned.loc[mouse_cleaned['Mouse ID'] == sampled_mouse_id, :]
sample_mouse

In [None]:
# Building line plot of tumor volume against timepoint for the sampled mouse
plt.plot(sample_mouse['Timepoint'], sample_mouse['Tumor Volume (mm3)'])
plt.xlabel('Timepoint')
plt.ylabel('Tumor Volume (mm3)')
# The mouse ID is read by column name rather than by position, so the title
# does not depend on 'Mouse ID' being the first column
plt.title(f"Treatment over time for mouse {sample_mouse['Mouse ID'].iloc[0]}")
plt.show()

## Scatterplot

In [None]:
# Keeping only the rows for mice on the Capomulin regimen
on_capomulin = mouse_cleaned['Drug Regimen'] == 'Capomulin'
capomulin_df = mouse_cleaned[on_capomulin]

In [None]:
# Bucketing the Capomulin rows by mouse weight
weight_to_volume_group = capomulin_df.groupby('Weight (g)')

# Mean tumor volume within each weight bucket, kept as a one-column dataframe
mean_volume_by_weight = weight_to_volume_group['Tumor Volume (mm3)'].mean()
weight_to_volume_df = mean_volume_by_weight.to_frame()

# x values are the distinct weights (the index); y values are the matching mean volumes
weights = weight_to_volume_df.index
volumes = weight_to_volume_df['Tumor Volume (mm3)']

In [None]:
# Building scatterplot of mouse weight against average tumor volume.
# The plotted column is a volume in mm3, so the labels say volume.
plt.scatter(weights, volumes)
plt.title('Weight to Average Tumor Volume')
plt.xlabel('Mouse Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

# Unpacking linear regression returns into tuple
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weights, volumes)

# Using line slope and intercept to create y-axis values for regression line
regress_values = weights * slope + intercept

# Plotting regression line (solid red) over scatterplot
plt.plot(weights, regress_values, "r-")
plt.show()

# rvalue is the correlation coefficient itself; its square is the
# coefficient of determination, so the two are reported separately
print(f"The correlation coefficient is {rvalue}")
print(f"The coefficient of determination (r-squared) is {rvalue**2}")