# Pymaceuticals Inc.
Analysis
Add your analysis here.

In [10]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame
mouse_study_merged = pd.merge(study_results, mouse_metadata, on=["Mouse ID"])

# Display the data table for preview
mouse_study_merged.head()


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [11]:
# Checking the number of mice.
len(mouse_study_merged["Mouse ID"].unique())

249

In [12]:
# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

#mouse_study_merged[mouse_study_merged.duplicated(subset = ["Mouse ID", "Timepoint"], keep=False)]
mouse_duplicated = mouse_study_merged[mouse_study_merged.duplicated(subset = ["Mouse ID", "Timepoint"])]["Mouse ID"].unique()
mouse_duplicated

array(['g989'], dtype=object)

In [13]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_dupe_data = mouse_study_merged[mouse_study_merged["Mouse ID"] == "g989"]
mouse_dupe_data.head(15)

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
860,g989,0,45.0,0,Propriva,Female,21,26
861,g989,0,45.0,0,Propriva,Female,21,26
862,g989,5,48.786801,0,Propriva,Female,21,26
863,g989,5,47.570392,0,Propriva,Female,21,26
864,g989,10,51.745156,0,Propriva,Female,21,26
865,g989,10,49.880528,0,Propriva,Female,21,26
866,g989,15,51.325852,1,Propriva,Female,21,26
867,g989,15,53.44202,0,Propriva,Female,21,26
868,g989,20,55.326122,1,Propriva,Female,21,26
869,g989,20,54.65765,1,Propriva,Female,21,26


In [16]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# removing duplicate mouse by its ID
mouse_study = mouse_study_merged.loc[mouse_study_merged["Mouse ID"] != "g989", :]

# print latest dataframe
mouse_study.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,b128,5,45.651331,0,Capomulin,Female,9,22
2,b128,10,43.270852,0,Capomulin,Female,9,22
3,b128,15,43.784893,0,Capomulin,Female,9,22
4,b128,20,42.731552,0,Capomulin,Female,9,22


In [17]:
# Checking the number of mice in the clean DataFrame.
len(mouse_study["Mouse ID"].unique())

248

## Summary Statistics

In [8]:
mouse_study.columns


Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [18]:
mouse_study.describe()

Unnamed: 0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Age_months,Weight (g)
count,1880.0,1880.0,1880.0,1880.0,1880.0
mean,19.606383,50.435293,1.025,12.757979,25.660106
std,14.094238,8.914197,1.139971,7.181998,3.935064
min,0.0,22.050126,0.0,1.0,15.0
25%,5.0,45.0,0.0,7.0,25.0
50%,20.0,48.933454,1.0,13.0,27.0
75%,30.0,56.324075,2.0,19.25,29.0
max,45.0,78.567014,4.0,24.0,30.0


In [42]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary DataFrame.

tumor_vol_per = mouse_study.groupby(["Drug Regimen"])
# print(tumor_vol_per)
# tumor_vol_per.count().head(10)

#drug_stats_index = mouse_study.set_index(["Drug Regimen"])
#drug_stats = mouse_study.groupby(["Drug Regimen"])
#drug_stats.set_index("Drug Regimen", append=True).loc[:, drug_stats["Drug Regimen"].unique(), :].reset_index(level=1)
avg_tumor_vol = tumor_vol_per["Tumor Volume (mm3)"].mean()
med_tumor_vol = tumor_vol_per["Tumor Volume (mm3)"].median()
var_tumor_vol = tumor_vol_per["Tumor Volume (mm3)"].var()
std_tumor_vol = tumor_vol_per["Tumor Volume (mm3)"].std()
sem_tumor_vol = tumor_vol_per["Tumor Volume (mm3)"].sem()

stats = pd.DataFrame({"Mean Tumor Volume": [avg_tumor_vol], "Median Tumor Volume": [med_tumor_vol],
                      "Tumor Volume Variance" : [var_tumor_vol], "Tumor Volume Std.Dev.": [std_tumor_vol],
                      "Tumor Volume Std.Err.": [sem_tumor_vol]})
stats_summary = stats.groupby(["Drug Regimen"])
# stats_summary = stats.groupby(["Drug Regimen"]).size()
# #stats_summary.set_index("Drug Regimen", append=True).loc[:, stats_summary["Drug Regimen"].unique(), :].reset_index(level=1)
print(stats_summary)
stats_summary.sum()

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x00000219DEE5BBE0>


Unnamed: 0,Mean Tumor Volume,Median Tumor Volume,Tumor Volume Variance,Tumor Volume Std.Dev.,Tumor Volume Std.Err.
Drug Regimen,Drug Regimen Capomulin 40.675741 Ceftamin ...,Drug Regimen Capomulin 41.557809 Ceftamin ...,Drug Regimen Capomulin 24.947764 Ceftamin ...,Drug Regimen Capomulin 4.994774 Ceftamin ...,Drug Regimen Capomulin 0.329346 Ceftamin ...


In [37]:
print(stats)


                                   Mean Tumor Volume  \
0  Drug Regimen
Capomulin    40.675741
Ceftamin  ...   

                                 Median Tumor Volume  \
0  Drug Regimen
Capomulin    41.557809
Ceftamin  ...   

                               Tumor Volume Variance  \
0  Drug Regimen
Capomulin    24.947764
Ceftamin  ...   

                               Tumor Volume Std.Dev.  \
0  Drug Regimen
Capomulin    4.994774
Ceftamin   ...   

                               Tumor Volume Std.Err.  
0  Drug Regimen
Capomulin    0.329346
Ceftamin   ...  


## Bar and Pie Charts

In [None]:
 #Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
    
#mouse_study.columns
#mouse_study.dtypes

x_axis = np.arange(len([Timepoint]))


#plt.bar(x_axis,[Timepoint])