## Observations and Insights 

In [177]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_merge_df = mouse_metadata.merge(study_results, on = 'Mouse ID', how = 'outer')

In [178]:
mouse_metadata

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16
...,...,...,...,...,...
244,z314,Stelasyn,Female,21,28
245,z435,Propriva,Female,12,26
246,z581,Infubinol,Female,24,25
247,z795,Naftisol,Female,13,29


In [179]:
mouse_merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [180]:
study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


In [225]:
# Checking the number of mice in the DataFrame.
#there are no duplicate mice in the dataset because this number matches the original dataset.

number_mice = len(mouse_merge_df['Mouse ID'].unique())
print(number_mice)

249


In [227]:
mouse_merge_df['Mouse ID'][0:5]

0    k403
1    k403
2    k403
3    k403
4    k403
Name: Mouse ID, dtype: object

In [228]:
mouse_merge_df['Timepoint'][0:5]

0     0
1     5
2    10
3    15
4    20
Name: Timepoint, dtype: int64

In [231]:
mouse_merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [234]:
mouse_merge_df['duplicate'] = mouse_merge_df.duplicated(subset=['Mouse ID', 'Timepoint'])

In [236]:
mouse_merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
0,k403,Ramicane,Male,21,16,0,45.000000,0,False
1,k403,Ramicane,Male,21,16,5,38.825898,0,False
2,k403,Ramicane,Male,21,16,10,35.014271,1,False
3,k403,Ramicane,Male,21,16,15,34.223992,1,False
4,k403,Ramicane,Male,21,16,20,32.997729,1,False
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,False
1889,z969,Naftisol,Male,9,30,30,65.841013,3,False
1890,z969,Naftisol,Male,9,30,35,69.176246,4,False
1891,z969,Naftisol,Male,9,30,40,70.314904,4,False


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
909,g989,Propriva,Female,21,26,0,45.0,0,True
911,g989,Propriva,Female,21,26,5,47.570392,0,True
913,g989,Propriva,Female,21,26,10,49.880528,0,True
915,g989,Propriva,Female,21,26,15,53.44202,0,True
917,g989,Propriva,Female,21,26,20,54.65765,1,True


In [238]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_merge_df.loc[mouse_merge_df['duplicate']== True]

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
909,g989,Propriva,Female,21,26,0,45.0,0,True
911,g989,Propriva,Female,21,26,5,47.570392,0,True
913,g989,Propriva,Female,21,26,10,49.880528,0,True
915,g989,Propriva,Female,21,26,15,53.44202,0,True
917,g989,Propriva,Female,21,26,20,54.65765,1,True


In [241]:
duplicate_rows = mouse_merge_df.loc[mouse_merge_df['duplicate']== True]
duplicate_rows.index

Int64Index([909, 911, 913, 915, 917], dtype='int64')

In [242]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_merge_clean_df = mouse_merge_df.drop(duplicate_rows.index)

In [243]:
# Checking the number of mice in the clean DataFrame.
mouse_merge_clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,duplicate
0,k403,Ramicane,Male,21,16,0,45.000000,0,False
1,k403,Ramicane,Male,21,16,5,38.825898,0,False
2,k403,Ramicane,Male,21,16,10,35.014271,1,False
3,k403,Ramicane,Male,21,16,15,34.223992,1,False
4,k403,Ramicane,Male,21,16,20,32.997729,1,False
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,False
1889,z969,Naftisol,Male,9,30,30,65.841013,3,False
1890,z969,Naftisol,Male,9,30,35,69.176246,4,False
1891,z969,Naftisol,Male,9,30,40,70.314904,4,False


In [251]:
#unique mice.  Same as the number found above.
len(mouse_merge_clean_df['Mouse ID'].unique())

249

## Summary Statistics

In [273]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward, creating multiple series and putting them all together at the end.

drug_list = mouse_merge_clean_df['Drug Regimen'].unique()
drug_stats = {}

for drug in drug_list:
    tumor_value_list = mouse_merge_clean_df.loc[mouse_merge_clean_df['Drug Regimen'] \
                                        == drug,:].loc[:,'Tumor Volume (mm3)']
    mean = tumor_value_list.mean()
    median = tumor_value_list.median()
    variance = tumor_value_list.var()
    standard_deviation = tumor_value_list.std()
    sem = tumor_value_list.sem()
    drug_stats[drug] = [mean, median, variance, standard_deviation, sem]

In [284]:
#a dictionary of regimen with results as [mean, median, variance, standard_deviation, sem]
drug_stats

{'Ramicane': [40.2167450667105,
  40.67323554,
  23.486703952095255,
  4.846308280753016,
  0.3209546065084816],
 'Capomulin': [40.67574114100001,
  41.557808879999996,
  24.947764120254856,
  4.9947736805840215,
  0.32934562340083096],
 'Infubinol': [52.88479510859551,
  51.82058438,
  43.12868412883606,
  6.5672432670669405,
  0.4922356938011383],
 'Placebo': [54.03358078635358,
  52.28893409,
  61.16808293669701,
  7.821002681031187,
  0.5813305510593875],
 'Ceftamin': [52.59117180960677,
  51.77615728000001,
  39.2901772732786,
  6.268187718414199,
  0.469820532752611],
 'Stelasyn': [54.23314911988949,
  52.43173664,
  59.45056167336598,
  7.7104190335782645,
  0.5731109332771458],
 'Zoniferol': [53.236506551593415,
  51.818479325,
  48.53335538938606,
  6.966588504381901,
  0.5163978968332167],
 'Ketapril': [55.23563764047869,
  53.698742644999996,
  68.55357711244596,
  8.279708757706757,
  0.6038598237739696],
 'Propriva': [52.39346338487179,
  50.909964985,
  43.138803497801035

In [415]:
#display this way
drug_stats_dict_long = pd.DataFrame.from_dict(drug_stats, orient = 'index', columns= ['mean', 'median', 'variance', 'std', 'SEM'])
drug_stats_dict_long

Unnamed: 0,mean,median,variance,std,SEM
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Propriva,52.393463,50.909965,43.138803,6.568014,0.525862
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466


In [508]:
drug_stats_dict_long.columns

Index(['mean', 'median', 'variance', 'std', 'SEM'], dtype='object')

In [524]:
#or that way
drug_stats_dict_wide = pd.DataFrame(drug_stats, index = ['mean', 'median', 'variance', 'std, dev.', 'SEM'])
drug_stats_dict_wide

Unnamed: 0,Ramicane,Capomulin,Infubinol,Placebo,Ceftamin,Stelasyn,Zoniferol,Ketapril,Propriva,Naftisol,sem
mean,40.216745,40.675741,52.884795,54.033581,52.591172,54.233149,53.236507,55.235638,52.393463,54.331565,1.778232
median,40.673236,41.557809,51.820584,52.288934,51.776157,52.431737,51.818479,53.698743,50.909965,52.509285,
variance,23.486704,24.947764,43.128684,61.168083,39.290177,59.450562,48.533355,68.553577,43.138803,66.173479,
"std, dev.",4.846308,4.994774,6.567243,7.821003,6.268188,7.710419,6.966589,8.279709,6.568014,8.134708,
SEM,0.320955,0.329346,0.492236,0.581331,0.469821,0.573111,0.516398,0.60386,0.525862,0.596466,


In [525]:
#Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method produces everything in a single groupby function.



In [526]:
#this was much, much easier. My word
drug_stats_df = mouse_merge_clean_df[['Drug Regimen','Tumor Volume (mm3)']]

#drug_stats_df.drop(columns = ['min', '25%', '50%', '75%'], inplace = True)
#mouse_merge_clean_df.groupby(by = 'Drug Regimen').sem()
#drug_stats_df.append(sem)

In [527]:
drug_stats_df

Unnamed: 0,Drug Regimen,Tumor Volume (mm3)
0,Ramicane,45.000000
1,Ramicane,38.825898
2,Ramicane,35.014271
3,Ramicane,34.223992
4,Ramicane,32.997729
...,...,...
1888,Naftisol,63.145652
1889,Naftisol,65.841013
1890,Naftisol,69.176246
1891,Naftisol,70.314904


In [528]:
drug_stats_summary = drug_stats_df.groupby(by = 'Drug Regimen').sem()

In [529]:
drug_stats_summary.rename(columns = {'Tumor Volume (mm3)' : 'SEM'}, inplace = True)

In [530]:
drug_stats_describe = drug_stats_df.groupby(by = 'Drug Regimen').describe()


In [531]:
drug_stats_describe.columns = drug_stats_describe.columns.droplevel()

In [532]:
drug_stats_summary = drug_stats_summary.join(drug_stats_describe)

In [533]:
drug_stats_median = drug_stats_summary.join(drug_stats_df.groupby(by = 'Drug Regimen').median()).rename(columns = {'Tumor Volume (mm3)':'median'})

In [534]:
drug_stats_median = drug_stats_median.join(drug_stats_df.groupby(by = 'Drug Regimen').var()).rename(columns = {'Tumor Volume (mm3)':'variance'})

In [535]:
drug_stats_median

Unnamed: 0_level_0,SEM,count,mean,std,min,25%,50%,75%,max,median,variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Capomulin,0.329346,230.0,40.675741,4.994774,23.343598,37.685933,41.557809,45.0,48.158209,41.557809,24.947764
Ceftamin,0.469821,178.0,52.591172,6.268188,45.0,47.208427,51.776157,56.801438,68.923185,51.776157,39.290177
Infubinol,0.492236,178.0,52.884795,6.567243,36.321346,47.312353,51.820584,57.314444,72.226731,51.820584,43.128684
Ketapril,0.60386,188.0,55.235638,8.279709,45.0,48.232987,53.698743,60.870951,78.567014,53.698743,68.553577
Naftisol,0.596466,186.0,54.331565,8.134708,45.0,47.285874,52.509285,59.963034,76.668817,52.509285,66.173479
Placebo,0.581331,181.0,54.033581,7.821003,45.0,47.459053,52.288934,59.916934,73.212939,52.288934,61.168083
Propriva,0.525862,156.0,52.393463,6.568014,45.0,47.046068,50.909965,56.491585,72.455421,50.909965,43.138803
Ramicane,0.320955,228.0,40.216745,4.846308,22.050126,36.674635,40.673236,45.0,47.622816,40.673236,23.486704
Stelasyn,0.573111,181.0,54.233149,7.710419,45.0,48.047139,52.431737,58.719297,75.12369,52.431737,59.450562
Zoniferol,0.516398,182.0,53.236507,6.966589,45.0,47.337876,51.818479,57.954259,73.324432,51.818479,48.533355


In [536]:
drug_stats_summary_new = drug_stats_median.drop(columns = ['max','count','min', '25%', '50%', '75%'])

In [537]:
drug_stats_summary_new

Unnamed: 0_level_0,SEM,mean,std,median,variance
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,0.329346,40.675741,4.994774,41.557809,24.947764
Ceftamin,0.469821,52.591172,6.268188,51.776157,39.290177
Infubinol,0.492236,52.884795,6.567243,51.820584,43.128684
Ketapril,0.60386,55.235638,8.279709,53.698743,68.553577
Naftisol,0.596466,54.331565,8.134708,52.509285,66.173479
Placebo,0.581331,54.033581,7.821003,52.288934,61.168083
Propriva,0.525862,52.393463,6.568014,50.909965,43.138803
Ramicane,0.320955,40.216745,4.846308,40.673236,23.486704
Stelasyn,0.573111,54.233149,7.710419,52.431737,59.450562
Zoniferol,0.516398,53.236507,6.966589,51.818479,48.533355


In [540]:
drug_stats_summary_new.columns = ['mean', 'median', 'variance', 'std', 'SEM']
drug_stats_summary_new

Unnamed: 0_level_0,mean,median,variance,std,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,0.329346,40.675741,4.994774,41.557809,24.947764
Ceftamin,0.469821,52.591172,6.268188,51.776157,39.290177
Infubinol,0.492236,52.884795,6.567243,51.820584,43.128684
Ketapril,0.60386,55.235638,8.279709,53.698743,68.553577
Naftisol,0.596466,54.331565,8.134708,52.509285,66.173479
Placebo,0.581331,54.033581,7.821003,52.288934,61.168083
Propriva,0.525862,52.393463,6.568014,50.909965,43.138803
Ramicane,0.320955,40.216745,4.846308,40.673236,23.486704
Stelasyn,0.573111,54.233149,7.710419,52.431737,59.450562
Zoniferol,0.516398,53.236507,6.966589,51.818479,48.533355


## Bar Plots

In [523]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
