In [436]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st
%matplotlib notebook

In [437]:
# Study data files
mouse_metadata_path = "/Users/tony/Documents/GitHub/The Power of Plots/Resources/Mouse_metadata.csv"
study_results_path = "/Users/tony/Documents/GitHub/The Power of Plots/Resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [438]:
mouse_dt_df = pd.DataFrame(mouse_metadata)
stdy_result_df = pd.DataFrame(study_results)

In [439]:
mouse_dt_df.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)'], dtype='object')

In [440]:
mouse_dt_df['Drug Regimen'].value_counts()

Ramicane     25
Capomulin    25
Infubinol    25
Placebo      25
Ceftamin     25
Zoniferol    25
Ketapril     25
Propriva     25
Naftisol     25
Stelasyn     24
Name: Drug Regimen, dtype: int64

In [441]:
mouse_dt_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [442]:
stdy_result_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [443]:
# Combine the data into a single dataset
mouse_merge_df = mouse_dt_df.merge(stdy_result_df,how='outer',left_on='Mouse ID', right_on = 'Mouse ID')

In [444]:
# Display the data table for preview
mouse_merge_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [445]:
 # Checking the number of mice.
mouse_merge_df['Mouse ID'].count()

1893

In [446]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
timepoint_groupby =mouse_merge_df.groupby('Mouse ID')
timepoint_groupby['Timepoint'].value_counts()

Mouse ID  Timepoint
a203      0            1
          5            1
          10           1
          15           1
          20           1
                      ..
z969      25           1
          30           1
          35           1
          40           1
          45           1
Name: Timepoint, Length: 1888, dtype: int64

In [160]:
# Optional: Get all the data for the duplicate mouse ID. 


In [161]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Checking the number of mice in the clean DataFrame.
mouse_clean_df = mouse_merge_df.drop_duplicates(subset=['Mouse ID'], keep = 'last')
mouse_clean_df['Mouse ID'].count()

249

In [162]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straighforward, creating multiple series and putting them all together at the end.

mouse_clean_regimen = mouse_clean_df.groupby(['Drug Regimen'])
drug_mean_df = mouse_clean_regimen['Tumor Volume (mm3)'].mean()

In [163]:
drug_median_df = mouse_clean_regimen['Tumor Volume (mm3)'].median()

In [164]:
drug_std_df = mouse_clean_regimen['Tumor Volume (mm3)'].std()

In [165]:
drug_var_df = mouse_clean_regimen['Tumor Volume (mm3)'].var()

In [166]:
mouse_clean_regimen['Tumor Volume (mm3)'].sem

<bound method GroupBy.sem of <pandas.core.groupby.generic.SeriesGroupBy object at 0x7fb607ca86d0>>

In [167]:
summary_table = {
    'drug mean':drug_mean_df,
    'drug median':drug_median_df,
    'drug variance':drug_var_df,
    'drug std deviation':drug_std_df
}
summary_table_df = pd.DataFrame(summary_table)
summary_table_df

Unnamed: 0_level_0,drug mean,drug median,drug variance,drug std deviation
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,36.667568,38.125164,32.663378,5.715188
Ceftamin,57.753977,59.851956,69.982735,8.365568
Infubinol,58.178246,60.16518,74.010875,8.602957
Ketapril,62.806191,64.487812,98.92133,9.94592
Naftisol,61.205757,63.283288,106.029927,10.297083
Placebo,60.508414,62.030594,78.759797,8.874672
Propriva,56.736964,55.84141,69.349002,8.327605
Ramicane,36.19139,36.561652,32.166354,5.671539
Stelasyn,61.001707,62.19235,90.331586,9.504293
Zoniferol,59.181258,61.840058,76.862027,8.767099


In [168]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


In [447]:
mouse_clean_df['Sex'].value_counts()

Mouse ID  Drug Regimen  Sex   
a203      Infubinol     Female    1
a251      Infubinol     Female    1
a262      Placebo       Female    1
a275      Ceftamin      Female    1
a366      Stelasyn      Female    1
                                 ..
z435      Propriva      Female    1
z578      Ramicane      Male      1
z581      Infubinol     Female    1
z795      Naftisol      Female    1
z969      Naftisol      Male      1
Name: Sex, Length: 249, dtype: int64

In [448]:
%matplotlib notebook

In [179]:
#Bar and Pie Charts
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 
treatment = ['Capomulin','Ceftamin','Infubinol','Ketapril','Naftisol','Placebo','Propriva','Ramicane','Stelasyn','Zoniferol']
y_axis = len(mouse_clean_df['Mouse ID'])

plt.bar(treatment, y_axis, color = 'b',alpha = 0.5, align = 'center')

<BarContainer object of 10 artists>

In [174]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.


In [175]:
# Generate a pie plot showing the distribution of female versus male mice using pandas


In [192]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender = ["Female mice", "Male mice"]
members = [124,125]
x_axis = np.arange(0, len(gender))
colors = ["yellowgreen", "red"]
explode = (0,0.02)
plt.axis("equal")

(-1.1017955498513006,
 1.1000855023738714,
 -1.1219343119157665,
 1.1027685115166932)

In [193]:
plt.pie(members, explode = explode, labels = gender, colors = colors, autopct="%1.1f%%")

<IPython.core.display.Javascript object>

([<matplotlib.patches.Wedge at 0x7fb5f604c790>,
  <matplotlib.patches.Wedge at 0x7fb5f604cf50>],
 [Text(0.006939175976765795, 1.0999781124353174, 'Female mice'),
  Text(-0.007065342812707128, -1.119977714479596, 'Male mice')],
 [Text(0.0037850050782358875, 0.5999880613283548, '49.8%'),
  Text(-0.00391117191417716, -0.6199876633726333, '50.2%')])

In [429]:
#Quartiles, Outliers and Boxplots
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin


# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [459]:
mouse_merge_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [463]:
mouse_merge_df[['Mouse ID','Drug Regimen','Tumor Volume (mm3)']].head()
#mouse_merge_df.groupby(['Mouse ID','Tumor Volume (mm3)'])


Unnamed: 0,Mouse ID,Drug Regimen,Tumor Volume (mm3)
0,k403,Ramicane,45.0
1,k403,Ramicane,38.825898
2,k403,Ramicane,35.014271
3,k403,Ramicane,34.223992
4,k403,Ramicane,32.997729


In [425]:
# Put treatments into a list for for loop (and later for plot labels)

# Create empty list to fill with tumor vol data (for plotting)

# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    # Locate the rows which contain mice on each drug and get the tumor volumes
       
    # add subset 
        
    # Determine outliers using upper and lower bounds
    

In [383]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

In [19]:
# Line and Scatter Plots
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [20]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Correlation and Regression
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen