## Observations and Insights 

In [48]:
%matplotlib inline
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

mouse_df = mouse_metadata.rename(columns={'Mouse ID':'mouse_id','Drug Regimen':'drug_regimen','Sex':'sex','Age_months':'age_months','Weight (g)':'weight'})
study_df = study_results.rename(columns={'Mouse ID':'mouse_id','Timepoint':'timepoint','Tumor Volume (mm3)':'tumor_volume','Metastatic Sites':'metastatic_sites'})
study_df.head()
#study_results['Mouse ID'].nunique()
#study_results.nunique()

Unnamed: 0,mouse_id,timepoint,tumor_volume,metastatic_sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [49]:
# Combine the data into a single dataset
df = pd.merge(mouse_df, study_df, how="left", on=["mouse_id", "mouse_id"])
df.head()
# Display the data table for preview

Unnamed: 0,mouse_id,drug_regimen,sex,age_months,weight,timepoint,tumor_volume,metastatic_sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [68]:
# Checking the number of mice.
df.count()

mouse_id            1893
drug_regimen        1893
sex                 1893
age_months          1893
weight              1893
timepoint           1893
tumor_volume        1893
metastatic_sites    1893
dtype: int64

In [75]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
#ids = df["mouse_id","timepoint"]
#df[ids.isin(ids[ids.duplicated()])]

results = df.groupby(["mouse_id","timepoint"]).size()
results = results[results > 1]
results

mouse_id  timepoint
g989      0            2
          5            2
          10           2
          15           2
          20           2
dtype: int64

In [54]:
df.describe()

Unnamed: 0,age_months,weight,timepoint,tumor_volume,metastatic_sites
count,1893.0,1893.0,1893.0,1893.0,1893.0
mean,12.81458,25.662441,19.572108,50.448381,1.021659
std,7.189592,3.921622,14.07946,8.894722,1.137974
min,1.0,15.0,0.0,22.050126,0.0
25%,7.0,25.0,5.0,45.0,0.0
50%,13.0,27.0,20.0,48.951474,1.0
75%,20.0,29.0,30.0,56.2922,2.0
max,24.0,30.0,45.0,78.567014,4.0


In [55]:
# Optional: Get all the data for the duplicate mouse ID. 
#df = df[study_df.duplicated(subset=['mouse_id','timepoint'], keep=False)]
#print (df)
df.value_counts(subset=['mouse_id','timepoint'],normalize=True)

mouse_id  timepoint
g989      20           0.001057
          15           0.001057
          10           0.001057
          5            0.001057
          0            0.001057
                         ...   
i334      20           0.000528
          15           0.000528
          10           0.000528
          5            0.000528
z969      45           0.000528
Length: 1888, dtype: float64

In [56]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = df.drop_duplicates(subset=['mouse_id','timepoint'])
clean_df.describe()

Unnamed: 0,age_months,weight,timepoint,tumor_volume,metastatic_sites
count,1888.0,1888.0,1888.0,1888.0,1888.0
mean,12.792903,25.661547,19.597458,50.449276,1.023835
std,7.186737,3.926776,14.084762,8.904565,1.138507
min,1.0,15.0,0.0,22.050126,0.0
25%,7.0,25.0,5.0,45.0,0.0
50%,13.0,27.0,20.0,48.951421,1.0
75%,20.0,29.0,30.0,56.324075,2.0
max,24.0,30.0,45.0,78.567014,4.0


In [57]:
clean_df.head()

Unnamed: 0,mouse_id,drug_regimen,sex,age_months,weight,timepoint,tumor_volume,metastatic_sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [58]:
# Checking the number of mice in the clean DataFrame.
df['mouse_id'].nunique()

249

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.



In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.



In [10]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
