In [28]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
%matplotlib notebook

# Study data files
mouse_path = "Resources/mouse.csv"
study_path = "Resources/study.csv"

# Read the mouse data and the study results
mouse = pd.read_csv(mouse_path)
study = pd.read_csv(study_path)

# Merge two dataframes using an outer join
merge_df = pd.merge(mouse, study, on="Mouse ID", how="outer")

# Display the data table for preview
merge_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [29]:
# Checking the number of mice.
merge_df["Mouse ID"].count()

1893

In [66]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dupes_mice = merge_df[merge_df.duplicated(['Mouse ID', 'Timepoint'])]

#display
dupes_mice["Mouse ID"].unique()

array(['g989'], dtype=object)

In [76]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID. delete rows with mouse Id = g989
# Get names of indexes 
indexNames = merge_df[merge_df['Mouse ID'] == 'g989' ].index
# Delete these row indexes from dataFrame
merge_df.drop(indexNames , inplace=True)
merge_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [77]:
# Checking the number of mice in the clean DataFrame.
merge_df["Mouse ID"].count()

1880

In [78]:
#checking to see what it looks like
merge_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [79]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

#find the names of drugs to check
drugs = merge_df["Drug Regimen"].unique()
drugs

#assign variable to each formula to call on later in summary table
#ramicane = purchase_data_renamed_df.groupby("Gender")
drug_mean = merge_df.groupby("Drug Regimen").mean()["Tumor Volume (mm3)"]
drug_mean

drug_median = merge_df.groupby("Drug Regimen").median()["Tumor Volume (mm3)"]
drug_median

drug_var = merge_df.groupby("Drug Regimen").var()["Tumor Volume (mm3)"]
drug_var

drug_std = merge_df.groupby("Drug Regimen").std()["Tumor Volume (mm3)"]
drug_std

drug_sem = merge_df.groupby("Drug Regimen").sem()["Tumor Volume (mm3)"]
drug_sem

#summary table of all variables
summary_df = pd.DataFrame({"Tumor Volume (mm3) Mean": drug_mean,
                                      "Tumor Volume (mm3) Median": drug_median,
                                      "Tumor Volume (mm3) Variance": drug_var,
                                      "Tumor Volume (mm3) Standard Deviation": drug_std,
                                      "Tumor Volume (mm3) Standard Error of Mean": drug_sem})
summary_df

Unnamed: 0_level_0,Tumor Volume (mm3) Mean,Tumor Volume (mm3) Median,Tumor Volume (mm3) Variance,Tumor Volume (mm3) Standard Deviation,Tumor Volume (mm3) Standard Error of Mean
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [92]:
#Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

#PANDAS
#create new data frame
drug_regimen = merge_df["Drug Regimen"].value_counts()
pandas_bar_df = pd.DataFrame({"Drug Regimen": drugs,
                                      "Counts": drug_regimen})


# Filter the DataFrame down only to those columns to chart
#drugs_timepoints = merge_df["Drug Regimen"]

# Set the index to be "Drug Regimen" so they will be used as labels
pandas_bar_df = pandas_bar_df.set_index("Drug Regimen")
pandas_bar_df

# Use DataFrame.plot() in order to create a bar chart of the data
pandas_bar_df.plot(kind="bar", figsize=(10,10))

# Set a title for the chart
plt.title("Total Number of Timepoints for all Mice Tested")

plt.show()
plt.tight_layout()

#drug_regimen = merge_df["Drug Regimen"].value_counts()


#plt.bar(drug_regimen.index.values, drug_regimen.values, color="b", align="center")



<IPython.core.display.Javascript object>

In [102]:
#MATPLOTLIB
#create new data frame
drug_regimen_matplotlib = merge_df["Drug Regimen"].value_counts()
matplotlib_bar_df = pd.DataFrame({"Drug Regimen": drugs,
                                      "Counts": drug_regimen_matplotlib})
# Set x axis and tick locations
x_axis = np.arange(len(matplotlib_bar_df))
tick_locations = [value for value in x_axis]

# Create a list indicating where to write x labels and set figure size to adjust for space
plt.figure(figsize=(20,3))
plt.bar(x_axis, matplotlib_bar_df["Counts"], color='r', alpha=0.5, align="center")
plt.xticks(tick_locations, matplotlib_bar_df["Drug Regimen"], rotation="vertical")

# Set x and y limits
plt.xlim(-0.75, len(x_axis))
plt.ylim(0, max(matplotlib_bar_df["Counts"])+10)

 # Set a Title and labels
plt.title("Total Number of Timepoints for all Mice Tested")
plt.xlabel("Drug Regimen")
plt.ylabel("Count")

 #show the graph
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [112]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
#create a new data frame
gender_pandas = merge_df["Sex"].unique()
gender_count_pandas = merge_df["Sex"].value_counts()
gender_pandas_df = pd.DataFrame({"Gender": gender_pandas,
                                    "Counts": gender_count_pandas})
index = merge_df["Sex"].unique()
#plt.title("Gender of Tested Mice")
plot = gender_pandas_df.plot.pie(y='Counts', figsize=(2,2))

<IPython.core.display.Javascript object>

In [104]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender = merge_df["Sex"].unique()
gender_count = merge_df["Sex"].value_counts()
colors = ["yellow","green","lightblue","orange","red","purple","pink","yellowgreen","lightskyblue","lightcoral"]
explode = (0.1,0)

# Tell matplotlib to create a pie chart based upon the above data
plt.pie(gender_count, explode=explode, labels=gender, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)

# Create axes which are equal so we have a perfect circle
plt.axis("equal")

#Set title
plt.title("Gender of Tested Mice")

# Save an image of our chart and print the final product to the screen
plt.show()

<IPython.core.display.Javascript object>