## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as sts
import numpy as np

# Locations of the two CSV inputs for the study
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Load each CSV file into its own DataFrame
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Join the two tables on their shared "Mouse ID" column (default inner join)
mouse_results_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined table
mouse_results_df.head()


In [None]:
#test ={"Drugs": ["Ramicane", "Naftol"],
        #"Tumor Volume": [mouse_results_df.iloc[:,6]]}
#test_df=pd.DataFrame(test, columns=["Drugs", "Tumor Volume"])

In [None]:
# Check for duplicate records: a mouse should have at most one row per timepoint.
# value_counts() on "Mouse ID" alone only shows how many rows each mouse has, so
# flag repeated ("Mouse ID", "Timepoint") pairs directly and list the mice involved.
duplicate_rows = mouse_results_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
mouse_results_df.loc[duplicate_rows, "Mouse ID"].unique()

In [None]:
# Checking the number of mice.
# count() would return the number of rows (one per measurement);
# nunique() returns the number of distinct mouse IDs.
mouse_results_df["Mouse ID"].nunique()

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 


In [None]:
# Build the cleaned DataFrame: wherever a ("Mouse ID", "Timepoint") pair occurs
# more than once, keep only its first row and discard the later repeats.
is_repeat = mouse_results_df.duplicated(subset=["Mouse ID", "Timepoint"], keep="first")
clean_data = mouse_results_df[~is_repeat]
clean_data.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# count() would return the number of rows (one per measurement);
# nunique() returns the number of distinct mouse IDs.
clean_data["Mouse ID"].nunique()

## Summary Statistics

In [None]:
# Restrict the cleaned data to the columns used by the per-regimen summaries,
# group the rows by "Drug Regimen", and display describe() for each group.
summary_columns = ["Mouse ID", "Drug Regimen", "Sex", "Timepoint", "Tumor Volume (mm3)"]
summary_data = clean_data.loc[:, summary_columns]
# NOTE: despite its name, summary_data_df is a GroupBy object, not a DataFrame.
summary_data_df = summary_data.groupby(by="Drug Regimen")
summary_data_df.describe()

In [None]:
# Per-regimen descriptive statistics of tumor volume, taken from the grouped data:
# mean, median, standard deviation, variance, and standard error of the mean (SEM).
tumor_by_regimen = summary_data_df["Tumor Volume (mm3)"]

mean_stat = tumor_by_regimen.mean()
median_stat = tumor_by_regimen.median()
std_dev_stat = tumor_by_regimen.std()
variance_stat = tumor_by_regimen.var()
sample_stat = tumor_by_regimen.sem()

# Combine the five series (all indexed by regimen) into one summary table.
statistical_df = pd.DataFrame(
    {
        "Mean": mean_stat,
        "Median": median_stat,
        "Std Dev": std_dev_stat,
        "Variance": variance_stat,
        "SEM": sample_stat,
    }
)

# Show up to the first ten regimens
statistical_df.head(n=10)


In [None]:
# Print the summary table one regimen at a time: the regimen label on its own
# line, followed by that regimen's row of statistics.
for regimen, stats_row in statistical_df.iterrows():
    print(regimen, stats_row, sep="\n")

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
# The bar height is the number of tumor-volume rows recorded per regimen (count),
# not the sum of the volumes themselves.
measurement_counts = summary_data.groupby("Drug Regimen")["Tumor Volume (mm3)"].count()
bar = measurement_counts.plot.bar(rot=90, title="Measurements per Drug Regimen")
bar.set_xlabel("Drug Regimen")
bar.set_ylabel("Number of Measurements")


In [None]:
# Suggestion from Eric: count the non-null entries in every column for each drug regimen.
summary_data.groupby("Drug Regimen").agg("count")

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
measurements_df = summary_data.groupby("Drug Regimen").count()

# Plot a single count per regimen; plotting the whole DataFrame would draw
# one bar per column for every regimen.
measurement_counts = measurements_df["Tumor Volume (mm3)"]

plt.bar(measurement_counts.index, measurement_counts.values)
plt.xticks(rotation=90)
plt.title("Measurements per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Measurements")

# tight_layout() must run before show(); afterwards it would act on a new, empty figure.
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Each slice is the number of rows in summary_data recorded for that sex.
pie_df = summary_data.groupby("Sex")["Sex"].count()

# pie_df is a Series, so there is no column to select with y=; show each slice's share instead.
pie_df.plot.pie(figsize=(10, 6), autopct="%1.1f%%", title="Measurements by Sex")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = ["Male", "Female"]

# Derive each slice from the data (rows in summary_data per sex) rather than
# hard-coding the counts, so the chart stays correct if the cleaning step changes.
sizes = [(summary_data["Sex"] == label).sum() for label in labels]

# Pull the second slice slightly away from the centre
explode = (0, 0.1)

fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=360)
# Equal aspect ratio so the pie is drawn as a circle
ax1.axis("equal")
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
drugs_df = summary_data.set_index("Drug Regimen")
target_drugs_df = drugs_df.loc[
    ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"],
    ["Mouse ID", "Timepoint", "Tumor Volume (mm3)"],
]

# Start by getting the last (greatest) timepoint for each mouse.
# Only "Timepoint" is aggregated: taking max() of every column would pair the
# last timepoint with the mouse's largest tumor volume, which need not be the
# volume measured at that timepoint.
mice_df = target_drugs_df
target_mice_df = mice_df.groupby("Mouse ID")["Timepoint"].max().reset_index()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint.
# Joining on both "Mouse ID" and "Timepoint" keeps exactly one row per mouse;
# reset_index() turns "Drug Regimen" back into a column so it survives the merge.
final_data_df = pd.merge(
    target_mice_df,
    target_drugs_df.reset_index(),
    on=["Mouse ID", "Timepoint"],
    how="left",
)

final_data_df


In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treatments = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Row holding each mouse's last (greatest) timepoint, taken straight from summary_data
last_rows = summary_data.loc[summary_data.groupby("Mouse ID")["Timepoint"].idxmax()]

# Create empty list to fill with tumor vol data (for plotting)
tumor_vol = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for treatment in treatments:
    # Locate the rows which contain mice on each drug and get the tumor volumes
    final_volumes = last_rows.loc[last_rows["Drug Regimen"] == treatment, "Tumor Volume (mm3)"]

    # add subset (as a plain array so the box plot does not depend on the pandas index)
    tumor_vol.append(final_volumes.to_numpy())

    # Quartiles and the 1.5 * IQR fences for this treatment
    quartiles = final_volumes.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Determine outliers using upper and lower bounds
    outliers = final_volumes[(final_volumes < lower_bound) | (final_volumes > upper_bound)]

    print(f"{treatment}:")
    print(f"Values below {lower_bound} could be outliers")
    print(f"Values above {upper_bound} could be outliers")
    print(f"Potential outliers: {outliers.tolist()}")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume (mm3)")
ax1.set_ylabel("Volume (mm3)")

# Draw one box per entry of tumor_vol; outlier points are filled red so they stand out.
ax1.boxplot(tumor_vol, flierprops={"marker": "o", "markerfacecolor": "red"})

# Boxes are drawn at positions 1..N by default; label them with the regimen names.
ax1.set_xticks(range(1, len(treatments) + 1))
ax1.set_xticklabels(treatments)
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
# target_drugs_df is indexed by "Drug Regimen", so .loc selects the Capomulin rows.
capomulin_results_df = target_drugs_df.loc["Capomulin", ["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

# Plot a single mouse (the first one listed) rather than every Capomulin mouse at once.
mouse_id = capomulin_results_df["Mouse ID"].iloc[0]
single_mouse_df = capomulin_results_df[capomulin_results_df["Mouse ID"] == mouse_id].sort_values("Timepoint")

# Time goes on the x axis, tumor volume on the y axis; both come from the same rows.
x_axis = single_mouse_df["Timepoint"]
y_axis = single_mouse_df["Tumor Volume (mm3)"]
plt.plot(x_axis, y_axis, marker="o")
plt.title(f"Capomulin treatment of mouse {mouse_id}")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.show()


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
# NOTE(review): assumes the merged data carries a "Weight (g)" column from the
# mouse metadata file — confirm against Mouse_metadata.csv.
capomulin_df = clean_data.loc[clean_data["Drug Regimen"] == "Capomulin"]

# One row per mouse: its weight and its tumor volume averaged over all timepoints
scatter_data_df = capomulin_df.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()

x_values = scatter_data_df["Weight (g)"]
y_values = scatter_data_df["Tumor Volume (mm3)"]
plt.scatter(x_values, y_values)
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
# NOTE(review): assumes the merged data carries a "Weight (g)" column from the
# mouse metadata file — confirm against Mouse_metadata.csv.
capomulin_df = clean_data.loc[clean_data["Drug Regimen"] == "Capomulin"]

# One row per mouse: its weight and its tumor volume averaged over all timepoints
mouse_averages = capomulin_df.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()
x_values = mouse_averages["Weight (g)"]
y_values = mouse_averages["Tumor Volume (mm3)"]

# Pearson correlation coefficient between weight and average tumor volume
correlation = sts.pearsonr(x_values, y_values)[0]
print(f"The correlation coefficient between mouse weight and average tumor volume is {round(correlation, 2)}")

# Least-squares regression line through the per-mouse points
(slope, intercept, rvalue, pvalue, stderr) = sts.linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y=" + str(round(slope, 2)) + "x +" + str(round(intercept, 2))

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Position the equation relative to the axes so it stays visible whatever the data range
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="green")
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()